29#include "llvm/IR/IntrinsicsAMDGPU.h"
32#define DEBUG_TYPE "amdgpu-isel"
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
49#include
"AMDGPUGenGlobalISel.inc"
52#include
"AMDGPUGenGlobalISel.inc"
64 MRI = &
MF.getRegInfo();
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
80 F,
"intrinsic not supported on subtarget",
I.getDebugLoc(),
DS_Error));
89 auto &RegClassOrBank = MRI.getRegClassOrRegBank(
Reg);
90 const TargetRegisterClass *RC =
93 const LLT Ty = MRI.getType(
Reg);
97 return MRI.getVRegDef(
Reg)->getOpcode() != AMDGPU::G_TRUNC &&
102 return RB->
getID() == AMDGPU::VCCRegBankID;
105bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(
MachineInstr &
MI,
106 unsigned NewOpc)
const {
107 MI.setDesc(TII.get(NewOpc));
111 MachineOperand &Dst =
MI.getOperand(0);
112 MachineOperand &Src =
MI.getOperand(1);
118 const TargetRegisterClass *DstRC
119 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
120 const TargetRegisterClass *SrcRC
121 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
122 if (!DstRC || DstRC != SrcRC)
125 if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
126 !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
128 const MCInstrDesc &MCID =
MI.getDesc();
130 MI.getOperand(0).setIsEarlyClobber(
true);
135bool AMDGPUInstructionSelector::selectCOPY(
MachineInstr &
I)
const {
138 I.setDesc(TII.get(TargetOpcode::COPY));
140 const MachineOperand &Src =
I.getOperand(1);
141 MachineOperand &Dst =
I.getOperand(0);
145 if (isVCC(DstReg, *MRI)) {
146 if (SrcReg == AMDGPU::SCC) {
147 const TargetRegisterClass *RC
148 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
151 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
154 if (!isVCC(SrcReg, *MRI)) {
156 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
159 const TargetRegisterClass *SrcRC
160 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
162 std::optional<ValueAndVReg> ConstVal =
166 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
168 .
addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
170 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
177 assert(Subtarget->useRealTrue16Insts());
178 const int64_t NoMods = 0;
179 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
185 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
192 bool IsSGPR = TRI.isSGPRClass(SrcRC);
193 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
200 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
206 if (!MRI->getRegClassOrNull(SrcReg))
207 MRI->setRegClass(SrcReg, SrcRC);
212 const TargetRegisterClass *RC =
213 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
214 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
220 for (
const MachineOperand &MO :
I.operands()) {
221 if (MO.getReg().isPhysical())
224 const TargetRegisterClass *RC =
225 TRI.getConstrainedRegClassForOperand(MO, *MRI);
228 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
233bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(
MachineInstr &
I)
const {
236 Register VCCReg =
I.getOperand(1).getReg();
240 if (STI.hasScalarCompareEq64()) {
242 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
245 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
246 Cmp =
BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
253 Register DstReg =
I.getOperand(0).getReg();
257 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
260bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(
MachineInstr &
I)
const {
264 Register DstReg =
I.getOperand(0).getReg();
265 Register SrcReg =
I.getOperand(1).getReg();
266 std::optional<ValueAndVReg> Arg =
270 const int64_t
Value = Arg->Value.getZExtValue();
272 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
279 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
285 unsigned SelectOpcode =
286 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
296bool AMDGPUInstructionSelector::selectReadAnyLane(
MachineInstr &
I)
const {
297 Register DstReg =
I.getOperand(0).getReg();
298 Register SrcReg =
I.getOperand(1).getReg();
303 auto RFL =
BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
311bool AMDGPUInstructionSelector::selectPHI(
MachineInstr &
I)
const {
312 const Register DefReg =
I.getOperand(0).getReg();
313 const LLT DefTy = MRI->getType(DefReg);
325 MRI->getRegClassOrRegBank(DefReg);
327 const TargetRegisterClass *DefRC =
336 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
345 for (
unsigned i = 1; i !=
I.getNumOperands(); i += 2) {
346 const Register SrcReg =
I.getOperand(i).getReg();
348 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
350 const LLT SrcTy = MRI->getType(SrcReg);
351 const TargetRegisterClass *SrcRC =
352 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
353 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
358 I.setDesc(TII.get(TargetOpcode::PHI));
359 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
365 unsigned SubIdx)
const {
369 Register DstReg = MRI->createVirtualRegister(&SubRC);
372 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.
getSubReg(), SubIdx);
374 BuildMI(*BB,
MI,
MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
400 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
402 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
404 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
410bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(
MachineInstr &
I)
const {
411 Register DstReg =
I.getOperand(0).getReg();
412 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
414 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
415 if (DstRB->
getID() != AMDGPU::SGPRRegBankID &&
416 DstRB->
getID() != AMDGPU::VCCRegBankID)
419 bool Is64 =
Size > 32 || (DstRB->
getID() == AMDGPU::VCCRegBankID &&
432bool AMDGPUInstructionSelector::selectG_ADD_SUB(
MachineInstr &
I)
const {
435 Register DstReg =
I.getOperand(0).getReg();
437 LLT Ty = MRI->getType(DstReg);
442 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
443 const bool IsSALU = DstRB->
getID() == AMDGPU::SGPRRegBankID;
444 const bool Sub =
I.getOpcode() == TargetOpcode::G_SUB;
448 const unsigned Opc =
Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
451 .
add(
I.getOperand(1))
452 .
add(
I.getOperand(2))
459 if (STI.hasAddNoCarryInsts()) {
460 const unsigned Opc =
Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
461 I.setDesc(TII.get(
Opc));
468 const unsigned Opc =
Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
470 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
474 .
add(
I.getOperand(1))
475 .
add(
I.getOperand(2))
482 assert(!
Sub &&
"illegal sub should not reach here");
484 const TargetRegisterClass &RC
485 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
486 const TargetRegisterClass &HalfRC
487 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
489 MachineOperand Lo1(getSubOperand64(
I.getOperand(1), HalfRC, AMDGPU::sub0));
490 MachineOperand Lo2(getSubOperand64(
I.getOperand(2), HalfRC, AMDGPU::sub0));
491 MachineOperand Hi1(getSubOperand64(
I.getOperand(1), HalfRC, AMDGPU::sub1));
492 MachineOperand Hi2(getSubOperand64(
I.getOperand(2), HalfRC, AMDGPU::sub1));
494 Register DstLo = MRI->createVirtualRegister(&HalfRC);
495 Register DstHi = MRI->createVirtualRegister(&HalfRC);
498 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
501 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
506 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
507 Register CarryReg = MRI->createVirtualRegister(CarryRC);
508 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
513 MachineInstr *Addc =
BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
523 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
530 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
537bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
542 Register Dst0Reg =
I.getOperand(0).getReg();
543 Register Dst1Reg =
I.getOperand(1).getReg();
544 const bool IsAdd =
I.getOpcode() == AMDGPU::G_UADDO ||
545 I.getOpcode() == AMDGPU::G_UADDE;
546 const bool HasCarryIn =
I.getOpcode() == AMDGPU::G_UADDE ||
547 I.getOpcode() == AMDGPU::G_USUBE;
549 if (isVCC(Dst1Reg, *MRI)) {
550 unsigned NoCarryOpc =
551 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
552 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
553 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
560 Register Src0Reg =
I.getOperand(2).getReg();
561 Register Src1Reg =
I.getOperand(3).getReg();
564 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
565 .
addReg(
I.getOperand(4).getReg());
568 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
569 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
571 auto CarryInst =
BuildMI(*BB, &
I,
DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
572 .
add(
I.getOperand(2))
573 .
add(
I.getOperand(3));
575 if (MRI->use_nodbg_empty(Dst1Reg)) {
576 CarryInst.setOperandDead(3);
578 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), Dst1Reg)
580 if (!MRI->getRegClassOrNull(Dst1Reg))
581 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
584 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
585 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
586 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
590 !RBI.constrainGenericRegister(
I.getOperand(4).getReg(),
591 AMDGPU::SReg_32RegClass, *MRI))
598bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
602 const bool IsUnsigned =
I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
603 bool UseNoCarry = Subtarget->hasMadNC64_32Insts() &&
604 MRI->use_nodbg_empty(
I.getOperand(1).getReg());
607 if (Subtarget->hasMADIntraFwdBug())
608 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
609 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
611 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
612 : AMDGPU::V_MAD_NC_I64_I32_e64;
614 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
619 I.setDesc(TII.get(
Opc));
621 I.addImplicitDefUseOperands(*
MF);
622 I.getOperand(0).setIsEarlyClobber(
true);
628bool AMDGPUInstructionSelector::selectG_EXTRACT(
MachineInstr &
I)
const {
630 Register DstReg =
I.getOperand(0).getReg();
631 Register SrcReg =
I.getOperand(1).getReg();
632 LLT DstTy = MRI->getType(DstReg);
633 LLT SrcTy = MRI->getType(SrcReg);
638 unsigned Offset =
I.getOperand(2).getImm();
639 if (
Offset % 32 != 0 || DstSize > 128)
647 const TargetRegisterClass *DstRC =
648 TRI.getConstrainedRegClassForOperand(
I.getOperand(0), *MRI);
649 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
652 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
653 const TargetRegisterClass *SrcRC =
654 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
659 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
664 *SrcRC,
I.getOperand(1));
666 BuildMI(*BB, &
I,
DL, TII.get(TargetOpcode::COPY), DstReg)
667 .
addReg(SrcReg, {}, SubReg);
673bool AMDGPUInstructionSelector::selectS16MergeToS32(
MachineInstr &
MI)
const {
678 LLT Src0Ty = MRI->getType(Src0);
679 LLT Src1Ty = MRI->getType(Src1);
681 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
682 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, TRI);
683 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, TRI);
684 const bool IsVector = DstBank->
getID() == AMDGPU::VGPRRegBankID;
690 MachineBasicBlock *BB =
MI.getParent();
695 if (Src0Bank->
getID() == AMDGPU::VGPRRegBankID &&
696 Src1Bank->
getID() == AMDGPU::VGPRRegBankID &&
698 BuildMI(*BB,
MI,
DL, TII.get(TargetOpcode::REG_SEQUENCE), Dst)
704 if (!RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI))
707 MI.eraseFromParent();
712 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
713 auto MIB =
BuildMI(*BB,
MI,
DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
718 MIB =
BuildMI(*BB,
MI,
DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
724 MI.eraseFromParent();
747 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
748 if (Shift0 && Shift1) {
749 Opc = AMDGPU::S_PACK_HH_B32_B16;
750 MI.getOperand(1).setReg(ShiftSrc0);
751 MI.getOperand(2).setReg(ShiftSrc1);
753 Opc = AMDGPU::S_PACK_LH_B32_B16;
754 MI.getOperand(2).setReg(ShiftSrc1);
758 if (ConstSrc1 && ConstSrc1->Value == 0) {
760 auto MIB =
BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
765 MI.eraseFromParent();
769 if (STI.hasSPackHL()) {
770 Opc = AMDGPU::S_PACK_HL_B32_B16;
771 MI.getOperand(1).setReg(ShiftSrc0);
775 MI.setDesc(TII.get(
Opc));
780bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(
MachineInstr &
MI)
const {
781 MachineBasicBlock *BB =
MI.getParent();
783 LLT DstTy = MRI->getType(DstReg);
784 LLT SrcTy = MRI->getType(
MI.getOperand(1).getReg());
790 MI.getNumOperands() == 3) {
791 return selectS16MergeToS32(
MI);
797 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
799 const TargetRegisterClass *DstRC =
800 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
804 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
805 MachineInstrBuilder MIB =
806 BuildMI(*BB, &
MI,
DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
807 for (
int I = 0,
E =
MI.getNumOperands() - 1;
I !=
E; ++
I) {
808 MachineOperand &Src =
MI.getOperand(
I + 1);
812 const TargetRegisterClass *SrcRC
813 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
814 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
818 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
821 MI.eraseFromParent();
825bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(
MachineInstr &
MI)
const {
826 MachineBasicBlock *BB =
MI.getParent();
827 const int NumDst =
MI.getNumOperands() - 1;
829 MachineOperand &Src =
MI.getOperand(NumDst);
833 LLT DstTy = MRI->getType(DstReg0);
834 LLT SrcTy = MRI->getType(SrcReg);
839 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
841 const TargetRegisterClass *SrcRC =
842 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
843 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
849 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
850 for (
int I = 0,
E = NumDst;
I !=
E; ++
I) {
851 MachineOperand &Dst =
MI.getOperand(
I);
853 if (SrcBank->
getID() == AMDGPU::SGPRRegBankID &&
854 SubRegs[
I] == AMDGPU::hi16) {
855 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::S_LSHR_B32), Dst.getReg())
859 BuildMI(*BB, &
MI,
DL, TII.get(TargetOpcode::COPY), Dst.getReg())
860 .
addReg(SrcReg, {}, SubRegs[
I]);
864 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[
I]);
865 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
868 const TargetRegisterClass *DstRC =
869 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
870 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
874 MI.eraseFromParent();
878bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(
MachineInstr &
MI)
const {
879 assert(
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
880 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
884 LLT SrcTy = MRI->getType(Src0);
888 if (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
889 return selectG_MERGE_VALUES(
MI);
896 (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
900 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
901 if (DstBank->
getID() == AMDGPU::AGPRRegBankID)
904 assert(DstBank->
getID() == AMDGPU::SGPRRegBankID ||
905 DstBank->
getID() == AMDGPU::VGPRRegBankID);
906 const bool IsVector = DstBank->
getID() == AMDGPU::VGPRRegBankID;
909 MachineBasicBlock *BB =
MI.getParent();
919 const int64_t K0 = ConstSrc0->Value.getSExtValue();
920 const int64_t K1 = ConstSrc1->Value.getSExtValue();
921 uint32_t Lo16 =
static_cast<uint32_t
>(K0) & 0xffff;
922 uint32_t Hi16 =
static_cast<uint32_t
>(K1) & 0xffff;
923 uint32_t
Imm = Lo16 | (Hi16 << 16);
928 MI.eraseFromParent();
929 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
934 MI.eraseFromParent();
935 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
946 if (Src1Def->
getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
947 MI.setDesc(TII.get(AMDGPU::COPY));
950 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
951 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
952 RBI.constrainGenericRegister(Src0, RC, *MRI);
955 return selectS16MergeToS32(
MI);
958bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(
MachineInstr &
I)
const {
959 const MachineOperand &MO =
I.getOperand(0);
963 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
964 if ((!RC && !MRI->getRegBankOrNull(MO.
getReg())) ||
965 (RC && RBI.constrainGenericRegister(MO.
getReg(), *RC, *MRI))) {
966 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
973bool AMDGPUInstructionSelector::selectG_INSERT(
MachineInstr &
I)
const {
976 Register DstReg =
I.getOperand(0).getReg();
977 Register Src0Reg =
I.getOperand(1).getReg();
978 Register Src1Reg =
I.getOperand(2).getReg();
979 LLT Src1Ty = MRI->getType(Src1Reg);
981 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
984 int64_t
Offset =
I.getOperand(3).getImm();
987 if (
Offset % 32 != 0 || InsSize % 32 != 0)
994 unsigned SubReg = TRI.getSubRegFromChannel(
Offset / 32, InsSize / 32);
995 if (SubReg == AMDGPU::NoSubRegister)
998 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
999 const TargetRegisterClass *DstRC =
1000 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
1004 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
1005 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
1006 const TargetRegisterClass *Src0RC =
1007 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
1008 const TargetRegisterClass *Src1RC =
1009 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
1013 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
1014 if (!Src0RC || !Src1RC)
1017 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
1018 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
1019 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
1023 BuildMI(*BB, &
I,
DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
1028 I.eraseFromParent();
1032bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(
MachineInstr &
MI)
const {
1035 Register OffsetReg =
MI.getOperand(2).getReg();
1036 Register WidthReg =
MI.getOperand(3).getReg();
1038 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
1039 "scalar BFX instructions are expanded in regbankselect");
1040 assert(MRI->getType(
MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
1041 "64-bit vector BFX instructions are expanded in regbankselect");
1044 MachineBasicBlock *
MBB =
MI.getParent();
1046 bool IsSigned =
MI.getOpcode() == TargetOpcode::G_SBFX;
1047 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
1052 MI.eraseFromParent();
1057bool AMDGPUInstructionSelector::selectInterpP1F16(
MachineInstr &
MI)
const {
1058 if (STI.getLDSBankCount() != 16)
1064 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
1065 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
1066 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
1076 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1078 MachineBasicBlock *
MBB =
MI.getParent();
1082 BuildMI(*
MBB, &
MI,
DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1085 .
addImm(
MI.getOperand(3).getImm());
1098 MI.eraseFromParent();
1107bool AMDGPUInstructionSelector::selectWritelane(
MachineInstr &
MI)
const {
1109 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1112 MachineBasicBlock *
MBB =
MI.getParent();
1116 Register LaneSelect =
MI.getOperand(3).getReg();
1119 auto MIB =
BuildMI(*
MBB, &
MI,
DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1121 std::optional<ValueAndVReg> ConstSelect =
1127 MIB.
addImm(ConstSelect->Value.getSExtValue() &
1130 std::optional<ValueAndVReg> ConstVal =
1136 STI.hasInv2PiInlineImm())) {
1137 MIB.
addImm(ConstVal->Value.getSExtValue());
1145 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1147 BuildMI(*
MBB, *MIB,
DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1155 MI.eraseFromParent();
1162bool AMDGPUInstructionSelector::selectDivScale(
MachineInstr &
MI)
const {
1166 LLT Ty = MRI->getType(Dst0);
1169 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1171 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1178 MachineBasicBlock *
MBB =
MI.getParent();
1182 unsigned ChooseDenom =
MI.getOperand(5).getImm();
1184 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1197 MI.eraseFromParent();
1202bool AMDGPUInstructionSelector::selectG_INTRINSIC(
MachineInstr &
I)
const {
1204 switch (IntrinsicID) {
1205 case Intrinsic::amdgcn_if_break: {
1210 BuildMI(*BB, &
I,
I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1211 .
add(
I.getOperand(0))
1212 .
add(
I.getOperand(2))
1213 .
add(
I.getOperand(3));
1215 Register DstReg =
I.getOperand(0).getReg();
1216 Register Src0Reg =
I.getOperand(2).getReg();
1217 Register Src1Reg =
I.getOperand(3).getReg();
1219 I.eraseFromParent();
1222 MRI->setRegClass(
Reg, TRI.getWaveMaskRegClass());
1226 case Intrinsic::amdgcn_interp_p1_f16:
1227 return selectInterpP1F16(
I);
1228 case Intrinsic::amdgcn_wqm:
1229 return constrainCopyLikeIntrin(
I, AMDGPU::WQM);
1230 case Intrinsic::amdgcn_softwqm:
1231 return constrainCopyLikeIntrin(
I, AMDGPU::SOFT_WQM);
1232 case Intrinsic::amdgcn_strict_wwm:
1233 case Intrinsic::amdgcn_wwm:
1234 return constrainCopyLikeIntrin(
I, AMDGPU::STRICT_WWM);
1235 case Intrinsic::amdgcn_strict_wqm:
1236 return constrainCopyLikeIntrin(
I, AMDGPU::STRICT_WQM);
1237 case Intrinsic::amdgcn_writelane:
1238 return selectWritelane(
I);
1239 case Intrinsic::amdgcn_div_scale:
1240 return selectDivScale(
I);
1241 case Intrinsic::amdgcn_icmp:
1242 case Intrinsic::amdgcn_fcmp:
1245 return selectIntrinsicCmp(
I);
1246 case Intrinsic::amdgcn_ballot:
1247 return selectBallot(
I);
1248 case Intrinsic::amdgcn_reloc_constant:
1249 return selectRelocConstant(
I);
1250 case Intrinsic::amdgcn_groupstaticsize:
1251 return selectGroupStaticSize(
I);
1252 case Intrinsic::returnaddress:
1253 return selectReturnAddress(
I);
1254 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1255 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1256 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1257 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1258 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1259 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1260 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1261 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1262 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1263 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1264 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1265 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1266 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1267 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1268 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1269 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1270 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1271 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1272 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1273 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1274 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1275 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1276 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1277 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1278 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1279 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1280 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1281 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1282 return selectSMFMACIntrin(
I);
1283 case Intrinsic::amdgcn_permlane16_swap:
1284 case Intrinsic::amdgcn_permlane32_swap:
1285 return selectPermlaneSwapIntrin(
I, IntrinsicID);
1286 case Intrinsic::amdgcn_wave_shuffle:
1287 return selectWaveShuffleIntrin(
I);
1288 case Intrinsic::amdgcn_fma_legacy:
1289 if (!STI.hasFmaLegacy32Insts()) {
1294 case Intrinsic::amdgcn_sudot4:
1295 case Intrinsic::amdgcn_sudot8:
1296 if (!STI.hasDot8Insts()) {
1311 if (
Size == 16 && !ST.has16BitInsts())
1314 const auto Select = [&](
unsigned S16Opc,
unsigned TrueS16Opc,
1315 unsigned FakeS16Opc,
unsigned S32Opc,
1318 return ST.hasTrue16BitInsts()
1319 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1330 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1331 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1332 AMDGPU::V_CMP_NE_U64_e64);
1334 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1335 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1336 AMDGPU::V_CMP_EQ_U64_e64);
1338 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1339 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1340 AMDGPU::V_CMP_GT_I64_e64);
1342 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1343 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1344 AMDGPU::V_CMP_GE_I64_e64);
1346 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1347 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1348 AMDGPU::V_CMP_LT_I64_e64);
1350 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1351 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1352 AMDGPU::V_CMP_LE_I64_e64);
1354 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1355 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1356 AMDGPU::V_CMP_GT_U64_e64);
1358 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1359 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1360 AMDGPU::V_CMP_GE_U64_e64);
1362 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1363 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1364 AMDGPU::V_CMP_LT_U64_e64);
1366 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1367 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1368 AMDGPU::V_CMP_LE_U64_e64);
1371 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1372 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1373 AMDGPU::V_CMP_EQ_F64_e64);
1375 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1376 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1377 AMDGPU::V_CMP_GT_F64_e64);
1379 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1380 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1381 AMDGPU::V_CMP_GE_F64_e64);
1383 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1384 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1385 AMDGPU::V_CMP_LT_F64_e64);
1387 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1388 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1389 AMDGPU::V_CMP_LE_F64_e64);
1391 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1392 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1393 AMDGPU::V_CMP_NEQ_F64_e64);
1395 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1396 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1397 AMDGPU::V_CMP_O_F64_e64);
1399 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1400 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1401 AMDGPU::V_CMP_U_F64_e64);
1403 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1404 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1405 AMDGPU::V_CMP_NLG_F64_e64);
1407 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1408 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1409 AMDGPU::V_CMP_NLE_F64_e64);
1411 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1412 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1413 AMDGPU::V_CMP_NLT_F64_e64);
1415 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1416 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1417 AMDGPU::V_CMP_NGE_F64_e64);
1419 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1420 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1421 AMDGPU::V_CMP_NGT_F64_e64);
1423 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1424 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1425 AMDGPU::V_CMP_NEQ_F64_e64);
1427 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1428 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1429 AMDGPU::V_CMP_TRU_F64_e64);
1431 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1432 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1433 AMDGPU::V_CMP_F_F64_e64);
1438 unsigned Size)
const {
1440 if (!STI.hasScalarCompareEq64())
1445 return AMDGPU::S_CMP_LG_U64;
1447 return AMDGPU::S_CMP_EQ_U64;
1456 return AMDGPU::S_CMP_LG_U32;
1458 return AMDGPU::S_CMP_EQ_U32;
1460 return AMDGPU::S_CMP_GT_I32;
1462 return AMDGPU::S_CMP_GE_I32;
1464 return AMDGPU::S_CMP_LT_I32;
1466 return AMDGPU::S_CMP_LE_I32;
1468 return AMDGPU::S_CMP_GT_U32;
1470 return AMDGPU::S_CMP_GE_U32;
1472 return AMDGPU::S_CMP_LT_U32;
1474 return AMDGPU::S_CMP_LE_U32;
1476 return AMDGPU::S_CMP_EQ_F32;
1478 return AMDGPU::S_CMP_GT_F32;
1480 return AMDGPU::S_CMP_GE_F32;
1482 return AMDGPU::S_CMP_LT_F32;
1484 return AMDGPU::S_CMP_LE_F32;
1486 return AMDGPU::S_CMP_LG_F32;
1488 return AMDGPU::S_CMP_O_F32;
1490 return AMDGPU::S_CMP_U_F32;
1492 return AMDGPU::S_CMP_NLG_F32;
1494 return AMDGPU::S_CMP_NLE_F32;
1496 return AMDGPU::S_CMP_NLT_F32;
1498 return AMDGPU::S_CMP_NGE_F32;
1500 return AMDGPU::S_CMP_NGT_F32;
1502 return AMDGPU::S_CMP_NEQ_F32;
1509 if (!STI.hasSALUFloatInsts())
1514 return AMDGPU::S_CMP_EQ_F16;
1516 return AMDGPU::S_CMP_GT_F16;
1518 return AMDGPU::S_CMP_GE_F16;
1520 return AMDGPU::S_CMP_LT_F16;
1522 return AMDGPU::S_CMP_LE_F16;
1524 return AMDGPU::S_CMP_LG_F16;
1526 return AMDGPU::S_CMP_O_F16;
1528 return AMDGPU::S_CMP_U_F16;
1530 return AMDGPU::S_CMP_NLG_F16;
1532 return AMDGPU::S_CMP_NLE_F16;
1534 return AMDGPU::S_CMP_NLT_F16;
1536 return AMDGPU::S_CMP_NGE_F16;
1538 return AMDGPU::S_CMP_NGT_F16;
1540 return AMDGPU::S_CMP_NEQ_F16;
1549bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(
MachineInstr &
I)
const {
1554 Register SrcReg =
I.getOperand(2).getReg();
1555 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1559 Register CCReg =
I.getOperand(0).getReg();
1560 if (!isVCC(CCReg, *MRI)) {
1561 int Opcode = getS_CMPOpcode(Pred,
Size);
1564 MachineInstr *ICmp =
BuildMI(*BB, &
I,
DL, TII.get(Opcode))
1565 .
add(
I.getOperand(2))
1566 .
add(
I.getOperand(3));
1567 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), CCReg)
1571 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1572 I.eraseFromParent();
1576 if (
I.getOpcode() == AMDGPU::G_FCMP)
1583 MachineInstrBuilder ICmp;
1586 ICmp =
BuildMI(*BB, &
I,
DL, TII.get(Opcode),
I.getOperand(0).getReg())
1588 .
add(
I.getOperand(2))
1590 .
add(
I.getOperand(3))
1593 ICmp =
BuildMI(*BB, &
I,
DL, TII.get(Opcode),
I.getOperand(0).getReg())
1594 .
add(
I.getOperand(2))
1595 .
add(
I.getOperand(3));
1599 *TRI.getBoolRC(), *MRI);
1601 I.eraseFromParent();
1605bool AMDGPUInstructionSelector::selectIntrinsicCmp(
MachineInstr &
I)
const {
1606 Register Dst =
I.getOperand(0).getReg();
1607 if (isVCC(Dst, *MRI))
1610 LLT DstTy = MRI->getType(Dst);
1616 Register SrcReg =
I.getOperand(2).getReg();
1617 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1625 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1626 I.eraseFromParent();
1627 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1634 MachineInstrBuilder SelectedMI;
1635 MachineOperand &
LHS =
I.getOperand(2);
1636 MachineOperand &
RHS =
I.getOperand(3);
1637 auto [Src0, Src0Mods] = selectVOP3ModsImpl(
LHS.getReg());
1638 auto [Src1, Src1Mods] = selectVOP3ModsImpl(
RHS.getReg());
1640 copyToVGPRIfSrcFolded(Src0, Src0Mods,
LHS, &
I,
true);
1642 copyToVGPRIfSrcFolded(Src1, Src1Mods,
RHS, &
I,
true);
1643 SelectedMI =
BuildMI(*BB, &
I,
DL, TII.get(Opcode), Dst);
1645 SelectedMI.
addImm(Src0Mods);
1646 SelectedMI.
addReg(Src0Reg);
1648 SelectedMI.
addImm(Src1Mods);
1649 SelectedMI.
addReg(Src1Reg);
1655 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1658 I.eraseFromParent();
1669 if (
MI->getParent() !=
MBB)
1673 if (
MI->getOpcode() == AMDGPU::COPY) {
1676 if (DstRB && SrcRB && DstRB->
getID() == AMDGPU::VCCRegBankID &&
1677 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1694bool AMDGPUInstructionSelector::selectBallot(
MachineInstr &
I)
const {
1697 Register DstReg =
I.getOperand(0).getReg();
1698 Register SrcReg =
I.getOperand(2).getReg();
1699 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1700 const unsigned WaveSize = STI.getWavefrontSize();
1704 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1707 std::optional<ValueAndVReg> Arg =
1712 if (BallotSize != WaveSize) {
1713 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1717 const int64_t
Value = Arg->Value.getZExtValue();
1720 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1727 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1733 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1737 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1747 if (BallotSize != WaveSize) {
1748 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1750 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1757 I.eraseFromParent();
1761bool AMDGPUInstructionSelector::selectRelocConstant(
MachineInstr &
I)
const {
1762 Register DstReg =
I.getOperand(0).getReg();
1763 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1764 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1765 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1768 const bool IsVALU = DstBank->
getID() == AMDGPU::VGPRRegBankID;
1770 Module *
M =
MF->getFunction().getParent();
1771 const MDNode *
Metadata =
I.getOperand(2).getMetadata();
1778 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1781 I.eraseFromParent();
1785bool AMDGPUInstructionSelector::selectGroupStaticSize(
MachineInstr &
I)
const {
1788 Register DstReg =
I.getOperand(0).getReg();
1789 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1790 unsigned Mov = DstRB->
getID() == AMDGPU::SGPRRegBankID ?
1791 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1799 const SIMachineFunctionInfo *MFI =
MF->getInfo<SIMachineFunctionInfo>();
1802 Module *
M =
MF->getFunction().getParent();
1803 const GlobalValue *GV =
1808 I.eraseFromParent();
1813bool AMDGPUInstructionSelector::selectReturnAddress(
MachineInstr &
I)
const {
1818 MachineOperand &Dst =
I.getOperand(0);
1820 unsigned Depth =
I.getOperand(2).getImm();
1822 const TargetRegisterClass *RC
1823 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1825 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1830 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1833 I.eraseFromParent();
1837 MachineFrameInfo &MFI =
MF.getFrameInfo();
1842 Register ReturnAddrReg = TRI.getReturnAddressReg(
MF);
1844 AMDGPU::SReg_64RegClass,
DL);
1847 I.eraseFromParent();
1851bool AMDGPUInstructionSelector::selectEndCfIntrinsic(
MachineInstr &
MI)
const {
1854 MachineBasicBlock *BB =
MI.getParent();
1855 BuildMI(*BB, &
MI,
MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1856 .
add(
MI.getOperand(1));
1859 MI.eraseFromParent();
1861 if (!MRI->getRegClassOrNull(
Reg))
1862 MRI->setRegClass(
Reg, TRI.getWaveMaskRegClass());
1866bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1868 MachineBasicBlock *
MBB =
MI.getParent();
1872 unsigned IndexOperand =
MI.getOperand(7).getImm();
1873 bool WaveRelease =
MI.getOperand(8).getImm() != 0;
1874 bool WaveDone =
MI.getOperand(9).getImm() != 0;
1876 if (WaveDone && !WaveRelease) {
1880 Fn,
"ds_ordered_count: wave_done requires wave_release",
DL));
1883 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1884 IndexOperand &= ~0x3f;
1885 unsigned CountDw = 0;
1888 CountDw = (IndexOperand >> 24) & 0xf;
1889 IndexOperand &= ~(0xf << 24);
1891 if (CountDw < 1 || CountDw > 4) {
1894 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
DL));
1902 Fn,
"ds_ordered_count: bad index operand",
DL));
1905 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1908 unsigned Offset0 = OrderedCountIndex << 2;
1909 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1912 Offset1 |= (CountDw - 1) << 6;
1915 Offset1 |= ShaderType << 2;
1917 unsigned Offset = Offset0 | (Offset1 << 8);
1925 MachineInstrBuilder
DS =
1926 BuildMI(*
MBB, &
MI,
DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1931 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1935 MI.eraseFromParent();
1941 case Intrinsic::amdgcn_ds_gws_init:
1942 return AMDGPU::DS_GWS_INIT;
1943 case Intrinsic::amdgcn_ds_gws_barrier:
1944 return AMDGPU::DS_GWS_BARRIER;
1945 case Intrinsic::amdgcn_ds_gws_sema_v:
1946 return AMDGPU::DS_GWS_SEMA_V;
1947 case Intrinsic::amdgcn_ds_gws_sema_br:
1948 return AMDGPU::DS_GWS_SEMA_BR;
1949 case Intrinsic::amdgcn_ds_gws_sema_p:
1950 return AMDGPU::DS_GWS_SEMA_P;
1951 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1952 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1958bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(
MachineInstr &
MI,
1960 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1961 !STI.hasGWSSemaReleaseAll()))
1965 const bool HasVSrc =
MI.getNumOperands() == 3;
1966 assert(HasVSrc ||
MI.getNumOperands() == 2);
1968 Register BaseOffset =
MI.getOperand(HasVSrc ? 2 : 1).getReg();
1969 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1970 if (OffsetRB->
getID() != AMDGPU::SGPRRegBankID)
1976 MachineBasicBlock *
MBB =
MI.getParent();
1979 MachineInstr *Readfirstlane =
nullptr;
1984 if (OffsetDef->
getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1985 Readfirstlane = OffsetDef;
1990 if (OffsetDef->
getOpcode() == AMDGPU::G_CONSTANT) {
2000 std::tie(BaseOffset, ImmOffset) =
2003 if (Readfirstlane) {
2006 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
2012 if (!RBI.constrainGenericRegister(BaseOffset,
2013 AMDGPU::SReg_32RegClass, *MRI))
2017 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2032 const MCInstrDesc &InstrDesc = TII.get(
Opc);
2037 int Data0Idx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::data0);
2038 const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
2039 const TargetRegisterClass *SubRC =
2040 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
2044 if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
2054 Register DataReg = MRI->createVirtualRegister(DataRC);
2055 if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
2058 Register UndefReg = MRI->createVirtualRegister(SubRC);
2077 MI.eraseFromParent();
2081bool AMDGPUInstructionSelector::selectDSAppendConsume(
MachineInstr &
MI,
2082 bool IsAppend)
const {
2083 Register PtrBase =
MI.getOperand(2).getReg();
2084 LLT PtrTy = MRI->getType(PtrBase);
2088 std::tie(PtrBase,
Offset) = selectDS1Addr1OffsetImpl(
MI.getOperand(2));
2091 if (!isDSOffsetLegal(PtrBase,
Offset)) {
2092 PtrBase =
MI.getOperand(2).getReg();
2096 MachineBasicBlock *
MBB =
MI.getParent();
2098 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2102 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2109 MI.eraseFromParent();
2114bool AMDGPUInstructionSelector::selectInitWholeWave(
MachineInstr &
MI)
const {
2115 MachineFunction *
MF =
MI.getMF();
2116 SIMachineFunctionInfo *MFInfo =
MF->getInfo<SIMachineFunctionInfo>();
2127 TFE = TexFailCtrl & 0x1;
2129 LWE = TexFailCtrl & 0x2;
2132 return TexFailCtrl == 0;
2135bool AMDGPUInstructionSelector::selectImageIntrinsic(
2137 MachineBasicBlock *
MBB =
MI.getParent();
2143 Register ResultDef =
MI.getOperand(0).getReg();
2144 if (MRI->use_nodbg_empty(ResultDef))
2148 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2157 const unsigned ArgOffset =
MI.getNumExplicitDefs() + 1;
2159 Register VDataIn = AMDGPU::NoRegister;
2160 Register VDataOut = AMDGPU::NoRegister;
2162 int NumVDataDwords = -1;
2163 bool IsD16 =
MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2164 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2170 Unorm =
MI.getOperand(ArgOffset + Intr->
UnormIndex).getImm() != 0;
2174 bool IsTexFail =
false;
2176 TFE, LWE, IsTexFail))
2179 const int Flags =
MI.getOperand(ArgOffset + Intr->
NumArgs).getImm();
2180 const bool IsA16 = (
Flags & 1) != 0;
2181 const bool IsG16 = (
Flags & 2) != 0;
2184 if (IsA16 && !STI.hasG16() && !IsG16)
2188 unsigned DMaskLanes = 0;
2190 if (BaseOpcode->
Atomic) {
2192 VDataOut =
MI.getOperand(0).getReg();
2193 VDataIn =
MI.getOperand(2).getReg();
2194 LLT Ty = MRI->getType(VDataIn);
2197 const bool Is64Bit = BaseOpcode->
AtomicX2 ?
2202 assert(
MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2204 DMask = Is64Bit ? 0xf : 0x3;
2205 NumVDataDwords = Is64Bit ? 4 : 2;
2207 DMask = Is64Bit ? 0x3 : 0x1;
2208 NumVDataDwords = Is64Bit ? 2 : 1;
2211 DMask =
MI.getOperand(ArgOffset + Intr->
DMaskIndex).getImm();
2214 if (BaseOpcode->
Store) {
2215 VDataIn =
MI.getOperand(1).getReg();
2216 VDataTy = MRI->getType(VDataIn);
2221 VDataOut =
MI.getOperand(0).getReg();
2222 VDataTy = MRI->getType(VDataOut);
2223 NumVDataDwords = DMaskLanes;
2225 if (IsD16 && !STI.hasUnpackedD16VMem())
2226 NumVDataDwords = (DMaskLanes + 1) / 2;
2231 if (Subtarget->hasG16() && IsG16) {
2232 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2235 IntrOpcode = G16MappingInfo->
G16;
2239 assert((!IsTexFail || DMaskLanes >= 1) &&
"should have legalized this");
2249 int NumVAddrRegs = 0;
2250 int NumVAddrDwords = 0;
2253 MachineOperand &AddrOp =
MI.getOperand(ArgOffset +
I);
2254 if (!AddrOp.
isReg())
2262 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2269 NumVAddrRegs != 1 &&
2270 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2271 : NumVAddrDwords == NumVAddrRegs);
2272 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2283 NumVDataDwords, NumVAddrDwords);
2284 }
else if (IsGFX12Plus) {
2286 NumVDataDwords, NumVAddrDwords);
2287 }
else if (IsGFX11Plus) {
2289 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2290 : AMDGPU::MIMGEncGfx11Default,
2291 NumVDataDwords, NumVAddrDwords);
2292 }
else if (IsGFX10Plus) {
2294 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2295 : AMDGPU::MIMGEncGfx10Default,
2296 NumVDataDwords, NumVAddrDwords);
2298 if (Subtarget->hasGFX90AInsts()) {
2300 NumVDataDwords, NumVAddrDwords);
2304 <<
"requested image instruction is not supported on this GPU\n");
2311 NumVDataDwords, NumVAddrDwords);
2314 NumVDataDwords, NumVAddrDwords);
2324 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2326 Register TmpReg = MRI->createVirtualRegister(
2327 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2328 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2331 if (!MRI->use_empty(VDataOut)) {
2344 for (
int I = 0;
I != NumVAddrRegs; ++
I) {
2345 MachineOperand &SrcOp =
MI.getOperand(ArgOffset + Intr->
VAddrStart +
I);
2346 if (SrcOp.
isReg()) {
2365 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2367 MIB.
addImm(IsA16 ? -1 : 0);
2369 if (!Subtarget->hasGFX90AInsts()) {
2381 MIB.
addImm(IsD16 ? -1 : 0);
2383 MI.eraseFromParent();
2385 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2391bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2397 MachineBasicBlock *
MBB =
MI.getParent();
2402 unsigned Offset =
MI.getOperand(6).getImm();
2406 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2407 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2408 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2410 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2411 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2413 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2414 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2426 MI.eraseFromParent();
2431bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2434 switch (IntrinsicID) {
2435 case Intrinsic::amdgcn_end_cf:
2436 return selectEndCfIntrinsic(
I);
2437 case Intrinsic::amdgcn_ds_ordered_add:
2438 case Intrinsic::amdgcn_ds_ordered_swap:
2439 return selectDSOrderedIntrinsic(
I, IntrinsicID);
2440 case Intrinsic::amdgcn_ds_gws_init:
2441 case Intrinsic::amdgcn_ds_gws_barrier:
2442 case Intrinsic::amdgcn_ds_gws_sema_v:
2443 case Intrinsic::amdgcn_ds_gws_sema_br:
2444 case Intrinsic::amdgcn_ds_gws_sema_p:
2445 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2446 return selectDSGWSIntrinsic(
I, IntrinsicID);
2447 case Intrinsic::amdgcn_ds_append:
2448 return selectDSAppendConsume(
I,
true);
2449 case Intrinsic::amdgcn_ds_consume:
2450 return selectDSAppendConsume(
I,
false);
2451 case Intrinsic::amdgcn_init_whole_wave:
2452 return selectInitWholeWave(
I);
2453 case Intrinsic::amdgcn_raw_buffer_load_lds:
2454 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
2455 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2456 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
2457 case Intrinsic::amdgcn_struct_buffer_load_lds:
2458 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
2459 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2460 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
2461 return selectBufferLoadLds(
I);
2466 case Intrinsic::amdgcn_load_to_lds:
2467 case Intrinsic::amdgcn_load_async_to_lds:
2468 case Intrinsic::amdgcn_global_load_lds:
2469 case Intrinsic::amdgcn_global_load_async_lds:
2470 return selectGlobalLoadLds(
I);
2471 case Intrinsic::amdgcn_tensor_load_to_lds:
2472 case Intrinsic::amdgcn_tensor_store_from_lds:
2473 return selectTensorLoadStore(
I, IntrinsicID);
2474 case Intrinsic::amdgcn_asyncmark:
2475 case Intrinsic::amdgcn_wait_asyncmark:
2476 if (!Subtarget->hasAsyncMark())
2479 case Intrinsic::amdgcn_exp_compr:
2480 if (!STI.hasCompressedExport()) {
2485 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2486 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2487 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2488 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2489 return selectDSBvhStackIntrinsic(
I);
2490 case Intrinsic::amdgcn_s_alloc_vgpr: {
2496 Register ResReg =
I.getOperand(0).getReg();
2498 MachineInstr *AllocMI =
BuildMI(*
MBB, &
I,
DL, TII.get(AMDGPU::S_ALLOC_VGPR))
2499 .
add(
I.getOperand(2));
2502 I.eraseFromParent();
2504 return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
2506 case Intrinsic::amdgcn_s_barrier_init:
2507 case Intrinsic::amdgcn_s_barrier_signal_var:
2508 return selectNamedBarrierInit(
I, IntrinsicID);
2509 case Intrinsic::amdgcn_s_wakeup_barrier: {
2510 if (!STI.hasSWakeupBarrier()) {
2514 return selectNamedBarrierInst(
I, IntrinsicID);
2516 case Intrinsic::amdgcn_s_barrier_join:
2517 case Intrinsic::amdgcn_s_get_named_barrier_state:
2518 return selectNamedBarrierInst(
I, IntrinsicID);
2519 case Intrinsic::amdgcn_s_get_barrier_state:
2520 return selectSGetBarrierState(
I, IntrinsicID);
2521 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2522 return selectSBarrierSignalIsfirst(
I, IntrinsicID);
2527bool AMDGPUInstructionSelector::selectG_SELECT(
MachineInstr &
I)
const {
2534 Register DstReg =
I.getOperand(0).getReg();
2535 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2537 const MachineOperand &CCOp =
I.getOperand(1);
2539 if (!isVCC(CCReg, *MRI)) {
2540 unsigned SelectOpcode =
Size == 64 ? AMDGPU::S_CSELECT_B64 :
2541 AMDGPU::S_CSELECT_B32;
2542 MachineInstr *CopySCC =
BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2548 if (!MRI->getRegClassOrNull(CCReg))
2549 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2551 .
add(
I.getOperand(2))
2552 .
add(
I.getOperand(3));
2556 I.eraseFromParent();
2565 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2567 .
add(
I.getOperand(3))
2569 .
add(
I.getOperand(2))
2570 .
add(
I.getOperand(1));
2573 I.eraseFromParent();
2577bool AMDGPUInstructionSelector::selectG_TRUNC(
MachineInstr &
I)
const {
2578 Register DstReg =
I.getOperand(0).getReg();
2579 Register SrcReg =
I.getOperand(1).getReg();
2580 const LLT DstTy = MRI->getType(DstReg);
2581 const LLT SrcTy = MRI->getType(SrcReg);
2584 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2585 const RegisterBank *DstRB;
2591 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2596 const bool IsVALU = DstRB->
getID() == AMDGPU::VGPRRegBankID;
2601 const TargetRegisterClass *SrcRC =
2602 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2603 const TargetRegisterClass *DstRC =
2604 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2605 if (!SrcRC || !DstRC)
2608 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2609 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2614 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2615 assert(STI.useRealTrue16Insts());
2619 .
addReg(SrcReg, {}, AMDGPU::lo16);
2620 I.eraseFromParent();
2628 Register LoReg = MRI->createVirtualRegister(DstRC);
2629 Register HiReg = MRI->createVirtualRegister(DstRC);
2631 .
addReg(SrcReg, {}, AMDGPU::sub0);
2633 .
addReg(SrcReg, {}, AMDGPU::sub1);
2635 if (IsVALU && STI.hasSDWA()) {
2638 MachineInstr *MovSDWA =
2639 BuildMI(*
MBB,
I,
DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2649 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2650 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2651 Register ImmReg = MRI->createVirtualRegister(DstRC);
2653 BuildMI(*
MBB,
I,
DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2663 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2664 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2665 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2677 And.setOperandDead(3);
2678 Or.setOperandDead(3);
2682 I.eraseFromParent();
2690 unsigned SubRegIdx = DstSize < 32
2691 ?
static_cast<unsigned>(AMDGPU::sub0)
2692 : TRI.getSubRegFromChannel(0, DstSize / 32);
2693 if (SubRegIdx == AMDGPU::NoSubRegister)
2698 const TargetRegisterClass *SrcWithSubRC
2699 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2703 if (SrcWithSubRC != SrcRC) {
2704 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2708 I.getOperand(1).setSubReg(SubRegIdx);
2711 I.setDesc(TII.get(TargetOpcode::COPY));
2718 int SignedMask =
static_cast<int>(Mask);
2719 return SignedMask >= -16 && SignedMask <= 64;
2723const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2732 return &RBI.getRegBankFromRegClass(*RC, LLT());
2736bool AMDGPUInstructionSelector::selectG_SZA_EXT(
MachineInstr &
I)
const {
2737 bool InReg =
I.getOpcode() == AMDGPU::G_SEXT_INREG;
2738 bool Signed =
I.getOpcode() == AMDGPU::G_SEXT || InReg;
2741 const Register DstReg =
I.getOperand(0).getReg();
2742 const Register SrcReg =
I.getOperand(1).getReg();
2744 const LLT DstTy = MRI->getType(DstReg);
2745 const LLT SrcTy = MRI->getType(SrcReg);
2746 const unsigned SrcSize =
I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2753 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2756 if (
I.getOpcode() == AMDGPU::G_ANYEXT) {
2758 return selectCOPY(
I);
2760 const TargetRegisterClass *SrcRC =
2761 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2762 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2763 const TargetRegisterClass *DstRC =
2764 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2766 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2767 BuildMI(
MBB,
I,
DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2773 I.eraseFromParent();
2775 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2776 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2779 if (SrcBank->
getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2785 MachineInstr *ExtI =
2789 I.eraseFromParent();
2794 const unsigned BFE =
Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2795 MachineInstr *ExtI =
2800 I.eraseFromParent();
2805 if (SrcBank->
getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2806 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2807 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2808 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2811 if (
Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2812 const unsigned SextOpc = SrcSize == 8 ?
2813 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2816 I.eraseFromParent();
2817 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2822 if (DstSize > 32 && SrcSize == 32) {
2823 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2824 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2827 .
addReg(SrcReg, {}, SubReg)
2835 .
addReg(SrcReg, {}, SubReg)
2836 .addImm(AMDGPU::sub0)
2839 I.eraseFromParent();
2840 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2844 const unsigned BFE64 =
Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2845 const unsigned BFE32 =
Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2848 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2850 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2851 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2852 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2854 BuildMI(
MBB,
I,
DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2856 .
addReg(SrcReg, {}, SubReg)
2857 .addImm(AMDGPU::sub0)
2865 I.eraseFromParent();
2866 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2881 I.eraseFromParent();
2882 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2906 if (Unmerge->getNumDefs() == 2 && Unmerge->getOperand(1).getReg() == In &&
2908 Out = Unmerge->getSourceReg();
2928 if (Shuffle->
getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2935 assert(Mask.size() == 2);
2937 if (Mask[0] == 1 && Mask[1] <= 1) {
2945bool AMDGPUInstructionSelector::selectG_FPEXT(
MachineInstr &
I)
const {
2946 if (!Subtarget->hasSALUFloatInsts())
2949 Register Dst =
I.getOperand(0).getReg();
2950 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2951 if (DstRB->
getID() != AMDGPU::SGPRRegBankID)
2954 Register Src =
I.getOperand(1).getReg();
2960 BuildMI(*BB, &
I,
I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2962 I.eraseFromParent();
2963 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2970bool AMDGPUInstructionSelector::selectG_FNEG(
MachineInstr &
MI)
const {
2983 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2984 if (DstRB->
getID() != AMDGPU::SGPRRegBankID ||
2989 MachineInstr *Fabs =
getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2993 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2994 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2997 MachineBasicBlock *BB =
MI.getParent();
2999 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3000 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3001 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3002 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3004 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::COPY), LoReg)
3005 .
addReg(Src, {}, AMDGPU::sub0);
3006 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::COPY), HiReg)
3007 .
addReg(Src, {}, AMDGPU::sub1);
3008 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
3012 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
3017 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
3022 MI.eraseFromParent();
3027bool AMDGPUInstructionSelector::selectG_FABS(
MachineInstr &
MI)
const {
3029 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
3030 if (DstRB->
getID() != AMDGPU::SGPRRegBankID ||
3035 MachineBasicBlock *BB =
MI.getParent();
3037 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3038 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3039 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3040 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3042 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
3043 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
3046 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::COPY), LoReg)
3047 .
addReg(Src, {}, AMDGPU::sub0);
3048 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::COPY), HiReg)
3049 .
addReg(Src, {}, AMDGPU::sub1);
3050 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
3055 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::S_AND_B32), OpReg)
3059 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
3065 MI.eraseFromParent();
3070 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
3073void AMDGPUInstructionSelector::getAddrModeInfo(
const MachineInstr &Load,
3076 unsigned OpNo =
Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
3077 const MachineInstr *PtrMI =
3078 MRI.getUniqueVRegDef(
Load.getOperand(OpNo).getReg());
3082 if (PtrMI->
getOpcode() != TargetOpcode::G_PTR_ADD)
3087 for (
unsigned i = 1; i != 3; ++i) {
3088 const MachineOperand &GEPOp = PtrMI->
getOperand(i);
3089 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.
getReg());
3094 assert(GEPInfo.Imm == 0);
3098 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.
getReg(), MRI, TRI);
3099 if (OpBank->
getID() == AMDGPU::SGPRRegBankID)
3100 GEPInfo.SgprParts.push_back(GEPOp.
getReg());
3102 GEPInfo.VgprParts.push_back(GEPOp.
getReg());
3106 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
3109bool AMDGPUInstructionSelector::isSGPR(
Register Reg)
const {
3110 return RBI.getRegBank(
Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
3113bool AMDGPUInstructionSelector::isInstrUniform(
const MachineInstr &
MI)
const {
3114 if (!
MI.hasOneMemOperand())
3117 const MachineMemOperand *MMO = *
MI.memoperands_begin();
3130 if (
MI.getOpcode() == AMDGPU::G_PREFETCH)
3131 return RBI.getRegBank(
MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
3132 AMDGPU::SGPRRegBankID;
3135 return I &&
I->getMetadata(
"amdgpu.uniform");
3139 for (
const GEPInfo &GEPInfo : AddrInfo) {
3140 if (!GEPInfo.VgprParts.empty())
3146void AMDGPUInstructionSelector::initM0(
MachineInstr &
I)
const {
3147 const LLT PtrTy = MRI->getType(
I.getOperand(1).getReg());
3150 STI.ldsRequiresM0Init()) {
3154 BuildMI(*BB, &
I,
I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3159bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3166 if (
Reg.isPhysical())
3170 const unsigned Opcode =
MI.getOpcode();
3172 if (Opcode == AMDGPU::COPY)
3175 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3176 Opcode == AMDGPU::G_XOR)
3181 return GI->is(Intrinsic::amdgcn_class);
3183 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3186bool AMDGPUInstructionSelector::selectG_BRCOND(
MachineInstr &
I)
const {
3188 MachineOperand &CondOp =
I.getOperand(0);
3194 const TargetRegisterClass *ConstrainRC;
3201 if (!isVCC(CondReg, *MRI)) {
3205 CondPhysReg = AMDGPU::SCC;
3206 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3207 ConstrainRC = &AMDGPU::SReg_32RegClass;
3214 const bool Is64 = STI.isWave64();
3215 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3216 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3218 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3219 BuildMI(*BB, &
I,
DL, TII.get(Opcode), TmpReg)
3226 CondPhysReg = TRI.getVCC();
3227 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3228 ConstrainRC = TRI.getBoolRC();
3231 if (!MRI->getRegClassOrNull(CondReg))
3232 MRI->setRegClass(CondReg, ConstrainRC);
3234 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), CondPhysReg)
3237 .
addMBB(
I.getOperand(1).getMBB());
3239 I.eraseFromParent();
3243bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3245 Register DstReg =
I.getOperand(0).getReg();
3246 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3247 const bool IsVGPR = DstRB->
getID() == AMDGPU::VGPRRegBankID;
3248 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3252 return RBI.constrainGenericRegister(
3253 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3256bool AMDGPUInstructionSelector::selectG_PTRMASK(
MachineInstr &
I)
const {
3257 Register DstReg =
I.getOperand(0).getReg();
3258 Register SrcReg =
I.getOperand(1).getReg();
3259 Register MaskReg =
I.getOperand(2).getReg();
3260 LLT Ty = MRI->getType(DstReg);
3261 LLT MaskTy = MRI->getType(MaskReg);
3265 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3266 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3267 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3268 const bool IsVGPR = DstRB->
getID() == AMDGPU::VGPRRegBankID;
3274 APInt MaskOnes =
VT->getKnownOnes(MaskReg).zext(64);
3278 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3279 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3282 !CanCopyLow32 && !CanCopyHi32) {
3283 auto MIB =
BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3287 I.eraseFromParent();
3292 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3293 const TargetRegisterClass &RegRC
3294 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3296 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3297 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3298 const TargetRegisterClass *MaskRC =
3299 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3301 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3302 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3303 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3308 "ptrmask should have been narrowed during legalize");
3310 auto NewOp =
BuildMI(*BB, &
I,
DL, TII.get(NewOpc), DstReg)
3316 I.eraseFromParent();
3320 Register HiReg = MRI->createVirtualRegister(&RegRC);
3321 Register LoReg = MRI->createVirtualRegister(&RegRC);
3324 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), LoReg)
3325 .
addReg(SrcReg, {}, AMDGPU::sub0);
3326 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), HiReg)
3327 .
addReg(SrcReg, {}, AMDGPU::sub1);
3336 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3337 MaskedLo = MRI->createVirtualRegister(&RegRC);
3339 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), MaskLo)
3340 .
addReg(MaskReg, {}, AMDGPU::sub0);
3341 BuildMI(*BB, &
I,
DL, TII.get(NewOpc), MaskedLo)
3350 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3351 MaskedHi = MRI->createVirtualRegister(&RegRC);
3353 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), MaskHi)
3354 .
addReg(MaskReg, {}, AMDGPU::sub1);
3355 BuildMI(*BB, &
I,
DL, TII.get(NewOpc), MaskedHi)
3360 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3365 I.eraseFromParent();
3371static std::pair<Register, unsigned>
3378 std::tie(IdxBaseReg,
Offset) =
3380 if (IdxBaseReg == AMDGPU::NoRegister) {
3384 IdxBaseReg = IdxReg;
3391 if (
static_cast<unsigned>(
Offset) >= SubRegs.
size())
3392 return std::pair(IdxReg, SubRegs[0]);
3393 return std::pair(IdxBaseReg, SubRegs[
Offset]);
3396bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3402 LLT DstTy = MRI->getType(DstReg);
3403 LLT SrcTy = MRI->getType(SrcReg);
3405 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3406 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3407 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3411 if (IdxRB->
getID() != AMDGPU::SGPRRegBankID)
3414 const TargetRegisterClass *SrcRC =
3415 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3416 const TargetRegisterClass *DstRC =
3417 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3418 if (!SrcRC || !DstRC)
3420 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3421 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3422 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3425 MachineBasicBlock *BB =
MI.getParent();
3433 if (SrcRB->
getID() == AMDGPU::SGPRRegBankID) {
3437 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3440 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3442 .
addReg(SrcReg, {}, SubReg)
3444 MI.eraseFromParent();
3451 if (!STI.useVGPRIndexMode()) {
3452 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3454 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3455 .
addReg(SrcReg, {}, SubReg)
3457 MI.eraseFromParent();
3461 const MCInstrDesc &GPRIDXDesc =
3462 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC),
true);
3468 MI.eraseFromParent();
3473bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3480 LLT VecTy = MRI->getType(DstReg);
3481 LLT ValTy = MRI->getType(ValReg);
3485 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3486 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3487 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3493 if (IdxRB->
getID() != AMDGPU::SGPRRegBankID)
3496 const TargetRegisterClass *VecRC =
3497 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3498 const TargetRegisterClass *ValRC =
3499 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3501 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3502 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3503 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3504 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3507 if (VecRB->
getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3511 std::tie(IdxReg, SubReg) =
3514 const bool IndexMode = VecRB->
getID() == AMDGPU::VGPRRegBankID &&
3515 STI.useVGPRIndexMode();
3517 MachineBasicBlock *BB =
MI.getParent();
3521 BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3524 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3525 VecSize, ValSize, VecRB->
getID() == AMDGPU::SGPRRegBankID);
3530 MI.eraseFromParent();
3534 const MCInstrDesc &GPRIDXDesc =
3535 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC),
false);
3542 MI.eraseFromParent();
3548 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3549 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
3550 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3551 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
3552 case Intrinsic::amdgcn_load_async_to_lds:
3553 case Intrinsic::amdgcn_global_load_async_lds:
3559bool AMDGPUInstructionSelector::selectBufferLoadLds(
MachineInstr &
MI)
const {
3560 if (!Subtarget->hasVMemToLDSLoad())
3563 unsigned Size =
MI.getOperand(3).getImm();
3567 const bool HasVIndex =
MI.getNumOperands() == 9;
3571 VIndex =
MI.getOperand(4).getReg();
3575 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
3576 std::optional<ValueAndVReg> MaybeVOffset =
3578 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3584 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3585 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3586 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3587 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3590 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3591 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3592 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3593 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3596 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3597 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3598 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3599 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3602 if (!Subtarget->hasLDSLoadB96_B128())
3605 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3606 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3607 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3608 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3611 if (!Subtarget->hasLDSLoadB96_B128())
3614 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3615 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3616 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3617 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3621 MachineBasicBlock *
MBB =
MI.getParent();
3624 .
add(
MI.getOperand(2));
3628 if (HasVIndex && HasVOffset) {
3629 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3630 BuildMI(*
MBB, &*MIB,
DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3637 }
else if (HasVIndex) {
3639 }
else if (HasVOffset) {
3643 MIB.
add(
MI.getOperand(1));
3644 MIB.
add(
MI.getOperand(5 + OpOffset));
3645 MIB.
add(
MI.getOperand(6 + OpOffset));
3647 unsigned Aux =
MI.getOperand(7 + OpOffset).getImm();
3656 MachineMemOperand *LoadMMO = *
MI.memoperands_begin();
3661 MachinePointerInfo StorePtrI = LoadPtrI;
3672 MachineMemOperand *StoreMMO =
3678 MI.eraseFromParent();
3691 if (
Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3697 return Def->getOperand(1).getReg();
3711 if (
Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3719 return Def->getOperand(1).getReg();
3721 if (
VT->signBitIsZero(
Reg))
3722 return matchZeroExtendFromS32(
Reg);
3730AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(
Register Reg)
const {
3732 : matchZeroExtendFromS32(
Reg);
3738AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(
Register Reg)
const {
3740 : matchSignExtendFromS32(
Reg);
3744AMDGPUInstructionSelector::matchExtendFromS32OrS32(
Register Reg,
3745 bool IsSigned)
const {
3747 return matchSignExtendFromS32OrS32(
Reg);
3749 return matchZeroExtendFromS32OrS32(
Reg);
3759 if (
Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3766 return Def->getOperand(1).getReg();
3771bool AMDGPUInstructionSelector::selectGlobalLoadLds(
MachineInstr &
MI)
const{
3772 if (!Subtarget->hasVMemToLDSLoad())
3776 unsigned Size =
MI.getOperand(3).getImm();
3783 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3786 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3789 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3792 if (!Subtarget->hasLDSLoadB96_B128())
3794 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3797 if (!Subtarget->hasLDSLoadB96_B128())
3799 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3803 MachineBasicBlock *
MBB =
MI.getParent();
3806 .
add(
MI.getOperand(2));
3812 if (!isSGPR(Addr)) {
3814 if (isSGPR(AddrDef->Reg)) {
3815 Addr = AddrDef->Reg;
3816 }
else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3819 if (isSGPR(SAddr)) {
3820 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3821 if (
Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3832 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3844 MIB.
add(
MI.getOperand(4));
3846 unsigned Aux =
MI.getOperand(5).getImm();
3850 MachineMemOperand *LoadMMO = *
MI.memoperands_begin();
3852 LoadPtrI.
Offset =
MI.getOperand(4).getImm();
3853 MachinePointerInfo StorePtrI = LoadPtrI;
3862 MachineMemOperand *StoreMMO =
3864 sizeof(int32_t),
Align(4));
3868 MI.eraseFromParent();
3873bool AMDGPUInstructionSelector::selectTensorLoadStore(
MachineInstr &
MI,
3875 bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3877 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3881 const auto isAllZeros = [&](MachineOperand &Opnd) {
3882 const MachineInstr *
DefMI = MRI->getVRegDef(Opnd.getReg());
3891 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3892 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3897 MachineBasicBlock *
MBB =
MI.getParent();
3899 .
add(
MI.getOperand(1))
3900 .
add(
MI.getOperand(2));
3902 if (NumGroups >= 4) {
3903 MIB.
add(
MI.getOperand(3))
3904 .
add(
MI.getOperand(4));
3908 .
add(
MI.getOperand(6));
3910 MI.eraseFromParent();
3914bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3916 unsigned OpcodeOpIdx =
3917 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3918 MI.setDesc(TII.get(
MI.getOperand(OpcodeOpIdx).getImm()));
3919 MI.removeOperand(OpcodeOpIdx);
3920 MI.addImplicitDefUseOperands(*
MI.getMF());
3927bool AMDGPUInstructionSelector::selectSMFMACIntrin(
MachineInstr &
MI)
const {
3930 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3931 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3933 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3934 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3936 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3937 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3939 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3940 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3942 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3943 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3945 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3946 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3948 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3949 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3951 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3952 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3954 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3955 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3957 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3958 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3960 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3961 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3963 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3964 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3966 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3967 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3969 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3970 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3972 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3973 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3975 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3976 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3978 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3979 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3981 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3982 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3984 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3985 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3987 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3988 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3990 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3991 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3993 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3994 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3996 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3997 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3999 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
4000 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
4002 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
4003 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
4005 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
4006 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
4008 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
4009 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
4011 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
4012 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
4018 auto VDst_In =
MI.getOperand(4);
4020 MI.setDesc(TII.get(
Opc));
4021 MI.removeOperand(4);
4022 MI.removeOperand(1);
4023 MI.addOperand(VDst_In);
4024 MI.addImplicitDefUseOperands(*
MI.getMF());
4025 const MCInstrDesc &MCID =
MI.getDesc();
4027 MI.getOperand(0).setIsEarlyClobber(
true);
4032bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
4034 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
4035 !Subtarget->hasPermlane16Swap())
4037 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
4038 !Subtarget->hasPermlane32Swap())
4041 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
4042 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
4043 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
4045 MI.removeOperand(2);
4046 MI.setDesc(TII.get(Opcode));
4049 MachineOperand &FI =
MI.getOperand(4);
4056bool AMDGPUInstructionSelector::selectWaveAddress(
MachineInstr &
MI)
const {
4059 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4060 const bool IsVALU = DstRB->
getID() == AMDGPU::VGPRRegBankID;
4061 MachineBasicBlock *
MBB =
MI.getParent();
4065 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
4066 .
addImm(Subtarget->getWavefrontSizeLog2())
4071 .
addImm(Subtarget->getWavefrontSizeLog2())
4075 const TargetRegisterClass &RC =
4076 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
4077 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
4080 MI.eraseFromParent();
4084bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
4087 MachineBasicBlock *
MBB =
MI.getParent();
4094 const LLT DstTy = MRI->getType(DstReg);
4096 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4097 const TargetRegisterClass *DstRC =
4098 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
4103 if (!Subtarget->supportsBPermute())
4107 if (Subtarget->supportsWaveWideBPermute()) {
4108 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4109 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4119 assert(Subtarget->isWave64());
4123 MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
4124 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
4126 Register UndefExecReg = MRI->createVirtualRegister(
4127 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4128 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
4130 Register PoisonValReg = MRI->createVirtualRegister(DstRC);
4131 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
4139 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4140 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4144 Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
4145 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
4153 Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
4154 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
4159 Register SwappedValReg = MRI->createVirtualRegister(DstRC);
4160 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
4163 Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
4164 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
4169 Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
4170 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
4177 Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
4178 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
4182 Register XORReg = MRI->createVirtualRegister(DstRC);
4187 Register ANDReg = MRI->createVirtualRegister(DstRC);
4192 Register CompareReg = MRI->createVirtualRegister(
4193 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4194 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
4199 BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
4207 MI.eraseFromParent();
4216 unsigned NumOpcodes = 0;
4229 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4240 for (
unsigned I = 0;
I < Src.size(); ++
I) {
4254 if (Src.size() == 3) {
4261 for (
unsigned I = 0;
I < Src.size(); ++
I) {
4262 if (Src[
I] ==
LHS) {
4272 Bits = SrcBits[Src.size()];
4278 switch (
MI->getOpcode()) {
4279 case TargetOpcode::G_AND:
4280 case TargetOpcode::G_OR:
4281 case TargetOpcode::G_XOR: {
4286 if (!getOperandBits(
LHS, LHSBits) ||
4287 !getOperandBits(
RHS, RHSBits)) {
4288 Src = std::move(Backup);
4289 return std::make_pair(0, 0);
4295 NumOpcodes +=
Op.first;
4296 LHSBits =
Op.second;
4301 NumOpcodes +=
Op.first;
4302 RHSBits =
Op.second;
4307 return std::make_pair(0, 0);
4311 switch (
MI->getOpcode()) {
4312 case TargetOpcode::G_AND:
4313 TTbl = LHSBits & RHSBits;
4315 case TargetOpcode::G_OR:
4316 TTbl = LHSBits | RHSBits;
4318 case TargetOpcode::G_XOR:
4319 TTbl = LHSBits ^ RHSBits;
4325 return std::make_pair(NumOpcodes + 1, TTbl);
4328bool AMDGPUInstructionSelector::selectBITOP3(
MachineInstr &
MI)
const {
4329 if (!Subtarget->hasBitOp3Insts())
4333 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4334 const bool IsVALU = DstRB->
getID() == AMDGPU::VGPRRegBankID;
4340 unsigned NumOpcodes;
4342 std::tie(NumOpcodes, TTbl) =
BitOp3_Op(DstReg, Src, *MRI);
4346 if (NumOpcodes < 2 || Src.empty())
4349 const bool IsB32 = MRI->getType(DstReg) ==
LLT::scalar(32);
4350 if (NumOpcodes == 2 && IsB32) {
4358 }
else if (NumOpcodes < 4) {
4365 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4366 if (!IsB32 && STI.hasTrue16BitInsts())
4367 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4368 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4369 unsigned CBL = STI.getConstantBusLimit(
Opc);
4370 MachineBasicBlock *
MBB =
MI.getParent();
4373 for (
unsigned I = 0;
I < Src.size(); ++
I) {
4374 const RegisterBank *RB = RBI.getRegBank(Src[
I], *MRI, TRI);
4375 if (RB->
getID() != AMDGPU::SGPRRegBankID)
4381 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4392 while (Src.size() < 3)
4393 Src.push_back(Src[0]);
4410 MI.eraseFromParent();
4415bool AMDGPUInstructionSelector::selectStackRestore(
MachineInstr &
MI)
const {
4417 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4420 MachineInstr *
DefMI = MRI->getVRegDef(SrcReg);
4422 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4424 MachineBasicBlock *
MBB =
MI.getParent();
4428 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4431 .
addImm(Subtarget->getWavefrontSizeLog2())
4438 MI.eraseFromParent();
4444 if (!
I.isPreISelOpcode()) {
4446 return selectCOPY(
I);
4450 switch (
I.getOpcode()) {
4451 case TargetOpcode::G_AND:
4452 case TargetOpcode::G_OR:
4453 case TargetOpcode::G_XOR:
4454 if (selectBITOP3(
I))
4458 return selectG_AND_OR_XOR(
I);
4459 case TargetOpcode::G_ADD:
4460 case TargetOpcode::G_SUB:
4461 case TargetOpcode::G_PTR_ADD:
4464 return selectG_ADD_SUB(
I);
4465 case TargetOpcode::G_UADDO:
4466 case TargetOpcode::G_USUBO:
4467 case TargetOpcode::G_UADDE:
4468 case TargetOpcode::G_USUBE:
4469 return selectG_UADDO_USUBO_UADDE_USUBE(
I);
4470 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4471 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4472 return selectG_AMDGPU_MAD_64_32(
I);
4473 case TargetOpcode::G_INTTOPTR:
4474 case TargetOpcode::G_BITCAST:
4475 case TargetOpcode::G_PTRTOINT:
4476 case TargetOpcode::G_FREEZE:
4477 return selectCOPY(
I);
4478 case TargetOpcode::G_FNEG:
4481 return selectG_FNEG(
I);
4482 case TargetOpcode::G_FABS:
4485 return selectG_FABS(
I);
4486 case TargetOpcode::G_EXTRACT:
4487 return selectG_EXTRACT(
I);
4488 case TargetOpcode::G_MERGE_VALUES:
4489 case TargetOpcode::G_CONCAT_VECTORS:
4490 return selectG_MERGE_VALUES(
I);
4491 case TargetOpcode::G_UNMERGE_VALUES:
4492 return selectG_UNMERGE_VALUES(
I);
4493 case TargetOpcode::G_BUILD_VECTOR:
4494 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4495 return selectG_BUILD_VECTOR(
I);
4496 case TargetOpcode::G_IMPLICIT_DEF:
4497 return selectG_IMPLICIT_DEF(
I);
4498 case TargetOpcode::G_INSERT:
4499 return selectG_INSERT(
I);
4500 case TargetOpcode::G_INTRINSIC:
4501 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4502 return selectG_INTRINSIC(
I);
4503 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4504 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4505 return selectG_INTRINSIC_W_SIDE_EFFECTS(
I);
4506 case TargetOpcode::G_ICMP:
4507 case TargetOpcode::G_FCMP:
4508 if (selectG_ICMP_or_FCMP(
I))
4511 case TargetOpcode::G_LOAD:
4512 case TargetOpcode::G_ZEXTLOAD:
4513 case TargetOpcode::G_SEXTLOAD:
4514 case TargetOpcode::G_STORE:
4515 case TargetOpcode::G_ATOMIC_CMPXCHG:
4516 case TargetOpcode::G_ATOMICRMW_XCHG:
4517 case TargetOpcode::G_ATOMICRMW_ADD:
4518 case TargetOpcode::G_ATOMICRMW_SUB:
4519 case TargetOpcode::G_ATOMICRMW_AND:
4520 case TargetOpcode::G_ATOMICRMW_OR:
4521 case TargetOpcode::G_ATOMICRMW_XOR:
4522 case TargetOpcode::G_ATOMICRMW_MIN:
4523 case TargetOpcode::G_ATOMICRMW_MAX:
4524 case TargetOpcode::G_ATOMICRMW_UMIN:
4525 case TargetOpcode::G_ATOMICRMW_UMAX:
4526 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4527 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4528 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4529 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4530 case TargetOpcode::G_ATOMICRMW_FADD:
4531 case TargetOpcode::G_ATOMICRMW_FMIN:
4532 case TargetOpcode::G_ATOMICRMW_FMAX:
4533 return selectG_LOAD_STORE_ATOMICRMW(
I);
4534 case TargetOpcode::G_SELECT:
4535 return selectG_SELECT(
I);
4536 case TargetOpcode::G_TRUNC:
4537 return selectG_TRUNC(
I);
4538 case TargetOpcode::G_SEXT:
4539 case TargetOpcode::G_ZEXT:
4540 case TargetOpcode::G_ANYEXT:
4541 case TargetOpcode::G_SEXT_INREG:
4545 if (MRI->getType(
I.getOperand(1).getReg()) !=
LLT::scalar(1) &&
4548 return selectG_SZA_EXT(
I);
4549 case TargetOpcode::G_FPEXT:
4550 if (selectG_FPEXT(
I))
4553 case TargetOpcode::G_BRCOND:
4554 return selectG_BRCOND(
I);
4555 case TargetOpcode::G_GLOBAL_VALUE:
4556 return selectG_GLOBAL_VALUE(
I);
4557 case TargetOpcode::G_PTRMASK:
4558 return selectG_PTRMASK(
I);
4559 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4560 return selectG_EXTRACT_VECTOR_ELT(
I);
4561 case TargetOpcode::G_INSERT_VECTOR_ELT:
4562 return selectG_INSERT_VECTOR_ELT(
I);
4563 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4564 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4565 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4566 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4567 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4570 assert(Intr &&
"not an image intrinsic with image pseudo");
4571 return selectImageIntrinsic(
I, Intr);
4573 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4574 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4575 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4576 return selectBVHIntersectRayIntrinsic(
I);
4577 case AMDGPU::G_SBFX:
4578 case AMDGPU::G_UBFX:
4579 return selectG_SBFX_UBFX(
I);
4580 case AMDGPU::G_SI_CALL:
4581 I.setDesc(TII.get(AMDGPU::SI_CALL));
4583 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4584 return selectWaveAddress(
I);
4585 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4586 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4589 case AMDGPU::G_STACKRESTORE:
4590 return selectStackRestore(
I);
4592 return selectPHI(
I);
4593 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4594 return selectCOPY_SCC_VCC(
I);
4595 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4596 return selectCOPY_VCC_SCC(
I);
4597 case AMDGPU::G_AMDGPU_READANYLANE:
4598 return selectReadAnyLane(
I);
4599 case TargetOpcode::G_CONSTANT:
4600 case TargetOpcode::G_FCONSTANT:
4608AMDGPUInstructionSelector::selectVCSRC(
MachineOperand &Root)
const {
4615std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4616 Register Src,
bool IsCanonicalizing,
bool AllowAbs,
bool OpSel)
const {
4620 if (
MI->getOpcode() == AMDGPU::G_FNEG) {
4621 Src =
MI->getOperand(1).getReg();
4624 }
else if (
MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4629 if (
LHS &&
LHS->isZero()) {
4631 Src =
MI->getOperand(2).getReg();
4635 if (AllowAbs &&
MI->getOpcode() == AMDGPU::G_FABS) {
4636 Src =
MI->getOperand(1).getReg();
4643 return std::pair(Src, Mods);
4646std::pair<Register, unsigned>
4647AMDGPUInstructionSelector::selectVOP3PModsF32Impl(
Register Src)
const {
4649 std::tie(Src, Mods) = selectVOP3ModsImpl(Src);
4651 return std::pair(Src, Mods);
4654Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4656 bool ForceVGPR)
const {
4657 if ((Mods != 0 || ForceVGPR) &&
4658 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4665 TII.
get(AMDGPU::COPY), VGPRSrc)
4677AMDGPUInstructionSelector::selectVSRC0(
MachineOperand &Root)
const {
4679 [=](MachineInstrBuilder &MIB) { MIB.
add(Root); }
4684AMDGPUInstructionSelector::selectVOP3Mods0(
MachineOperand &Root)
const {
4687 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.
getReg());
4690 [=](MachineInstrBuilder &MIB) {
4691 MIB.
addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4693 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); },
4694 [=](MachineInstrBuilder &MIB) { MIB.
addImm(0); },
4695 [=](MachineInstrBuilder &MIB) { MIB.
addImm(0); }
4700AMDGPUInstructionSelector::selectVOP3BMods0(
MachineOperand &Root)
const {
4703 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.
getReg(),
4708 [=](MachineInstrBuilder &MIB) {
4709 MIB.
addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4711 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); },
4712 [=](MachineInstrBuilder &MIB) { MIB.
addImm(0); },
4713 [=](MachineInstrBuilder &MIB) { MIB.
addImm(0); }
4718AMDGPUInstructionSelector::selectVOP3OMods(
MachineOperand &Root)
const {
4720 [=](MachineInstrBuilder &MIB) { MIB.
add(Root); },
4721 [=](MachineInstrBuilder &MIB) { MIB.
addImm(0); },
4722 [=](MachineInstrBuilder &MIB) { MIB.
addImm(0); }
4727AMDGPUInstructionSelector::selectVOP3Mods(
MachineOperand &Root)
const {
4730 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.
getReg());
4733 [=](MachineInstrBuilder &MIB) {
4734 MIB.
addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4736 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); }
4741AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4745 std::tie(Src, Mods) =
4746 selectVOP3ModsImpl(Root.
getReg(),
false);
4749 [=](MachineInstrBuilder &MIB) {
4750 MIB.
addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4752 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); }
4757AMDGPUInstructionSelector::selectVOP3BMods(
MachineOperand &Root)
const {
4760 std::tie(Src, Mods) =
4761 selectVOP3ModsImpl(Root.
getReg(),
true,
4765 [=](MachineInstrBuilder &MIB) {
4766 MIB.
addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4768 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); }
4773AMDGPUInstructionSelector::selectVOP3NoMods(
MachineOperand &Root)
const {
4776 if (
Def->getOpcode() == AMDGPU::G_FNEG ||
Def->getOpcode() == AMDGPU::G_FABS)
4779 [=](MachineInstrBuilder &MIB) { MIB.
addReg(
Reg); },
4804 if (
MI->getOpcode() != AMDGPU::G_TRUNC)
4809 return DstSize * 2 == SrcSize;
4815 if (
MI->getOpcode() != AMDGPU::G_LSHR)
4819 std::optional<ValueAndVReg> ShiftAmt;
4820 if (
mi_match(
MI->getOperand(0).getReg(), MRI,
4823 unsigned Shift = ShiftAmt->Value.getZExtValue();
4824 return Shift * 2 == SrcSize;
4832 if (
MI->getOpcode() != AMDGPU::G_SHL)
4836 std::optional<ValueAndVReg> ShiftAmt;
4837 if (
mi_match(
MI->getOperand(0).getReg(), MRI,
4840 unsigned Shift = ShiftAmt->Value.getZExtValue();
4841 return Shift * 2 == SrcSize;
4849 if (
MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4851 return MI->getNumOperands() == 3 &&
MI->getOperand(0).isDef() &&
4852 MI->getOperand(1).isDef() && !
MI->getOperand(2).isDef();
5022static std::optional<std::pair<Register, SrcStatus>>
5027 unsigned Opc =
MI->getOpcode();
5031 case AMDGPU::G_BITCAST:
5032 return std::optional<std::pair<Register, SrcStatus>>(
5033 {
MI->getOperand(1).getReg(), Curr.second});
5035 if (
MI->getOperand(1).getReg().isPhysical())
5036 return std::nullopt;
5037 return std::optional<std::pair<Register, SrcStatus>>(
5038 {
MI->getOperand(1).getReg(), Curr.second});
5039 case AMDGPU::G_FNEG: {
5042 return std::nullopt;
5043 return std::optional<std::pair<Register, SrcStatus>>(
5044 {
MI->getOperand(1).getReg(), Stat});
5051 switch (Curr.second) {
5054 return std::optional<std::pair<Register, SrcStatus>>(
5057 if (Curr.first ==
MI->getOperand(0).getReg())
5058 return std::optional<std::pair<Register, SrcStatus>>(
5060 return std::optional<std::pair<Register, SrcStatus>>(
5072 return std::optional<std::pair<Register, SrcStatus>>(
5076 if (Curr.first ==
MI->getOperand(0).getReg())
5077 return std::optional<std::pair<Register, SrcStatus>>(
5079 return std::optional<std::pair<Register, SrcStatus>>(
5085 return std::optional<std::pair<Register, SrcStatus>>(
5090 return std::optional<std::pair<Register, SrcStatus>>(
5095 return std::optional<std::pair<Register, SrcStatus>>(
5100 return std::optional<std::pair<Register, SrcStatus>>(
5106 return std::nullopt;
5116 bool HasNeg =
false;
5118 bool HasOpsel =
true;
5123 unsigned Opc =
MI->getOpcode();
5125 if (
Opc == TargetOpcode::G_INTRINSIC) {
5128 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
5155 while (
Depth <= MaxDepth && Curr.has_value()) {
5158 Statlist.push_back(Curr.value());
5165static std::pair<Register, SrcStatus>
5172 while (
Depth <= MaxDepth && Curr.has_value()) {
5178 LastSameOrNeg = Curr.value();
5183 return LastSameOrNeg;
5190 return Width1 == Width2;
5225 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
5226 IsHalfState(HiStat);
5229std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5235 return {RootReg, Mods};
5238 SearchOptions SO(RootReg, MRI);
5249 MachineInstr *
MI = MRI.getVRegDef(Stat.first);
5251 if (
MI->getOpcode() != AMDGPU::G_BUILD_VECTOR ||
MI->getNumOperands() != 3 ||
5252 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5254 return {Stat.first, Mods};
5260 if (StatlistHi.
empty()) {
5262 return {Stat.first, Mods};
5268 if (StatlistLo.
empty()) {
5270 return {Stat.first, Mods};
5273 for (
int I = StatlistHi.
size() - 1;
I >= 0;
I--) {
5274 for (
int J = StatlistLo.
size() - 1; J >= 0; J--) {
5275 if (StatlistHi[
I].first == StatlistLo[J].first &&
5277 StatlistHi[
I].first, RootReg, TII, MRI))
5278 return {StatlistHi[
I].first,
5279 updateMods(StatlistHi[
I].second, StatlistLo[J].second, Mods)};
5285 return {Stat.first, Mods};
5295 return RB->
getID() == RBNo;
5312 if (
checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI,
TRI) ||
5313 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI,
TRI))
5317 if (
MI->getOpcode() == AMDGPU::COPY && NewReg ==
MI->getOperand(1).getReg()) {
5326 BuildMI(*BB,
MI,
MI->getDebugLoc(),
TII.get(AMDGPU::COPY), DstReg)
5334AMDGPUInstructionSelector::selectVOP3PRetHelper(
MachineOperand &Root,
5339 std::tie(
Reg, Mods) = selectVOP3PModsImpl(Root.
getReg(), MRI, IsDOT);
5343 [=](MachineInstrBuilder &MIB) { MIB.
addReg(
Reg); },
5344 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); }
5349AMDGPUInstructionSelector::selectVOP3PMods(
MachineOperand &Root)
const {
5351 return selectVOP3PRetHelper(Root);
5355AMDGPUInstructionSelector::selectVOP3PModsDOT(
MachineOperand &Root)
const {
5357 return selectVOP3PRetHelper(Root,
true);
5361AMDGPUInstructionSelector::selectVOP3PNoModsDOT(
MachineOperand &Root)
const {
5365 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.
getReg(), MRI,
true );
5369 return {{[=](MachineInstrBuilder &MIB) { MIB.
addReg(Src); }}};
5373AMDGPUInstructionSelector::selectVOP3PModsF32(
MachineOperand &Root)
const {
5376 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.
getReg());
5379 [=](MachineInstrBuilder &MIB) { MIB.
addReg(Src); },
5380 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); }
5385AMDGPUInstructionSelector::selectVOP3PNoModsF32(
MachineOperand &Root)
const {
5388 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.
getReg());
5392 return {{[=](MachineInstrBuilder &MIB) { MIB.
addReg(Src); }}};
5396AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5399 "expected i1 value");
5405 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); }
5413 switch (Elts.
size()) {
5415 DstRegClass = &AMDGPU::VReg_256RegClass;
5418 DstRegClass = &AMDGPU::VReg_128RegClass;
5421 DstRegClass = &AMDGPU::VReg_64RegClass;
5428 auto MIB =
B.buildInstr(AMDGPU::REG_SEQUENCE)
5430 for (
unsigned i = 0; i < Elts.
size(); ++i) {
5441 if (ModOpcode == TargetOpcode::G_FNEG) {
5445 for (
auto El : Elts) {
5451 if (Elts.size() != NegAbsElts.
size()) {
5460 assert(ModOpcode == TargetOpcode::G_FABS);
5468AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(
MachineOperand &Root)
const {
5474 assert(BV->getNumSources() > 0);
5476 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5477 unsigned ModOpcode = (ElF32->
getOpcode() == AMDGPU::G_FNEG)
5480 for (
unsigned i = 0; i < BV->getNumSources(); ++i) {
5481 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5488 if (BV->getNumSources() == EltsF32.
size()) {
5494 return {{[=](MachineInstrBuilder &MIB) { MIB.
addReg(Src); },
5495 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); }}};
5499AMDGPUInstructionSelector::selectWMMAModsF16Neg(
MachineOperand &Root)
const {
5505 for (
unsigned i = 0; i < CV->getNumSources(); ++i) {
5513 if (CV->getNumSources() == EltsV2F16.
size()) {
5520 return {{[=](MachineInstrBuilder &MIB) { MIB.
addReg(Src); },
5521 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); }}};
5525AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(
MachineOperand &Root)
const {
5531 assert(CV->getNumSources() > 0);
5532 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5534 unsigned ModOpcode = (ElV2F16->
getOpcode() == AMDGPU::G_FNEG)
5538 for (
unsigned i = 0; i < CV->getNumSources(); ++i) {
5539 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5546 if (CV->getNumSources() == EltsV2F16.
size()) {
5553 return {{[=](MachineInstrBuilder &MIB) { MIB.
addReg(Src); },
5554 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); }}};
5558AMDGPUInstructionSelector::selectWMMAVISrc(
MachineOperand &Root)
const {
5559 std::optional<FPValueAndVReg> FPValReg;
5561 if (TII.isInlineConstant(FPValReg->Value)) {
5562 return {{[=](MachineInstrBuilder &MIB) {
5563 MIB.
addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5573 if (TII.isInlineConstant(ICst)) {
5583AMDGPUInstructionSelector::selectSWMMACIndex8(
MachineOperand &Root)
const {
5589 std::optional<ValueAndVReg> ShiftAmt;
5591 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5592 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5593 Key = ShiftAmt->Value.getZExtValue() / 8;
5598 [=](MachineInstrBuilder &MIB) { MIB.
addReg(Src); },
5599 [=](MachineInstrBuilder &MIB) { MIB.
addImm(
Key); }
5604AMDGPUInstructionSelector::selectSWMMACIndex16(
MachineOperand &Root)
const {
5611 std::optional<ValueAndVReg> ShiftAmt;
5613 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5614 ShiftAmt->Value.getZExtValue() == 16) {
5620 [=](MachineInstrBuilder &MIB) { MIB.
addReg(Src); },
5621 [=](MachineInstrBuilder &MIB) { MIB.
addImm(
Key); }
5626AMDGPUInstructionSelector::selectSWMMACIndex32(
MachineOperand &Root)
const {
5633 S32 = matchAnyExtendFromS32(Src);
5637 if (
Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5642 Src =
Def->getOperand(2).getReg();
5649 [=](MachineInstrBuilder &MIB) { MIB.
addReg(Src); },
5650 [=](MachineInstrBuilder &MIB) { MIB.
addImm(
Key); }
5655AMDGPUInstructionSelector::selectVOP3OpSelMods(
MachineOperand &Root)
const {
5658 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.
getReg());
5662 [=](MachineInstrBuilder &MIB) { MIB.
addReg(Src); },
5663 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); }
5669AMDGPUInstructionSelector::selectVINTERPMods(
MachineOperand &Root)
const {
5672 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.
getReg(),
5678 [=](MachineInstrBuilder &MIB) {
5680 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB,
true));
5682 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); },
5687AMDGPUInstructionSelector::selectVINTERPModsHi(
MachineOperand &Root)
const {
5690 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.
getReg(),
5696 [=](MachineInstrBuilder &MIB) {
5698 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB,
true));
5700 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); },
5707bool AMDGPUInstructionSelector::selectScaleOffset(
MachineOperand &Root,
5709 bool IsSigned)
const {
5710 if (!Subtarget->hasScaleOffset())
5714 MachineMemOperand *MMO = *
MI.memoperands_begin();
5726 OffsetReg =
Def->Reg;
5741 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5745 (
Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5746 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5747 (IsSigned &&
Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5748 VT->signBitIsZero(
Mul->getOperand(2).getReg()))) &&
5761bool AMDGPUInstructionSelector::selectSmrdOffset(
MachineOperand &Root,
5765 bool *ScaleOffset)
const {
5767 MachineBasicBlock *
MBB =
MI->getParent();
5772 getAddrModeInfo(*
MI, *MRI, AddrInfo);
5774 if (AddrInfo.
empty())
5777 const GEPInfo &GEPI = AddrInfo[0];
5778 std::optional<int64_t> EncodedImm;
5781 *ScaleOffset =
false;
5786 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5787 AddrInfo.
size() > 1) {
5788 const GEPInfo &GEPI2 = AddrInfo[1];
5789 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5790 Register OffsetReg = GEPI2.SgprParts[1];
5793 selectScaleOffset(Root, OffsetReg,
false );
5794 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5796 Base = GEPI2.SgprParts[0];
5797 *SOffset = OffsetReg;
5806 auto SKnown =
VT->getKnownBits(*SOffset);
5807 if (*
Offset + SKnown.getMinValue().getSExtValue() < 0)
5819 if (
Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5820 Base = GEPI.SgprParts[0];
5826 if (SOffset && GEPI.SgprParts.size() == 1 &&
isUInt<32>(GEPI.Imm) &&
5832 Base = GEPI.SgprParts[0];
5833 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5834 BuildMI(*
MBB,
MI,
MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5839 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5840 Register OffsetReg = GEPI.SgprParts[1];
5842 *ScaleOffset = selectScaleOffset(Root, OffsetReg,
false );
5843 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5845 Base = GEPI.SgprParts[0];
5846 *SOffset = OffsetReg;
5855AMDGPUInstructionSelector::selectSmrdImm(
MachineOperand &Root)
const {
5858 if (!selectSmrdOffset(Root,
Base,
nullptr, &
Offset,
5860 return std::nullopt;
5862 return {{[=](MachineInstrBuilder &MIB) { MIB.
addReg(
Base); },
5863 [=](MachineInstrBuilder &MIB) { MIB.
addImm(
Offset); }}};
5867AMDGPUInstructionSelector::selectSmrdImm32(
MachineOperand &Root)
const {
5869 getAddrModeInfo(*Root.
getParent(), *MRI, AddrInfo);
5871 if (AddrInfo.
empty() || AddrInfo[0].SgprParts.size() != 1)
5872 return std::nullopt;
5874 const GEPInfo &GEPInfo = AddrInfo[0];
5875 Register PtrReg = GEPInfo.SgprParts[0];
5876 std::optional<int64_t> EncodedImm =
5879 return std::nullopt;
5882 [=](MachineInstrBuilder &MIB) { MIB.
addReg(PtrReg); },
5883 [=](MachineInstrBuilder &MIB) { MIB.
addImm(*EncodedImm); }
5888AMDGPUInstructionSelector::selectSmrdSgpr(
MachineOperand &Root)
const {
5891 if (!selectSmrdOffset(Root,
Base, &SOffset,
nullptr,
5893 return std::nullopt;
5896 return {{[=](MachineInstrBuilder &MIB) { MIB.
addReg(
Base); },
5897 [=](MachineInstrBuilder &MIB) { MIB.
addReg(SOffset); },
5898 [=](MachineInstrBuilder &MIB) { MIB.
addImm(CPol); }}};
5902AMDGPUInstructionSelector::selectSmrdSgprImm(
MachineOperand &Root)
const {
5906 if (!selectSmrdOffset(Root,
Base, &SOffset, &
Offset, &ScaleOffset))
5907 return std::nullopt;
5910 return {{[=](MachineInstrBuilder &MIB) { MIB.
addReg(
Base); },
5911 [=](MachineInstrBuilder &MIB) { MIB.
addReg(SOffset); },
5913 [=](MachineInstrBuilder &MIB) { MIB.
addImm(CPol); }}};
5916std::pair<Register, int>
5917AMDGPUInstructionSelector::selectFlatOffsetImpl(
MachineOperand &Root,
5918 uint64_t FlatVariant)
const {
5923 if (!STI.hasFlatInstOffsets())
5927 int64_t ConstOffset;
5929 std::tie(PtrBase, ConstOffset, IsInBounds) =
5930 getPtrBaseWithConstantOffset(Root.
getReg(), *MRI);
5936 if (ConstOffset == 0 ||
5938 !isFlatScratchBaseLegal(Root.
getReg())) ||
5942 unsigned AddrSpace = (*
MI->memoperands_begin())->getAddrSpace();
5943 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5946 return std::pair(PtrBase, ConstOffset);
5950AMDGPUInstructionSelector::selectFlatOffset(
MachineOperand &Root)
const {
5954 [=](MachineInstrBuilder &MIB) { MIB.
addReg(PtrWithOffset.first); },
5955 [=](MachineInstrBuilder &MIB) { MIB.
addImm(PtrWithOffset.second); },
5960AMDGPUInstructionSelector::selectGlobalOffset(
MachineOperand &Root)
const {
5964 [=](MachineInstrBuilder &MIB) { MIB.
addReg(PtrWithOffset.first); },
5965 [=](MachineInstrBuilder &MIB) { MIB.
addImm(PtrWithOffset.second); },
5970AMDGPUInstructionSelector::selectScratchOffset(
MachineOperand &Root)
const {
5974 [=](MachineInstrBuilder &MIB) { MIB.
addReg(PtrWithOffset.first); },
5975 [=](MachineInstrBuilder &MIB) { MIB.
addImm(PtrWithOffset.second); },
5981AMDGPUInstructionSelector::selectGlobalSAddr(
MachineOperand &Root,
5983 bool NeedIOffset)
const {
5986 int64_t ConstOffset;
5987 int64_t ImmOffset = 0;
5991 std::tie(PtrBase, ConstOffset, std::ignore) =
5992 getPtrBaseWithConstantOffset(Addr, *MRI);
5994 if (ConstOffset != 0) {
5999 ImmOffset = ConstOffset;
6002 if (isSGPR(PtrBaseDef->Reg)) {
6003 if (ConstOffset > 0) {
6009 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
6011 std::tie(SplitImmOffset, RemainderOffset) =
6016 if (Subtarget->hasSignedGVSOffset() ?
isInt<32>(RemainderOffset)
6019 MachineBasicBlock *
MBB =
MI->getParent();
6021 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6023 BuildMI(*
MBB,
MI,
MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6025 .
addImm(RemainderOffset);
6029 [=](MachineInstrBuilder &MIB) {
6032 [=](MachineInstrBuilder &MIB) {
6035 [=](MachineInstrBuilder &MIB) { MIB.
addImm(SplitImmOffset); },
6036 [=](MachineInstrBuilder &MIB) { MIB.
addImm(CPolBits); },
6039 [=](MachineInstrBuilder &MIB) { MIB.
addReg(PtrBase); },
6040 [=](MachineInstrBuilder &MIB) {
6043 [=](MachineInstrBuilder &MIB) { MIB.
addImm(CPolBits); },
6053 unsigned NumLiterals =
6054 !TII.isInlineConstant(APInt(32,
Lo_32(ConstOffset))) +
6055 !TII.isInlineConstant(APInt(32,
Hi_32(ConstOffset)));
6056 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
6057 return std::nullopt;
6064 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6069 if (isSGPR(SAddr)) {
6070 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
6074 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
6075 Subtarget->hasSignedGVSOffset());
6076 if (
Register VOffset = matchExtendFromS32OrS32(
6077 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
6079 return {{[=](MachineInstrBuilder &MIB) {
6082 [=](MachineInstrBuilder &MIB) {
6085 [=](MachineInstrBuilder &MIB) {
6088 [=](MachineInstrBuilder &MIB) {
6092 return {{[=](MachineInstrBuilder &MIB) {
6095 [=](MachineInstrBuilder &MIB) {
6098 [=](MachineInstrBuilder &MIB) {
6108 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
6109 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
6110 return std::nullopt;
6115 MachineBasicBlock *
MBB =
MI->getParent();
6116 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6118 BuildMI(*
MBB,
MI,
MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
6123 [=](MachineInstrBuilder &MIB) { MIB.
addReg(AddrDef->Reg); },
6124 [=](MachineInstrBuilder &MIB) { MIB.
addReg(VOffset); },
6125 [=](MachineInstrBuilder &MIB) { MIB.
addImm(ImmOffset); },
6126 [=](MachineInstrBuilder &MIB) { MIB.
addImm(CPolBits); }
6129 [=](MachineInstrBuilder &MIB) { MIB.
addReg(AddrDef->Reg); },
6130 [=](MachineInstrBuilder &MIB) { MIB.
addReg(VOffset); },
6131 [=](MachineInstrBuilder &MIB) { MIB.
addImm(CPolBits); }
6136AMDGPUInstructionSelector::selectGlobalSAddr(
MachineOperand &Root)
const {
6137 return selectGlobalSAddr(Root, 0);
6141AMDGPUInstructionSelector::selectGlobalSAddrCPol(
MachineOperand &Root)
const {
6147 return selectGlobalSAddr(Root, PassedCPol);
6151AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(
MachineOperand &Root)
const {
6157 return selectGlobalSAddr(Root, PassedCPol);
6161AMDGPUInstructionSelector::selectGlobalSAddrGLC(
MachineOperand &Root)
const {
6166AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
6173 return selectGlobalSAddr(Root, PassedCPol,
false);
6177AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
6184 return selectGlobalSAddr(Root, PassedCPol,
false);
6188AMDGPUInstructionSelector::selectScratchSAddr(
MachineOperand &Root)
const {
6191 int64_t ConstOffset;
6192 int64_t ImmOffset = 0;
6196 std::tie(PtrBase, ConstOffset, std::ignore) =
6197 getPtrBaseWithConstantOffset(Addr, *MRI);
6199 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
6203 ImmOffset = ConstOffset;
6207 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6208 int FI = AddrDef->MI->getOperand(1).
getIndex();
6211 [=](MachineInstrBuilder &MIB) { MIB.
addImm(ImmOffset); }
6217 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6218 Register LHS = AddrDef->MI->getOperand(1).getReg();
6219 Register RHS = AddrDef->MI->getOperand(2).getReg();
6223 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
6224 isSGPR(RHSDef->Reg)) {
6225 int FI = LHSDef->MI->getOperand(1).getIndex();
6229 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6231 BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
6239 return std::nullopt;
6242 [=](MachineInstrBuilder &MIB) { MIB.
addReg(SAddr); },
6243 [=](MachineInstrBuilder &MIB) { MIB.
addImm(ImmOffset); }
6248bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6250 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6256 auto VKnown =
VT->getKnownBits(VAddr);
6259 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6260 uint64_t
SMax = SKnown.getMaxValue().getZExtValue();
6261 return (VMax & 3) + (
SMax & 3) >= 4;
6265AMDGPUInstructionSelector::selectScratchSVAddr(
MachineOperand &Root)
const {
6268 int64_t ConstOffset;
6269 int64_t ImmOffset = 0;
6273 std::tie(PtrBase, ConstOffset, std::ignore) =
6274 getPtrBaseWithConstantOffset(Addr, *MRI);
6277 if (ConstOffset != 0 &&
6281 ImmOffset = ConstOffset;
6285 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6286 return std::nullopt;
6288 Register RHS = AddrDef->MI->getOperand(2).getReg();
6289 if (RBI.getRegBank(
RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6290 return std::nullopt;
6292 Register LHS = AddrDef->MI->getOperand(1).getReg();
6295 if (OrigAddr != Addr) {
6296 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
6297 return std::nullopt;
6299 if (!isFlatScratchBaseLegalSV(OrigAddr))
6300 return std::nullopt;
6303 if (checkFlatScratchSVSSwizzleBug(
RHS,
LHS, ImmOffset))
6304 return std::nullopt;
6306 unsigned CPol = selectScaleOffset(Root,
RHS,
true )
6310 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6311 int FI = LHSDef->MI->getOperand(1).getIndex();
6313 [=](MachineInstrBuilder &MIB) { MIB.
addReg(
RHS); },
6315 [=](MachineInstrBuilder &MIB) { MIB.
addImm(ImmOffset); },
6316 [=](MachineInstrBuilder &MIB) { MIB.
addImm(CPol); }
6325 return std::nullopt;
6328 [=](MachineInstrBuilder &MIB) { MIB.
addReg(
RHS); },
6329 [=](MachineInstrBuilder &MIB) { MIB.
addReg(
LHS); },
6330 [=](MachineInstrBuilder &MIB) { MIB.
addImm(ImmOffset); },
6331 [=](MachineInstrBuilder &MIB) { MIB.
addImm(CPol); }
6336AMDGPUInstructionSelector::selectMUBUFScratchOffen(
MachineOperand &Root)
const {
6338 MachineBasicBlock *
MBB =
MI->getParent();
6340 const SIMachineFunctionInfo *
Info =
MF->getInfo<SIMachineFunctionInfo>();
6345 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6350 BuildMI(*
MBB,
MI,
MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6354 return {{[=](MachineInstrBuilder &MIB) {
6357 [=](MachineInstrBuilder &MIB) {
6360 [=](MachineInstrBuilder &MIB) {
6365 [=](MachineInstrBuilder &MIB) {
6374 std::optional<int> FI;
6377 const MachineInstr *RootDef = MRI->getVRegDef(Root.
getReg());
6379 int64_t ConstOffset;
6380 std::tie(PtrBase, ConstOffset, std::ignore) =
6381 getPtrBaseWithConstantOffset(VAddr, *MRI);
6382 if (ConstOffset != 0) {
6383 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6384 (!STI.privateMemoryResourceIsRangeChecked() ||
6385 VT->signBitIsZero(PtrBase))) {
6386 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6387 if (PtrBaseDef->
getOpcode() == AMDGPU::G_FRAME_INDEX)
6393 }
else if (RootDef->
getOpcode() == AMDGPU::G_FRAME_INDEX) {
6397 return {{[=](MachineInstrBuilder &MIB) {
6400 [=](MachineInstrBuilder &MIB) {
6406 [=](MachineInstrBuilder &MIB) {
6411 [=](MachineInstrBuilder &MIB) {
6416bool AMDGPUInstructionSelector::isDSOffsetLegal(
Register Base,
6421 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6426 return VT->signBitIsZero(
Base);
6429bool AMDGPUInstructionSelector::isDSOffset2Legal(
Register Base, int64_t Offset0,
6431 unsigned Size)
const {
6432 if (Offset0 %
Size != 0 || Offset1 %
Size != 0)
6437 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6442 return VT->signBitIsZero(
Base);
6447 return Addr->
getOpcode() == TargetOpcode::G_OR ||
6448 (Addr->
getOpcode() == TargetOpcode::G_PTR_ADD &&
6455bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
Register Addr)
const {
6463 if (STI.hasSignedScratchOffsets())
6469 if (AddrMI->
getOpcode() == TargetOpcode::G_PTR_ADD) {
6470 std::optional<ValueAndVReg> RhsValReg =
6476 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6477 RhsValReg->Value.getSExtValue() > -0x40000000)
6481 return VT->signBitIsZero(
LHS);
6486bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(
Register Addr)
const {
6494 if (STI.hasSignedScratchOffsets())
6499 return VT->signBitIsZero(
RHS) &&
VT->signBitIsZero(
LHS);
6504bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6508 if (STI.hasSignedScratchOffsets())
6513 std::optional<DefinitionAndSourceRegister> BaseDef =
6515 std::optional<ValueAndVReg> RHSOffset =
6525 (RHSOffset->Value.getSExtValue() < 0 &&
6526 RHSOffset->Value.getSExtValue() > -0x40000000)))
6529 Register LHS = BaseDef->MI->getOperand(1).getReg();
6530 Register RHS = BaseDef->MI->getOperand(2).getReg();
6531 return VT->signBitIsZero(
RHS) &&
VT->signBitIsZero(
LHS);
6534bool AMDGPUInstructionSelector::isUnneededShiftMask(
const MachineInstr &
MI,
6535 unsigned ShAmtBits)
const {
6536 assert(
MI.getOpcode() == TargetOpcode::G_AND);
6538 std::optional<APInt>
RHS =
6543 if (
RHS->countr_one() >= ShAmtBits)
6546 const APInt &LHSKnownZeros =
VT->getKnownZeroes(
MI.getOperand(1).getReg());
6547 return (LHSKnownZeros | *
RHS).countr_one() >= ShAmtBits;
6551AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6554 const SIMachineFunctionInfo *
Info =
MF->getInfo<SIMachineFunctionInfo>();
6556 std::optional<DefinitionAndSourceRegister>
Def =
6558 assert(Def &&
"this shouldn't be an optional result");
6563 [=](MachineInstrBuilder &MIB) {
6566 [=](MachineInstrBuilder &MIB) {
6569 [=](MachineInstrBuilder &MIB) { MIB.
addImm(0); }
6580 if (!TII.isLegalMUBUFImmOffset(
Offset))
6588 [=](MachineInstrBuilder &MIB) {
6591 [=](MachineInstrBuilder &MIB) {
6599 !TII.isLegalMUBUFImmOffset(
Offset))
6603 [=](MachineInstrBuilder &MIB) {
6606 [=](MachineInstrBuilder &MIB) {
6613std::pair<Register, unsigned>
6614AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(
MachineOperand &Root)
const {
6615 const MachineInstr *RootDef = MRI->getVRegDef(Root.
getReg());
6616 int64_t ConstAddr = 0;
6620 std::tie(PtrBase,
Offset, std::ignore) =
6621 getPtrBaseWithConstantOffset(Root.
getReg(), *MRI);
6624 if (isDSOffsetLegal(PtrBase,
Offset)) {
6626 return std::pair(PtrBase,
Offset);
6628 }
else if (RootDef->
getOpcode() == AMDGPU::G_SUB) {
6637 return std::pair(Root.
getReg(), 0);
6641AMDGPUInstructionSelector::selectDS1Addr1Offset(
MachineOperand &Root)
const {
6644 std::tie(
Reg,
Offset) = selectDS1Addr1OffsetImpl(Root);
6646 [=](MachineInstrBuilder &MIB) { MIB.
addReg(
Reg); },
6652AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(
MachineOperand &Root)
const {
6653 return selectDSReadWrite2(Root, 4);
6657AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(
MachineOperand &Root)
const {
6658 return selectDSReadWrite2(Root, 8);
6662AMDGPUInstructionSelector::selectDSReadWrite2(
MachineOperand &Root,
6663 unsigned Size)
const {
6668 [=](MachineInstrBuilder &MIB) { MIB.
addReg(
Reg); },
6670 [=](MachineInstrBuilder &MIB) { MIB.
addImm(
Offset+1); }
6674std::pair<Register, unsigned>
6675AMDGPUInstructionSelector::selectDSReadWrite2Impl(
MachineOperand &Root,
6676 unsigned Size)
const {
6677 const MachineInstr *RootDef = MRI->getVRegDef(Root.
getReg());
6678 int64_t ConstAddr = 0;
6682 std::tie(PtrBase,
Offset, std::ignore) =
6683 getPtrBaseWithConstantOffset(Root.
getReg(), *MRI);
6686 int64_t OffsetValue0 =
Offset;
6688 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1,
Size)) {
6690 return std::pair(PtrBase, OffsetValue0 /
Size);
6692 }
else if (RootDef->
getOpcode() == AMDGPU::G_SUB) {
6700 return std::pair(Root.
getReg(), 0);
6708std::tuple<Register, int64_t, bool>
6709AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6712 if (RootI->
getOpcode() != TargetOpcode::G_PTR_ADD)
6713 return {Root, 0,
false};
6716 std::optional<ValueAndVReg> MaybeOffset =
6719 return {Root, 0,
false};
6739 B.buildInstr(AMDGPU::S_MOV_B32)
6742 B.buildInstr(AMDGPU::S_MOV_B32)
6749 B.buildInstr(AMDGPU::REG_SEQUENCE)
6752 .addImm(AMDGPU::sub0)
6754 .addImm(AMDGPU::sub1);
6759 B.buildInstr(AMDGPU::S_MOV_B64)
6764 B.buildInstr(AMDGPU::REG_SEQUENCE)
6767 .addImm(AMDGPU::sub0_sub1)
6769 .addImm(AMDGPU::sub2_sub3);
6776 uint64_t DefaultFormat =
TII.getDefaultRsrcDataFormat();
6785 uint64_t DefaultFormat =
TII.getDefaultRsrcDataFormat();
6792AMDGPUInstructionSelector::MUBUFAddressData
6793AMDGPUInstructionSelector::parseMUBUFAddress(
Register Src)
const {
6794 MUBUFAddressData
Data;
6800 std::tie(PtrBase,
Offset, std::ignore) =
6801 getPtrBaseWithConstantOffset(Src, *MRI);
6807 if (MachineInstr *InputAdd
6809 Data.N2 = InputAdd->getOperand(1).getReg();
6810 Data.N3 = InputAdd->getOperand(2).getReg();
6825bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr)
const {
6831 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6832 return N0Bank->
getID() == AMDGPU::VGPRRegBankID;
6838void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6840 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6844 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6845 B.buildInstr(AMDGPU::S_MOV_B32)
6851bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6856 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6859 MUBUFAddressData AddrData = parseMUBUFAddress(Root.
getReg());
6860 if (!shouldUseAddr64(AddrData))
6866 Offset = AddrData.Offset;
6872 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6874 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6887 }
else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6898 splitIllegalMUBUFOffset(
B, SOffset,
Offset);
6902bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6907 if (STI.useFlatForGlobal())
6910 MUBUFAddressData AddrData = parseMUBUFAddress(Root.
getReg());
6911 if (shouldUseAddr64(AddrData))
6917 Offset = AddrData.Offset;
6923 splitIllegalMUBUFOffset(
B, SOffset,
Offset);
6928AMDGPUInstructionSelector::selectMUBUFAddr64(
MachineOperand &Root)
const {
6934 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset,
Offset))
6940 [=](MachineInstrBuilder &MIB) {
6943 [=](MachineInstrBuilder &MIB) {
6946 [=](MachineInstrBuilder &MIB) {
6949 else if (STI.hasRestrictedSOffset())
6950 MIB.
addReg(AMDGPU::SGPR_NULL);
6954 [=](MachineInstrBuilder &MIB) {
6964AMDGPUInstructionSelector::selectMUBUFOffset(
MachineOperand &Root)
const {
6969 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset,
Offset))
6973 [=](MachineInstrBuilder &MIB) {
6976 [=](MachineInstrBuilder &MIB) {
6979 else if (STI.hasRestrictedSOffset())
6980 MIB.
addReg(AMDGPU::SGPR_NULL);
6992AMDGPUInstructionSelector::selectBUFSOffset(
MachineOperand &Root)
const {
6997 SOffset = AMDGPU::SGPR_NULL;
6999 return {{[=](MachineInstrBuilder &MIB) { MIB.
addReg(SOffset); }}};
7003static std::optional<uint64_t>
7007 if (!OffsetVal || !
isInt<32>(*OffsetVal))
7008 return std::nullopt;
7009 return Lo_32(*OffsetVal);
7013AMDGPUInstructionSelector::selectSMRDBufferImm(
MachineOperand &Root)
const {
7014 std::optional<uint64_t> OffsetVal =
7019 std::optional<int64_t> EncodedImm =
7024 return {{ [=](MachineInstrBuilder &MIB) { MIB.
addImm(*EncodedImm); } }};
7028AMDGPUInstructionSelector::selectSMRDBufferImm32(
MachineOperand &Root)
const {
7035 std::optional<int64_t> EncodedImm =
7040 return {{ [=](MachineInstrBuilder &MIB) { MIB.
addImm(*EncodedImm); } }};
7044AMDGPUInstructionSelector::selectSMRDBufferSgprImm(
MachineOperand &Root)
const {
7052 return std::nullopt;
7054 std::optional<int64_t> EncodedOffset =
7057 return std::nullopt;
7060 return {{[=](MachineInstrBuilder &MIB) { MIB.
addReg(SOffset); },
7061 [=](MachineInstrBuilder &MIB) { MIB.
addImm(*EncodedOffset); }}};
7064std::pair<Register, unsigned>
7065AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(
MachineOperand &Root,
7066 bool &Matched)
const {
7071 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.
getReg());
7081 const auto CheckAbsNeg = [&]() {
7086 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
7117AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
7122 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7127 [=](MachineInstrBuilder &MIB) { MIB.
addReg(Src); },
7128 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); }
7133AMDGPUInstructionSelector::selectVOP3PMadMixMods(
MachineOperand &Root)
const {
7137 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7140 [=](MachineInstrBuilder &MIB) { MIB.
addReg(Src); },
7141 [=](MachineInstrBuilder &MIB) { MIB.
addImm(Mods); }
7145bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
7149 Register CCReg =
I.getOperand(0).getReg();
7154 BuildMI(*
MBB, &
I,
DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
7155 .
addImm(
I.getOperand(2).getImm());
7159 I.eraseFromParent();
7160 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
7164bool AMDGPUInstructionSelector::selectSGetBarrierState(
7168 const MachineOperand &BarOp =
I.getOperand(2);
7169 std::optional<int64_t> BarValImm =
7173 auto CopyMIB =
BuildMI(*
MBB, &
I,
DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7177 MachineInstrBuilder MIB;
7178 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
7179 : AMDGPU::S_GET_BARRIER_STATE_M0;
7182 auto DstReg =
I.getOperand(0).getReg();
7183 const TargetRegisterClass *DstRC =
7184 TRI.getConstrainedRegClassForOperand(
I.getOperand(0), *MRI);
7185 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7191 I.eraseFromParent();
7196 if (HasInlineConst) {
7200 case Intrinsic::amdgcn_s_barrier_join:
7201 return AMDGPU::S_BARRIER_JOIN_IMM;
7202 case Intrinsic::amdgcn_s_wakeup_barrier:
7203 return AMDGPU::S_WAKEUP_BARRIER_IMM;
7204 case Intrinsic::amdgcn_s_get_named_barrier_state:
7205 return AMDGPU::S_GET_BARRIER_STATE_IMM;
7211 case Intrinsic::amdgcn_s_barrier_join:
7212 return AMDGPU::S_BARRIER_JOIN_M0;
7213 case Intrinsic::amdgcn_s_wakeup_barrier:
7214 return AMDGPU::S_WAKEUP_BARRIER_M0;
7215 case Intrinsic::amdgcn_s_get_named_barrier_state:
7216 return AMDGPU::S_GET_BARRIER_STATE_M0;
7221bool AMDGPUInstructionSelector::selectNamedBarrierInit(
7225 const MachineOperand &BarOp =
I.getOperand(1);
7226 const MachineOperand &CntOp =
I.getOperand(2);
7230 if (IntrID == Intrinsic::amdgcn_s_barrier_signal_var) {
7231 std::optional<int64_t> CntImm =
7233 if (CntImm && *CntImm == 0) {
7234 std::optional<int64_t> BarValImm =
7237 auto BarID = ((*BarValImm) >> 4) & 0x3F;
7238 BuildMI(*
MBB, &
I,
DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
7240 I.eraseFromParent();
7247 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7253 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7260 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7266 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7267 constexpr unsigned ShAmt = 16;
7273 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7283 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7284 ? AMDGPU::S_BARRIER_INIT_M0
7285 : AMDGPU::S_BARRIER_SIGNAL_M0;
7286 MachineInstrBuilder MIB;
7289 I.eraseFromParent();
7293bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7297 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7300 std::optional<int64_t> BarValImm =
7305 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7311 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7317 auto CopyMIB =
BuildMI(*
MBB, &
I,
DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7322 MachineInstrBuilder MIB;
7326 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7327 auto DstReg =
I.getOperand(0).getReg();
7328 const TargetRegisterClass *DstRC =
7329 TRI.getConstrainedRegClassForOperand(
I.getOperand(0), *MRI);
7330 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7336 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7340 I.eraseFromParent();
7347 assert(
MI.getOpcode() == TargetOpcode::G_CONSTANT &&
OpIdx == -1 &&
7348 "Expected G_CONSTANT");
7349 MIB.
addImm(
MI.getOperand(1).getCImm()->getSExtValue());
7355 assert(
MI.getOpcode() == TargetOpcode::G_CONSTANT &&
OpIdx == -1 &&
7356 "Expected G_CONSTANT");
7357 MIB.
addImm(-
MI.getOperand(1).getCImm()->getSExtValue());
7363 const MachineOperand &
Op =
MI.getOperand(1);
7364 assert(
MI.getOpcode() == TargetOpcode::G_FCONSTANT &&
OpIdx == -1);
7365 MIB.
addImm(
Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7368void AMDGPUInstructionSelector::renderCountTrailingOnesImm(
7370 assert(
MI.getOpcode() == TargetOpcode::G_CONSTANT &&
OpIdx == -1 &&
7371 "Expected G_CONSTANT");
7372 MIB.
addImm(
MI.getOperand(1).getCImm()->getValue().countTrailingOnes());
7380 const MachineOperand &
Op =
MI.getOperand(
OpIdx);
7397 assert(
OpIdx >= 0 &&
"expected to match an immediate operand");
7401void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7403 assert(
OpIdx >= 0 &&
"expected to match an immediate operand");
7408void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7410 assert(
OpIdx >= 0 &&
"expected to match an immediate operand");
7416void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7418 assert(
OpIdx >= 0 &&
"expected to match an immediate operand");
7423void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7425 assert(
OpIdx >= 0 &&
"expected to match an immediate operand");
7431void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7433 assert(
OpIdx >= 0 &&
"expected to match an immediate operand");
7438void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7440 assert(
OpIdx >= 0 &&
"expected to match an immediate operand");
7445void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7447 assert(
OpIdx >= 0 &&
"expected to match an immediate operand");
7452void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7454 assert(
OpIdx >= 0 &&
"expected to match an immediate operand");
7463 assert(
OpIdx >= 0 &&
"expected to match an immediate operand");
7472 assert(
OpIdx >= 0 &&
"expected to match an immediate operand");
7479void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7481 assert(
OpIdx >= 0 &&
"expected to match an immediate operand");
7482 const uint32_t Cpol =
MI.getOperand(
OpIdx).getImm() &
7497 const APFloat &APF =
MI.getOperand(1).getFPImm()->getValueAPF();
7499 assert(ExpVal != INT_MIN);
7517 if (
MI.getOperand(
OpIdx).getImm())
7519 MIB.
addImm((int64_t)Mods);
7526 if (
MI.getOperand(
OpIdx).getImm())
7528 MIB.
addImm((int64_t)Mods);
7534 unsigned Val =
MI.getOperand(
OpIdx).getImm();
7542 MIB.
addImm((int64_t)Mods);
7548 uint32_t
V =
MI.getOperand(2).getImm();
7551 if (!Subtarget->hasSafeCUPrefetch())
7557void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7559 unsigned Val =
MI.getOperand(
OpIdx).getImm();
7568bool AMDGPUInstructionSelector::isInlineImmediate(
const APInt &Imm)
const {
7569 return TII.isInlineConstant(Imm);
7572bool AMDGPUInstructionSelector::isInlineImmediate(
const APFloat &Imm)
const {
7573 return TII.isInlineConstant(Imm);
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
#define GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static Register getLegalRegBank(Register NewReg, Register RootReg, const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const SIInstrInfo &TII)
static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is shift left with half bits, such as reg0:2n =G_SHL reg1:2n, CONST(n)
static bool isNoUnsignedWrap(MachineInstr *Addr)
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
static bool checkRB(Register Reg, unsigned int RBNo, const AMDGPURegisterBankInfo &RBI, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI)
static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods)
static bool isTruncHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is truncating to half, such as reg0:n = G_TRUNC reg1:2n
static Register getWaveAddress(const MachineInstr *Def)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static TypeClass isVectorOfTwoOrScalar(Register Reg, const MachineRegisterInfo &MRI)
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void addZeroImm(MachineInstrBuilder &MIB)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static bool isConstant(const MachineInstr &MI)
static bool isSameBitWidth(Register Reg1, Register Reg2, const MachineRegisterInfo &MRI)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void diagnoseUnsupportedIntrinsic(const MachineInstr &I)
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelValueTracking &ValueTracking)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
static std::pair< Register, SrcStatus > getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
static std::optional< std::pair< Register, SrcStatus > > calcNextStatus(std::pair< Register, SrcStatus > Curr, const MachineRegisterInfo &MRI)
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg, Register RootReg, const SIInstrInfo &TII, const MachineRegisterInfo &MRI)
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
static SmallVector< std::pair< Register, SrcStatus > > getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static bool isUnmergeHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is an unmerge into halves, such as reg0:n, reg1:n = G_UNMERGE_VALUES reg2:2n
static SrcStatus getNegStatus(Register Reg, SrcStatus S, const MachineRegisterInfo &MRI)
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is a logical shift right by half the bits, such as reg0:2n = G_LSHR reg1:2n,...
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
This file declares the targeting of the InstructionSelector class for AMDGPU.
AMDGPU Register Bank Select
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool isAllZeros(StringRef Arr)
Return true if the array is empty or all zeros.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
MachineInstr unsigned OpIdx
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
This is used to control valid status that current MI supports.
bool checkOptions(SrcStatus Stat) const
SearchOptions(Register Reg, const MachineRegisterInfo &MRI)
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM)
static const char * getName()
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
void setupMF(MachineFunction &MF, GISelValueTracking *VT, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
uint32_t getLDSSize() const
LLVM_READONLY int getExactLog2Abs() const
Class for arbitrary precision integers.
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
int64_t getSExtValue() const
Get sign extended value.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
Get the array size.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ FCMP_ULT
1 1 0 0 True if unordered or less than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
@ ICMP_ULT
unsigned less than
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
@ ICMP_SGE
signed greater or equal
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
@ ICMP_ULE
unsigned less or equal
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
bool isFPPredicate() const
bool isIntPredicate() const
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
LLVM_ABI DILocation * get() const
Get the underlying DILocation.
Diagnostic information for unsupported feature in backend.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
virtual void setupMF(MachineFunction &mf, GISelValueTracking *vt, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
CodeGenCoverage * CoverageInfo
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
TypeSize getValue() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void setReturnAddressIsTaken(bool s)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Helper class to build MachineInstr.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Analysis providing profile information.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static bool isGenericOpcode(unsigned Opc)
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
bool isGFX13Plus(const MCSubtargetInfo &STI)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
IndexMode
ARM Index Modes.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
operand_type_match m_Reg()
SpecificConstantMatch m_SpecificICst(const APInt &RequestedValue)
Matches a constant equal to RequestedValue.
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_SEXT > m_GSExt(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
ImplicitDefMatch m_GImplicitDef()
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
BinaryOp_match< LHS, RHS, TargetOpcode::G_ASHR, false > m_GAShr(const LHS &L, const RHS &R)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
SpecificRegisterMatch m_SpecificReg(Register RequestedReg)
Matches a register only if it is equal to RequestedReg.
BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_ANYEXT > m_GAnyExt(const SrcTy &Src)
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, TargetOpcode::G_MUL, true > m_GMul(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
NodeAddr< DefNode * > Def
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
FunctionAddr VTableAddr Value
LLVM_ABI bool isBuildVectorAllZeros(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndef=false)
Return true if the specified instruction is a G_BUILD_VECTOR or G_BUILD_VECTOR_TRUNC where all of the...
LLVM_ABI Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
LLVM_ABI MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, return it.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
LLVM_ABI std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, and underlying value Register folding away any copies.
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
constexpr RegState getUndefRegState(bool B)
@ Default
The result value is uniform if and only if all operands are uniform.
unsigned AtomicNoRetBaseOpcode
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.