27#include "llvm/IR/IntrinsicsAMDGPU.h"
29#define DEBUG_TYPE "amdgpu-regbanklegalize"
37 : MF(B.getMF()), ST(MF.getSubtarget<
GCNSubtarget>()), B(B),
38 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
39 RBLRules(RBLRules), IsWave32(ST.isWave32()),
40 SgprRB(&RBI.getRegBank(
AMDGPU::SGPRRegBankID)),
41 VgprRB(&RBI.getRegBank(
AMDGPU::VGPRRegBankID)),
42 AgprRB(&RBI.getRegBank(
AMDGPU::AGPRRegBankID)),
43 VccRB(&RBI.getRegBank(
AMDGPU::VCCRegBankID)) {}
49 "No AMDGPU RegBankLegalize rules defined for opcode",
57 "AMDGPU RegBankLegalize: none of the rules defined with "
58 "'Any' for MI's opcode matched MI",
66 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
76 if (!lower(
MI, *Mapping, WFI))
80 if (!executeInWaterfallLoop(B, WFI))
90 "Waterfall range not initialized");
107 const int OrigRangeSize = std::distance(BeginIt, EndIt);
116 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
142 MBB.addSuccessor(LoopBB);
145 B.setInsertPt(*LoopBB, LoopBB->
end());
196 auto NewEnd = BodyBB->
end();
197 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
210 auto OldVal = WaterfalledRegMap.
find(OldReg);
211 if (OldVal != WaterfalledRegMap.
end()) {
212 Op.setReg(OldVal->second);
226 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
228 unsigned NumParts = OpSize / PartSize;
234 CurrentLaneParts.
push_back(CurrentLaneReg);
236 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
237 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
238 for (
unsigned i = 0; i < NumParts; ++i) {
240 CurrentLaneParts.
push_back(UnmergeCurrLane.getReg(i));
244 for (
unsigned i = 0; i < NumParts; ++i) {
245 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
251 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
254 Op.setReg(CurrentLaneReg);
257 WaterfalledRegMap.
insert(std::pair(OldReg,
Op.getReg()));
263 MRI.createVirtualRegister({WaveRC,
LLT::scalar(IsWave32 ? 32 : 64)});
264 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
270 MRI.setSimpleHint(SavedExec, CondRegLM);
272 B.setInsertPt(*BodyBB, BodyBB->
end());
284 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
288 B.buildInstr(LMC.
MovOpc).addDef(SaveExecReg).addReg(LMC.
ExecReg);
291 B.setInsertPt(*RestoreExecBB, RestoreExecBB->
begin());
296 B.setInsertPt(*RemainderBB, RemainderBB->
begin());
301bool RegBankLegalizeHelper::splitLoad(MachineInstr &
MI,
303 MachineFunction &MF = B.getMF();
304 assert(
MI.getNumMemOperands() == 1);
305 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
307 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
309 LLT PtrTy = MRI.getType(
Base);
310 const RegisterBank *PtrRB = MRI.getRegBankOrNull(
Base);
314 unsigned ByteOffset = 0;
315 for (LLT PartTy : LLTBreakdown) {
317 if (ByteOffset == 0) {
318 BasePlusOffset =
Base;
320 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
324 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
325 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
326 LoadPartRegs.
push_back(LoadPart.getReg(0));
332 B.buildMergeLikeInstr(Dst, LoadPartRegs);
338 if (MRI.getType(
Reg) == MergeTy) {
341 auto Unmerge = B.buildUnmerge({DstRB, MergeTy},
Reg);
342 for (
unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
343 MergeTyParts.
push_back(Unmerge.getReg(i));
346 B.buildMergeLikeInstr(Dst, MergeTyParts);
348 MI.eraseFromParent();
352bool RegBankLegalizeHelper::widenLoad(MachineInstr &
MI, LLT WideTy,
354 MachineFunction &MF = B.getMF();
355 assert(
MI.getNumMemOperands() == 1);
356 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
358 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
361 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
362 auto WideLoad = B.buildLoad({DstRB, WideTy},
Base, *WideMMO);
365 B.buildTrunc(Dst, WideLoad);
368 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
370 LLT DstTy = MRI.getType(Dst);
372 for (
unsigned i = 0; i < NumElts; ++i) {
373 MergeTyParts.
push_back(Unmerge.getReg(i));
375 B.buildMergeLikeInstr(Dst, MergeTyParts);
377 MI.eraseFromParent();
381bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &
MI)
const {
384 MachineMemOperand &MMO =
MI.getMMO();
387 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
389 if (
MI.getOpcode() == G_LOAD) {
390 B.buildLoad(Dst, Ptr, *WideMMO);
392 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
394 if (
MI.getOpcode() == G_ZEXTLOAD) {
396 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
397 B.buildAnd(Dst, Load, MaskCst);
399 assert(
MI.getOpcode() == G_SEXTLOAD);
400 B.buildSExtInReg(Dst, Load, MemSize);
404 MI.eraseFromParent();
408bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &
MI) {
410 LLT Ty = MRI.getType(Dst);
412 unsigned Opc =
MI.getOpcode();
413 int TrueExtCst =
Opc == G_SEXT ? -1 : 1;
414 if (Ty == S32 || Ty == S16) {
415 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
416 auto False = B.buildConstant({VgprRB, Ty}, 0);
417 B.buildSelect(Dst, Src, True, False);
418 }
else if (Ty == S64) {
419 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
420 auto False = B.buildConstant({VgprRB_S32}, 0);
421 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
422 MachineInstrBuilder
Hi;
431 Hi = B.buildUndef({VgprRB_S32});
435 MF, MORE,
"amdgpu-regbanklegalize",
436 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported",
MI);
440 B.buildMergeValues(Dst, {
Lo.getReg(0),
Hi.getReg(0)});
443 MF, MORE,
"amdgpu-regbanklegalize",
444 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported",
MI);
448 MI.eraseFromParent();
452std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(
Register Reg) {
453 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
454 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
455 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
456 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
457 return {
Lo.getReg(0),
Hi.getReg(0)};
460std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(
Register Reg) {
461 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
462 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
463 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
464 return {
Lo.getReg(0),
Hi.getReg(0)};
467std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(
Register Reg) {
468 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
470 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
471 return {
Lo.getReg(0),
Hi.getReg(0)};
474std::pair<Register, Register>
475RegBankLegalizeHelper::unpackAExtTruncS16(
Register Reg) {
476 auto [Lo32, Hi32] = unpackAExt(
Reg);
477 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
478 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
481bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &
MI) {
483 switch (
MI.getOpcode()) {
484 case AMDGPU::G_SHL: {
485 auto [Val0, Val1] = unpackAExt(
MI.getOperand(1).getReg());
486 auto [Amt0, Amt1] = unpackAExt(
MI.getOperand(2).getReg());
487 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
488 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
491 case AMDGPU::G_LSHR: {
492 auto [Val0, Val1] = unpackZExt(
MI.getOperand(1).getReg());
493 auto [Amt0, Amt1] = unpackZExt(
MI.getOperand(2).getReg());
494 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
495 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
498 case AMDGPU::G_ASHR: {
499 auto [Val0, Val1] = unpackSExt(
MI.getOperand(1).getReg());
500 auto [Amt0, Amt1] = unpackSExt(
MI.getOperand(2).getReg());
501 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
502 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
507 MF, MORE,
"amdgpu-regbanklegalize",
508 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
512 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
513 MI.eraseFromParent();
517bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &
MI) {
519 switch (
MI.getOpcode()) {
521 case AMDGPU::G_SMAX: {
523 auto [Val0_Lo, Val0_Hi] = unpackSExt(
MI.getOperand(1).getReg());
524 auto [Val1_Lo, Val1_Hi] = unpackSExt(
MI.getOperand(2).getReg());
525 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
527 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
532 case AMDGPU::G_UMAX: {
534 auto [Val0_Lo, Val0_Hi] = unpackZExt(
MI.getOperand(1).getReg());
535 auto [Val1_Lo, Val1_Hi] = unpackZExt(
MI.getOperand(2).getReg());
536 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
538 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
544 MF, MORE,
"amdgpu-regbanklegalize",
545 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented",
MI);
548 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
549 MI.eraseFromParent();
553bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &
MI) {
554 auto [Op1Lo, Op1Hi] = unpackAExt(
MI.getOperand(1).getReg());
555 auto [Op2Lo, Op2Hi] = unpackAExt(
MI.getOperand(2).getReg());
556 auto ResLo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
557 auto ResHi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
558 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
559 {ResLo.getReg(0), ResHi.getReg(0)});
560 MI.eraseFromParent();
566 return (GI->is(Intrinsic::amdgcn_sbfe));
568 return MI.getOpcode() == AMDGPU::G_SBFX;
571bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &
MI) {
578 Register Src =
MI.getOperand(FirstOpnd).getReg();
579 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
580 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
585 unsigned SHROpc =
Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
586 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
594 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
595 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
596 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
597 MI.eraseFromParent();
601 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
602 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
603 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
604 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
605 auto Zero = B.buildConstant({VgprRB, S32}, 0);
606 unsigned BFXOpc =
Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
608 if (WidthImm <= 32) {
610 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo,
Zero, Width});
611 MachineInstrBuilder
Hi;
614 Hi = B.buildAShr(VgprRB_S32,
Lo, B.buildConstant(VgprRB_S32, 31));
619 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
621 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
623 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi,
Zero, Amt});
624 B.buildMergeLikeInstr(Dst, {SHRSrcLo,
Hi});
627 MI.eraseFromParent();
631bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &
MI) {
633 LLT Ty = MRI.getType(DstReg);
636 Register Src =
MI.getOperand(FirstOpnd).getReg();
637 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
638 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
645 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
646 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
647 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset,
Size);
648 unsigned Opc32 =
Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
649 unsigned Opc64 =
Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
650 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
654 auto S_BFE = B.buildInstr(
Opc, {{SgprRB, Ty}},
655 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
657 *ST.getRegisterInfo(), RBI);
659 B.buildCopy(DstReg,
S_BFE->getOperand(0).getReg());
660 MI.eraseFromParent();
664bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &
MI) {
666 LLT DstTy = MRI.getType(Dst);
667 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
668 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
669 auto Op1 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(1).
getReg());
670 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
671 unsigned Opc =
MI.getOpcode();
674 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)},
Flags);
676 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)},
Flags);
677 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
678 MI.eraseFromParent();
682bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &
MI) {
684 assert(MRI.getType(Dst) == S64);
685 auto Op1 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(1).
getReg());
686 auto Op2 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(2).
getReg());
690 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
691 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
692 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
693 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
694 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
695 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
697 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
698 MI.eraseFromParent();
702bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &
MI) {
704 assert(MRI.getType(Dst) == V2S16);
705 unsigned Opc =
MI.getOpcode();
706 unsigned NumOps =
MI.getNumOperands();
709 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(
MI.getOperand(1).getReg());
712 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo},
Flags);
713 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi},
Flags);
714 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
715 MI.eraseFromParent();
719 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(
MI.getOperand(2).getReg());
722 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo},
Flags);
723 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi},
Flags);
724 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
725 MI.eraseFromParent();
730 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(
MI.getOperand(3).getReg());
731 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo},
Flags);
732 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi},
Flags);
733 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
734 MI.eraseFromParent();
738bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &
MI) {
745 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
748 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
749 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
750 if (ST.hasScalarMulHiInsts()) {
751 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
753 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
754 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
755 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
766 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
767 B.buildConstant(Dst1, 0);
770 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
771 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
772 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
774 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
776 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
777 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
778 B.buildCopy(Dst1, AddHi.getReg(1));
781 MI.eraseFromParent();
785bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &
MI) {
787 LLT DstTy = MRI.getType(Dst);
788 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
790 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
791 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
792 auto Op3 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(3).
getReg());
796 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(0), Op3.getReg(0), Flags);
798 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(1), Op3.getReg(1), Flags);
800 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
801 MI.eraseFromParent();
805bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &
MI) {
806 auto Op1 = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
807 int Amt =
MI.getOperand(2).getImm();
811 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
814 Lo = Freeze.getReg(0);
817 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
820 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
821 Hi = B.buildAShr(VgprRB_S32,
Lo, SignExtCst).getReg(0);
825 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
828 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(), {Lo, Hi});
829 MI.eraseFromParent();
833bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &
MI) {
839 unsigned Opc =
MI.getOpcode();
848 case AMDGPU::G_AMDGPU_FFBH_U32:
850 AddOpc = AMDGPU::G_UADDSAT;
851 SearchFromMSB =
true;
853 case AMDGPU::G_AMDGPU_FFBL_B32:
855 AddOpc = AMDGPU::G_UADDSAT;
856 SearchFromMSB =
false;
858 case AMDGPU::G_CTLZ_ZERO_POISON:
859 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
860 AddOpc = AMDGPU::G_ADD;
861 SearchFromMSB =
true;
863 case AMDGPU::G_CTTZ_ZERO_POISON:
864 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
865 AddOpc = AMDGPU::G_ADD;
866 SearchFromMSB =
false;
872 auto Unmerge = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
879 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Hi :
Lo});
881 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Lo :
Hi});
883 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
884 {Secondary, B.buildConstant(VgprRB_S32, 32)});
885 B.buildUMin(
MI.getOperand(0).getReg(), Primary, Adjusted);
887 MI.eraseFromParent();
891bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &
MI) {
903 LLT VecTy = MRI.getType(Src);
906 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
908 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
911 Register PrevSelect = Unmerge.getReg(0);
912 for (
unsigned I = 1;
I < NumElts; ++
I) {
913 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
916 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(
I), PrevSelect)
919 B.buildCopy(Dst, PrevSelect);
921 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
922 Register PrevLo = InitUnmerge.getReg(0);
923 Register PrevHi = InitUnmerge.getReg(1);
924 for (
unsigned I = 1;
I < NumElts; ++
I) {
925 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
927 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(
I));
928 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
930 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
933 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
936 MF, MORE,
"amdgpu-regbanklegalize",
937 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type",
MI);
941 MI.eraseFromParent();
945bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &
MI) {
958 LLT SrcTy = MRI.getType(Src);
961 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
962 "expected VGPR src and SGPR idx");
964 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
967 auto One = B.buildConstant(SgprRB_S32, 1);
968 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
969 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
971 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
972 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
974 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
976 MI.eraseFromParent();
980bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &
MI) {
993 LLT VecTy = MRI.getType(Src);
996 const RegisterBank *SrcRB = MRI.getRegBank(Src);
997 bool IsSGPR = (SrcRB == SgprRB);
998 SmallVector<Register, 16> Selects;
1002 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
1003 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1004 Register EltLo = EltUnmerge.getReg(0);
1005 Register EltHi = EltUnmerge.getReg(1);
1006 for (
unsigned I = 0;
I < NumElts; ++
I) {
1007 auto IdxConst = B.buildConstant(VgprRB_S32,
I);
1010 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 *
I))
1013 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 *
I + 1))
1017 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1018 B.buildBitcast(Dst, Vec32);
1021 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1022 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1023 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1024 for (
unsigned I = 0;
I < NumElts; ++
I) {
1025 auto IdxConst = B.buildConstant(SgprRB_S32,
I);
1028 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(
I)).getReg(0));
1030 B.buildMergeLikeInstr(Dst, Selects);
1033 MF, MORE,
"amdgpu-regbanklegalize",
1034 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type",
MI);
1038 MI.eraseFromParent();
1042bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &
MI) {
1057 LLT SrcTy = MRI.getType(Src);
1060 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1061 "expected VGPR src and SGPR idx");
1063 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1065 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1066 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1069 auto One = B.buildConstant(SgprRB_S32, 1);
1070 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1071 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1073 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1074 EltUnmerge.getReg(0), IdxLo);
1075 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1076 EltUnmerge.getReg(1), IdxHi);
1078 B.buildBitcast(Dst, InsHi);
1080 MI.eraseFromParent();
1084bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &
MI) {
1094 LLT Ty = MRI.getType(DstReg);
1100 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).
getReg(0);
1102 assert((Ty == S32 || Ty == S16) &&
"unexpected type for AbsToNegMax");
1103 Zero = B.buildConstant({VgprRB, Ty}, 0).
getReg(0);
1106 auto Neg = B.buildSub({VgprRB, Ty},
Zero, SrcReg);
1107 B.buildSMax(DstReg, SrcReg, Neg);
1108 MI.eraseFromParent();
1112bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &
MI) {
1122 auto Bitcast = B.buildBitcast({SgprRB_S32},
MI.getOperand(1).
getReg());
1123 auto SextInReg = B.buildSExtInReg({SgprRB_S32},
Bitcast, 16);
1125 B.buildAShr({SgprRB_S32},
Bitcast, B.buildConstant({SgprRB_S32}, 16));
1127 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1128 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1129 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
1130 {AbsLo.getReg(0), AbsHi.getReg(0)});
1132 MI.eraseFromParent();
1136bool RegBankLegalizeHelper::lower(MachineInstr &
MI,
1144 return lowerVccExtToSel(
MI);
1146 LLT Ty = MRI.getType(
MI.getOperand(0).getReg());
1147 auto True = B.buildConstant({SgprRB, Ty},
1148 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1149 auto False = B.buildConstant({SgprRB, Ty}, 0);
1153 B.buildSelect(
MI.getOperand(0).getReg(),
MI.getOperand(1).getReg(), True,
1155 MI.eraseFromParent();
1159 return lowerUnpackBitShift(
MI);
1161 return lowerUnpackMinMax(
MI);
1163 return lowerSplitTo16(
MI);
1165 const RegisterBank *RB = MRI.getRegBank(
MI.getOperand(0).getReg());
1166 MachineInstrBuilder
Hi;
1167 switch (
MI.getOpcode()) {
1168 case AMDGPU::G_ZEXT: {
1169 Hi = B.buildConstant({RB, S32}, 0);
1172 case AMDGPU::G_SEXT: {
1174 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1175 Hi = B.buildAShr({RB, S32},
MI.getOperand(1).
getReg(), ShiftAmt);
1178 case AMDGPU::G_ANYEXT: {
1179 Hi = B.buildUndef({RB, S32});
1184 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1189 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(),
1190 {MI.getOperand(1).getReg(), Hi});
1191 MI.eraseFromParent();
1195 uint64_t ConstVal =
MI.getOperand(1).getCImm()->getZExtValue();
1196 B.buildConstant(
MI.getOperand(0).getReg(), ConstVal);
1198 MI.eraseFromParent();
1203 LLT Ty = MRI.getType(Src);
1207 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1209 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1210 auto One = B.buildConstant(VgprRB_S32, 1);
1211 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1212 auto Zero = B.buildConstant(VgprRB_S32, 0);
1213 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1214 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1216 assert(Ty == S32 || Ty == S16);
1217 auto One = B.buildConstant({VgprRB, Ty}, 1);
1218 B.buildAnd(BoolSrc, Src, One);
1220 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1222 MI.eraseFromParent();
1226 return lowerV_BFE(
MI);
1228 return lowerS_BFE(
MI);
1230 return lowerUniMAD64(
MI);
1232 B.buildMul(
MI.getOperand(0),
MI.getOperand(1),
MI.getOperand(2));
1233 MI.eraseFromParent();
1237 auto Op1 = B.buildTrunc(VgprRB_S32,
MI.getOperand(1));
1238 auto Op2 = B.buildTrunc(VgprRB_S32,
MI.getOperand(2));
1239 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1241 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1242 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1243 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1245 B.buildInstr(NewOpc, {
MI.getOperand(0).getReg(), {SgprRB, S32}},
1247 MI.eraseFromParent();
1251 return lowerSplitTo32(
MI);
1253 return lowerSplitTo32Mul(
MI);
1255 return lowerSplitTo32Select(
MI);
1257 return lowerSplitTo32SExtInReg(
MI);
1259 auto Unmerge = B.buildUnmerge({VgprRB, S32},
MI.getOperand(1).
getReg());
1260 auto LoPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(0));
1261 auto HiPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(1));
1263 B.buildAdd(
MI.getOperand(0).getReg(), LoPopCnt, HiPopCnt,
1266 MI.eraseFromParent();
1270 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1281 if (
Size / 128 == 2)
1283 else if (
Size / 128 == 4)
1287 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1293 else if (DstTy == S96)
1294 splitLoad(
MI, {S64, S32}, S32);
1295 else if (DstTy == V3S32)
1296 splitLoad(
MI, {V2S32, S32}, S32);
1297 else if (DstTy == V6S16)
1298 splitLoad(
MI, {V4S16, V2S16}, V2S16);
1301 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1308 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1310 widenLoad(
MI, S128);
1311 else if (DstTy == V3S32)
1312 widenLoad(
MI, V4S32, S32);
1313 else if (DstTy == V6S16)
1314 widenLoad(
MI, V8S16, V2S16);
1317 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1324 return lowerUnpackAExt(
MI);
1329 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1335 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1337 B.setInstrAndDebugLoc(
MI);
1338 for (
unsigned i =
MI.getNumDefs(); i <
MI.getNumOperands(); ++i) {
1339 MachineOperand &
Op =
MI.getOperand(i);
1343 if (MRI.getRegBank(
Reg) != VgprRB) {
1344 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
1345 Op.setReg(
Copy.getReg(0));
1355 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1360 B.setInstrAndDebugLoc(
MI);
1363 B.buildUnmerge({SgprRB, V2S16}, Unmerge->
getSourceReg());
1364 for (
unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1365 auto [Dst0S32, Dst1S32] =
1366 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1367 B.buildTrunc(
MI.getOperand(i * 2).getReg(), Dst0S32);
1368 B.buildTrunc(
MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1371 auto [Dst0S32, Dst1S32] = unpackAExt(
MI.getOperand(2).getReg());
1372 B.buildTrunc(
MI.getOperand(0).getReg(), Dst0S32);
1373 B.buildTrunc(
MI.getOperand(1).getReg(), Dst1S32);
1376 MI.eraseFromParent();
1381 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1382 B.setInsertPt(*
MI.getParent(),
MI.getParent()->getFirstNonPHI());
1383 MI.getOperand(0).setReg(NewDst);
1384 B.buildTrunc(Dst, NewDst);
1386 for (
unsigned i = 1; i <
MI.getNumOperands(); i += 2) {
1394 auto NewUse = B.buildAnyExt(SgprRB_S32,
UseReg);
1395 MI.getOperand(i).setReg(NewUse.getReg(0));
1403 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1408 assert(MRI.getRegBankOrNull(
MI.getOperand(0).getReg()) == VgprRB);
1412 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1413 return RB == VgprRB || RB == SgprRB;
1418 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1423 unsigned RsrcIdx = RSrcIntrin->
RsrcArg +
MI.getNumExplicitDefs() + 1;
1424 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1430 unsigned RsrcIdx =
MI.getNumOperands();
1431 while (RsrcIdx-- >
MI.getNumExplicitDefs()) {
1432 const MachineOperand &
Op =
MI.getOperand(RsrcIdx);
1433 if (
Op.isReg() &&
Op.getReg().isVirtual())
1436 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1439 return lowerSplitBitCount64To32(
MI);
1441 return lowerExtrVecEltToSel(
MI);
1443 return lowerExtrVecEltTo32(
MI);
1445 return lowerInsVecEltToSel(
MI);
1447 return lowerInsVecEltTo32(
MI);
1449 return lowerAbsToNegMax(
MI);
1451 return lowerAbsToS32(
MI);
1571 return isAnyPtr(Ty, 32) ? Ty : LLT();
1574 return isAnyPtr(Ty, 64) ? Ty : LLT();
1577 return isAnyPtr(Ty, 128) ? Ty : LLT();
1621 const SIRegisterInfo *
TRI =
1622 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1624 if (LLTSize >= 32 &&
TRI->getSGPRClassForBitWidth(LLTSize))
1629 const SIRegisterInfo *
TRI =
1630 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1751bool RegBankLegalizeHelper::applyMappingDst(
1752 MachineInstr &
MI,
unsigned &
OpIdx,
1753 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1758 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1760 LLT Ty = MRI.getType(
Reg);
1761 [[maybe_unused]]
const RegisterBank *RB = MRI.getRegBank(
Reg);
1763 switch (MethodIDs[
OpIdx]) {
1839 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
1840 Op.setReg(NewAgprDst);
1841 if (!MRI.use_nodbg_empty(
Reg))
1842 B.buildCopy(
Reg, NewAgprDst);
1849 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1851 if (!MRI.use_empty(
Reg)) {
1853 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1854 B.buildTrunc(
Reg, CopyS32_Vcc);
1861 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1862 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1863 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1864 Op.setReg(NewVgprDstS16);
1865 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1867 B.buildTrunc(
Reg, NewSgprDstS32);
1886 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1887 Op.setReg(NewVgprDst);
1900 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1901 Op.setReg(NewVgprDst);
1909 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1911 if (!MRI.use_empty(
Reg))
1912 B.buildTrunc(
Reg, NewDst);
1919 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
1920 B.buildCopy(
Reg,
Op.getReg());
1925 MF, MORE,
"amdgpu-regbanklegalize",
1926 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for",
MI);
1931 MF, MORE,
"amdgpu-regbanklegalize",
1932 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported",
MI);
1940bool RegBankLegalizeHelper::applyMappingSrc(
1941 MachineInstr &
MI,
unsigned &
OpIdx,
1942 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1944 for (
unsigned i = 0; i < MethodIDs.
size(); ++
OpIdx, ++i) {
1945 if (MethodIDs[i] ==
None || MethodIDs[i] ==
IntrId || MethodIDs[i] ==
Imm)
1948 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1950 LLT Ty = MRI.getType(
Reg);
1951 const RegisterBank *RB = MRI.getRegBank(
Reg);
1953 switch (MethodIDs[i]) {
1956 assert(RB == VccRB || RB == SgprRB);
1958 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
1960 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1961 Op.setReg(CopyVcc_Scc.getReg(0));
1979 assert(Ty == getTyFromID(MethodIDs[i]));
1980 assert(RB == getRegBankFromID(MethodIDs[i]));
1994 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1995 assert(RB == getRegBankFromID(MethodIDs[i]));
2022 assert(Ty == getTyFromID(MethodIDs[i]));
2024 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2025 Op.setReg(CopyToVgpr.getReg(0));
2041 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2043 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2044 Op.setReg(CopyToVgpr.getReg(0));
2050 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2051 Op.setReg(CopyToVgpr.getReg(0));
2057 auto CopyToAgpr = B.buildCopy({AgprRB, Ty},
Reg);
2058 Op.setReg(CopyToAgpr.getReg(0));
2065 assert(Ty == getTyFromID(MethodIDs[i]));
2070 WFI.
End = std::next(
MI.getIterator());
2077 assert(Ty == getTyFromID(MethodIDs[i]));
2083 while (
Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2088 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2100 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2104 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2110 assert(Ty == getTyFromID(MethodIDs[i]));
2114 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2124 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2125 Op.setReg(Aext.getReg(0));
2132 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2135 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2136 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2137 Op.setReg(BoolInReg.getReg(0));
2143 auto Sext = B.buildSExt(SgprRB_S32,
Reg);
2144 Op.setReg(Sext.getReg(0));
2150 auto Zext = B.buildZExt({SgprRB, S32},
Reg);
2151 Op.setReg(Zext.getReg(0));
2157 auto Aext = B.buildAnyExt({VgprRB, S32},
Reg);
2158 Op.setReg(Aext.getReg(0));
2165 auto Sext = B.buildSExt({VgprRB, S32},
Reg);
2166 Op.setReg(Sext.getReg(0));
2173 auto Zext = B.buildZExt({VgprRB, S32},
Reg);
2174 Op.setReg(Zext.getReg(0));
2179 MF, MORE,
"amdgpu-regbanklegalize",
2180 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported",
MI);
2190 unsigned StartOpIdx,
2191 unsigned EndOpIdx) {
2192 for (
unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2199bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2200 MachineInstr &
MI,
unsigned RsrcIdx) {
2201 const unsigned NumDefs =
MI.getNumExplicitDefs();
2203 MachineBasicBlock *
MBB =
MI.getParent();
2207 for (
unsigned i = 0; i < NumDefs; ++i) {
2209 if (MRI.getRegBank(
Reg) == VgprRB)
2212 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(
Reg)});
2213 MI.getOperand(i).setReg(NewVgprDst);
2217 B.setInstrAndDebugLoc(
MI);
2220 for (
unsigned i = NumDefs; i < RsrcIdx; ++i) {
2221 MachineOperand &
Op =
MI.getOperand(i);
2229 if (MRI.getRegBank(
Reg) == VgprRB)
2232 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
2233 Op.setReg(
Copy.getReg(0));
2236 SmallSet<Register, 4> OpsToWaterfall;
2239 for (
unsigned i = RsrcIdx; i <
MI.getNumOperands(); ++i) {
2240 MachineOperand &
Op =
MI.getOperand(i);
2245 if (MRI.getRegBank(
Reg) != SgprRB)
2249 if (!OpsToWaterfall.
empty()) {
2251 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
const SmallVectorImpl< MachineOperand > & Cond
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned MovTermOpc
const unsigned AndSaveExecOpc
bool findRuleAndApplyMapping(MachineInstr &MI)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
bool isValid() const
Check for null.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank * getRegBank(Register Reg) const
Return the register bank of Reg.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ SgprV4S32_ReadFirstLane
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
@ VerifyAllSgprOrVgprGPHI
@ AextToS32InIncomingBlockGPHI
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LoweringMethodID LoweringMethod
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs
MachineBasicBlock::iterator End