27#include "llvm/IR/IntrinsicsAMDGPU.h"
29#define DEBUG_TYPE "amdgpu-regbanklegalize"
37 : MF(B.getMF()), ST(MF.getSubtarget<
GCNSubtarget>()), B(B),
38 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
39 RBLRules(RBLRules), IsWave32(ST.isWave32()),
40 SgprRB(&RBI.getRegBank(
AMDGPU::SGPRRegBankID)),
41 VgprRB(&RBI.getRegBank(
AMDGPU::VGPRRegBankID)),
42 AgprRB(&RBI.getRegBank(
AMDGPU::AGPRRegBankID)),
43 VccRB(&RBI.getRegBank(
AMDGPU::VCCRegBankID)) {}
49 "No AMDGPU RegBankLegalize rules defined for opcode",
57 "AMDGPU RegBankLegalize: none of the rules defined with "
58 "'Any' for MI's opcode matched MI",
66 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
76 if (!lower(
MI, *Mapping, WFI))
80 if (!executeInWaterfallLoop(B, WFI))
90 "Waterfall range not initialized");
107 const int OrigRangeSize = std::distance(BeginIt, EndIt);
116 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
142 MBB.addSuccessor(LoopBB);
145 B.setInsertPt(*LoopBB, LoopBB->
end());
196 auto NewEnd = BodyBB->
end();
197 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
210 auto OldVal = WaterfalledRegMap.
find(OldReg);
211 if (OldVal != WaterfalledRegMap.
end()) {
212 Op.setReg(OldVal->second);
226 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
228 unsigned NumParts = OpSize / PartSize;
234 CurrentLaneParts.
push_back(CurrentLaneReg);
236 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
237 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
238 for (
unsigned i = 0; i < NumParts; ++i) {
240 CurrentLaneParts.
push_back(UnmergeCurrLane.getReg(i));
244 for (
unsigned i = 0; i < NumParts; ++i) {
245 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
251 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
254 Op.setReg(CurrentLaneReg);
257 WaterfalledRegMap.
insert(std::pair(OldReg,
Op.getReg()));
263 MRI.createVirtualRegister({WaveRC,
LLT::scalar(IsWave32 ? 32 : 64)});
264 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
270 MRI.setSimpleHint(SavedExec, CondRegLM);
272 B.setInsertPt(*BodyBB, BodyBB->
end());
284 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
288 B.buildInstr(LMC.
MovOpc).addDef(SaveExecReg).addReg(LMC.
ExecReg);
291 B.setInsertPt(*RestoreExecBB, RestoreExecBB->
begin());
296 B.setInsertPt(*RemainderBB, RemainderBB->
begin());
301bool RegBankLegalizeHelper::splitLoad(MachineInstr &
MI,
303 MachineFunction &MF = B.getMF();
304 assert(
MI.getNumMemOperands() == 1);
305 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
307 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
309 LLT PtrTy = MRI.getType(
Base);
310 const RegisterBank *PtrRB = MRI.getRegBankOrNull(
Base);
314 unsigned ByteOffset = 0;
315 for (LLT PartTy : LLTBreakdown) {
317 if (ByteOffset == 0) {
318 BasePlusOffset =
Base;
320 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
324 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
325 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
326 LoadPartRegs.
push_back(LoadPart.getReg(0));
332 B.buildMergeLikeInstr(Dst, LoadPartRegs);
338 if (MRI.getType(
Reg) == MergeTy) {
341 auto Unmerge = B.buildUnmerge({DstRB, MergeTy},
Reg);
342 for (
unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
343 MergeTyParts.
push_back(Unmerge.getReg(i));
346 B.buildMergeLikeInstr(Dst, MergeTyParts);
348 MI.eraseFromParent();
352bool RegBankLegalizeHelper::widenLoad(MachineInstr &
MI, LLT WideTy,
354 MachineFunction &MF = B.getMF();
355 assert(
MI.getNumMemOperands() == 1);
356 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
358 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
361 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
362 auto WideLoad = B.buildLoad({DstRB, WideTy},
Base, *WideMMO);
365 B.buildTrunc(Dst, WideLoad);
368 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
370 LLT DstTy = MRI.getType(Dst);
372 for (
unsigned i = 0; i < NumElts; ++i) {
373 MergeTyParts.
push_back(Unmerge.getReg(i));
375 B.buildMergeLikeInstr(Dst, MergeTyParts);
377 MI.eraseFromParent();
381bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &
MI)
const {
384 MachineMemOperand &MMO =
MI.getMMO();
387 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
389 if (
MI.getOpcode() == G_LOAD) {
390 B.buildLoad(Dst, Ptr, *WideMMO);
392 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
394 if (
MI.getOpcode() == G_ZEXTLOAD) {
396 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
397 B.buildAnd(Dst, Load, MaskCst);
399 assert(
MI.getOpcode() == G_SEXTLOAD);
400 B.buildSExtInReg(Dst, Load, MemSize);
404 MI.eraseFromParent();
408bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &
MI) {
410 LLT Ty = MRI.getType(Dst);
412 unsigned Opc =
MI.getOpcode();
413 int TrueExtCst =
Opc == G_SEXT ? -1 : 1;
414 if (Ty == S32 || Ty == S16) {
415 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
416 auto False = B.buildConstant({VgprRB, Ty}, 0);
417 B.buildSelect(Dst, Src, True, False);
418 }
else if (Ty == S64) {
419 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
420 auto False = B.buildConstant({VgprRB_S32}, 0);
421 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
422 MachineInstrBuilder
Hi;
431 Hi = B.buildUndef({VgprRB_S32});
435 MF, MORE,
"amdgpu-regbanklegalize",
436 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported",
MI);
440 B.buildMergeValues(Dst, {
Lo.getReg(0),
Hi.getReg(0)});
443 MF, MORE,
"amdgpu-regbanklegalize",
444 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported",
MI);
448 MI.eraseFromParent();
452std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(
Register Reg) {
453 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
454 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
455 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
456 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
457 return {
Lo.getReg(0),
Hi.getReg(0)};
460std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(
Register Reg) {
461 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
462 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
463 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
464 return {
Lo.getReg(0),
Hi.getReg(0)};
467std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(
Register Reg) {
468 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
470 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
471 return {
Lo.getReg(0),
Hi.getReg(0)};
474std::pair<Register, Register>
475RegBankLegalizeHelper::unpackAExtTruncS16(
Register Reg) {
476 auto [Lo32, Hi32] = unpackAExt(
Reg);
477 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
478 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
481bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &
MI) {
483 switch (
MI.getOpcode()) {
484 case AMDGPU::G_SHL: {
485 auto [Val0, Val1] = unpackAExt(
MI.getOperand(1).getReg());
486 auto [Amt0, Amt1] = unpackAExt(
MI.getOperand(2).getReg());
487 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
488 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
491 case AMDGPU::G_LSHR: {
492 auto [Val0, Val1] = unpackZExt(
MI.getOperand(1).getReg());
493 auto [Amt0, Amt1] = unpackZExt(
MI.getOperand(2).getReg());
494 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
495 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
498 case AMDGPU::G_ASHR: {
499 auto [Val0, Val1] = unpackSExt(
MI.getOperand(1).getReg());
500 auto [Amt0, Amt1] = unpackSExt(
MI.getOperand(2).getReg());
501 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
502 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
507 MF, MORE,
"amdgpu-regbanklegalize",
508 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
512 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
513 MI.eraseFromParent();
517bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &
MI) {
519 switch (
MI.getOpcode()) {
521 case AMDGPU::G_SMAX: {
523 auto [Val0_Lo, Val0_Hi] = unpackSExt(
MI.getOperand(1).getReg());
524 auto [Val1_Lo, Val1_Hi] = unpackSExt(
MI.getOperand(2).getReg());
525 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
527 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
532 case AMDGPU::G_UMAX: {
534 auto [Val0_Lo, Val0_Hi] = unpackZExt(
MI.getOperand(1).getReg());
535 auto [Val1_Lo, Val1_Hi] = unpackZExt(
MI.getOperand(2).getReg());
536 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
538 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
544 MF, MORE,
"amdgpu-regbanklegalize",
545 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented",
MI);
548 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
549 MI.eraseFromParent();
553bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &
MI) {
554 auto [Op1Lo, Op1Hi] = unpackAExt(
MI.getOperand(1).getReg());
555 auto [Op2Lo, Op2Hi] = unpackAExt(
MI.getOperand(2).getReg());
556 auto ResLo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
557 auto ResHi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
558 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
559 {ResLo.getReg(0), ResHi.getReg(0)});
560 MI.eraseFromParent();
566 return (GI->is(Intrinsic::amdgcn_sbfe));
568 return MI.getOpcode() == AMDGPU::G_SBFX;
571bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &
MI) {
578 Register Src =
MI.getOperand(FirstOpnd).getReg();
579 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
580 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
585 unsigned SHROpc =
Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
586 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
594 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
595 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
596 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
597 MI.eraseFromParent();
601 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
602 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
603 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
604 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
605 auto Zero = B.buildConstant({VgprRB, S32}, 0);
606 unsigned BFXOpc =
Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
608 if (WidthImm <= 32) {
610 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo,
Zero, Width});
611 MachineInstrBuilder
Hi;
614 Hi = B.buildAShr(VgprRB_S32,
Lo, B.buildConstant(VgprRB_S32, 31));
619 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
621 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
623 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi,
Zero, Amt});
624 B.buildMergeLikeInstr(Dst, {SHRSrcLo,
Hi});
627 MI.eraseFromParent();
631bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &
MI) {
633 LLT Ty = MRI.getType(DstReg);
636 Register Src =
MI.getOperand(FirstOpnd).getReg();
637 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
638 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
645 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
646 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
647 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset,
Size);
648 unsigned Opc32 =
Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
649 unsigned Opc64 =
Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
650 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
654 auto S_BFE = B.buildInstr(
Opc, {{SgprRB, Ty}},
655 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
657 *ST.getRegisterInfo(), RBI);
659 B.buildCopy(DstReg,
S_BFE->getOperand(0).getReg());
660 MI.eraseFromParent();
664bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &
MI) {
666 LLT DstTy = MRI.getType(Dst);
667 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
668 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
669 auto Op1 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(1).
getReg());
670 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
671 unsigned Opc =
MI.getOpcode();
674 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)},
Flags);
676 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)},
Flags);
677 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
678 MI.eraseFromParent();
682bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &
MI) {
684 assert(MRI.getType(Dst) == S64);
685 auto Op1 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(1).
getReg());
686 auto Op2 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(2).
getReg());
690 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
691 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
692 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
693 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
694 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
695 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
697 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
698 MI.eraseFromParent();
702bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &
MI) {
704 assert(MRI.getType(Dst) == V2S16);
705 unsigned Opc =
MI.getOpcode();
706 unsigned NumOps =
MI.getNumOperands();
709 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(
MI.getOperand(1).getReg());
712 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo},
Flags);
713 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi},
Flags);
714 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
715 MI.eraseFromParent();
719 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(
MI.getOperand(2).getReg());
722 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo},
Flags);
723 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi},
Flags);
724 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
725 MI.eraseFromParent();
730 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(
MI.getOperand(3).getReg());
731 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo},
Flags);
732 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi},
Flags);
733 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
734 MI.eraseFromParent();
738bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &
MI) {
745 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
748 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
749 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
750 if (ST.hasScalarMulHiInsts()) {
751 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
753 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
754 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
755 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
766 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
767 B.buildConstant(Dst1, 0);
770 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
771 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
772 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
774 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
776 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
777 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
778 B.buildCopy(Dst1, AddHi.getReg(1));
781 MI.eraseFromParent();
785bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &
MI) {
787 LLT DstTy = MRI.getType(Dst);
788 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
790 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
791 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
792 auto Op3 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(3).
getReg());
796 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(0), Op3.getReg(0), Flags);
798 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(1), Op3.getReg(1), Flags);
800 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
801 MI.eraseFromParent();
805bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &
MI) {
806 auto Op1 = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
807 int Amt =
MI.getOperand(2).getImm();
811 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
814 Lo = Freeze.getReg(0);
817 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
820 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
821 Hi = B.buildAShr(VgprRB_S32,
Lo, SignExtCst).getReg(0);
825 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
828 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(), {Lo, Hi});
829 MI.eraseFromParent();
833bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &
MI) {
839 unsigned Opc =
MI.getOpcode();
848 case AMDGPU::G_AMDGPU_FFBH_U32:
850 AddOpc = AMDGPU::G_UADDSAT;
851 SearchFromMSB =
true;
853 case AMDGPU::G_AMDGPU_FFBL_B32:
855 AddOpc = AMDGPU::G_UADDSAT;
856 SearchFromMSB =
false;
858 case AMDGPU::G_CTLZ_ZERO_POISON:
859 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
860 AddOpc = AMDGPU::G_ADD;
861 SearchFromMSB =
true;
863 case AMDGPU::G_CTTZ_ZERO_POISON:
864 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
865 AddOpc = AMDGPU::G_ADD;
866 SearchFromMSB =
false;
872 auto Unmerge = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
879 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Hi :
Lo});
881 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Lo :
Hi});
883 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
884 {Secondary, B.buildConstant(VgprRB_S32, 32)});
885 B.buildUMin(
MI.getOperand(0).getReg(), Primary, Adjusted);
887 MI.eraseFromParent();
891bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &
MI) {
903 LLT VecTy = MRI.getType(Src);
906 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
908 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
911 Register PrevSelect = Unmerge.getReg(0);
912 for (
unsigned I = 1;
I < NumElts; ++
I) {
913 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
916 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(
I), PrevSelect)
919 B.buildCopy(Dst, PrevSelect);
921 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
922 Register PrevLo = InitUnmerge.getReg(0);
923 Register PrevHi = InitUnmerge.getReg(1);
924 for (
unsigned I = 1;
I < NumElts; ++
I) {
925 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
927 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(
I));
928 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
930 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
933 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
936 MF, MORE,
"amdgpu-regbanklegalize",
937 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type",
MI);
941 MI.eraseFromParent();
945bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &
MI) {
958 LLT SrcTy = MRI.getType(Src);
961 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
962 "expected VGPR src and SGPR idx");
964 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
967 auto One = B.buildConstant(SgprRB_S32, 1);
968 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
969 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
971 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
972 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
974 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
976 MI.eraseFromParent();
980bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &
MI) {
993 LLT VecTy = MRI.getType(Src);
996 const RegisterBank *SrcRB = MRI.getRegBank(Src);
997 bool IsSGPR = (SrcRB == SgprRB);
998 SmallVector<Register, 16> Selects;
1002 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
1003 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1004 Register EltLo = EltUnmerge.getReg(0);
1005 Register EltHi = EltUnmerge.getReg(1);
1006 for (
unsigned I = 0;
I < NumElts; ++
I) {
1007 auto IdxConst = B.buildConstant(VgprRB_S32,
I);
1010 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 *
I))
1013 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 *
I + 1))
1017 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1018 B.buildBitcast(Dst, Vec32);
1021 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1022 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1023 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1024 for (
unsigned I = 0;
I < NumElts; ++
I) {
1025 auto IdxConst = B.buildConstant(SgprRB_S32,
I);
1028 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(
I)).getReg(0));
1030 B.buildMergeLikeInstr(Dst, Selects);
1033 MF, MORE,
"amdgpu-regbanklegalize",
1034 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type",
MI);
1038 MI.eraseFromParent();
1042bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &
MI) {
1057 LLT SrcTy = MRI.getType(Src);
1060 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1061 "expected VGPR src and SGPR idx");
1063 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1065 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1066 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1069 auto One = B.buildConstant(SgprRB_S32, 1);
1070 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1071 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1073 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1074 EltUnmerge.getReg(0), IdxLo);
1075 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1076 EltUnmerge.getReg(1), IdxHi);
1078 B.buildBitcast(Dst, InsHi);
1080 MI.eraseFromParent();
1084bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &
MI) {
1094 LLT Ty = MRI.getType(DstReg);
1100 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).
getReg(0);
1102 assert((Ty == S32 || Ty == S16) &&
"unexpected type for AbsToNegMax");
1103 Zero = B.buildConstant({VgprRB, Ty}, 0).
getReg(0);
1106 auto Neg = B.buildSub({VgprRB, Ty},
Zero, SrcReg);
1107 B.buildSMax(DstReg, SrcReg, Neg);
1108 MI.eraseFromParent();
1112bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &
MI) {
1122 auto Bitcast = B.buildBitcast({SgprRB_S32},
MI.getOperand(1).
getReg());
1123 auto SextInReg = B.buildSExtInReg({SgprRB_S32},
Bitcast, 16);
1125 B.buildAShr({SgprRB_S32},
Bitcast, B.buildConstant({SgprRB_S32}, 16));
1127 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1128 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1129 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
1130 {AbsLo.getReg(0), AbsHi.getReg(0)});
1132 MI.eraseFromParent();
1136bool RegBankLegalizeHelper::lower(MachineInstr &
MI,
1144 return lowerVccExtToSel(
MI);
1146 LLT Ty = MRI.getType(
MI.getOperand(0).getReg());
1147 auto True = B.buildConstant({SgprRB, Ty},
1148 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1149 auto False = B.buildConstant({SgprRB, Ty}, 0);
1153 B.buildSelect(
MI.getOperand(0).getReg(),
MI.getOperand(1).getReg(), True,
1155 MI.eraseFromParent();
1159 return lowerUnpackBitShift(
MI);
1161 return lowerUnpackMinMax(
MI);
1163 return lowerSplitTo16(
MI);
1165 const RegisterBank *RB = MRI.getRegBank(
MI.getOperand(0).getReg());
1166 MachineInstrBuilder
Hi;
1167 switch (
MI.getOpcode()) {
1168 case AMDGPU::G_ZEXT: {
1169 Hi = B.buildConstant({RB, S32}, 0);
1172 case AMDGPU::G_SEXT: {
1174 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1175 Hi = B.buildAShr({RB, S32},
MI.getOperand(1).
getReg(), ShiftAmt);
1178 case AMDGPU::G_ANYEXT: {
1179 Hi = B.buildUndef({RB, S32});
1184 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1189 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(),
1190 {MI.getOperand(1).getReg(), Hi});
1191 MI.eraseFromParent();
1195 uint64_t ConstVal =
MI.getOperand(1).getCImm()->getZExtValue();
1196 B.buildConstant(
MI.getOperand(0).getReg(), ConstVal);
1198 MI.eraseFromParent();
1203 LLT Ty = MRI.getType(Src);
1207 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1209 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1210 auto One = B.buildConstant(VgprRB_S32, 1);
1211 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1212 auto Zero = B.buildConstant(VgprRB_S32, 0);
1213 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1214 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1216 assert(Ty == S32 || Ty == S16);
1217 auto One = B.buildConstant({VgprRB, Ty}, 1);
1218 B.buildAnd(BoolSrc, Src, One);
1220 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1222 MI.eraseFromParent();
1226 return lowerV_BFE(
MI);
1228 return lowerS_BFE(
MI);
1230 return lowerUniMAD64(
MI);
1232 B.buildMul(
MI.getOperand(0),
MI.getOperand(1),
MI.getOperand(2));
1233 MI.eraseFromParent();
1237 auto Op1 = B.buildTrunc(VgprRB_S32,
MI.getOperand(1));
1238 auto Op2 = B.buildTrunc(VgprRB_S32,
MI.getOperand(2));
1239 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1241 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1242 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1243 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1245 B.buildInstr(NewOpc, {
MI.getOperand(0).getReg(), {SgprRB, S32}},
1247 MI.eraseFromParent();
1251 return lowerSplitTo32(
MI);
1253 return lowerSplitTo32Mul(
MI);
1255 return lowerSplitTo32Select(
MI);
1257 return lowerSplitTo32SExtInReg(
MI);
1259 auto Unmerge = B.buildUnmerge({VgprRB, S32},
MI.getOperand(1).
getReg());
1260 auto LoPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(0));
1261 auto HiPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(1));
1263 B.buildAdd(
MI.getOperand(0).getReg(), LoPopCnt, HiPopCnt,
1266 MI.eraseFromParent();
1270 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1281 if (
Size / 128 == 2)
1283 else if (
Size / 128 == 4)
1287 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1293 else if (DstTy == S96)
1294 splitLoad(
MI, {S64, S32}, S32);
1295 else if (DstTy == V3S32)
1296 splitLoad(
MI, {V2S32, S32}, S32);
1297 else if (DstTy == V6S16)
1298 splitLoad(
MI, {V4S16, V2S16}, V2S16);
1301 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1308 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1310 widenLoad(
MI, S128);
1311 else if (DstTy == V3S32)
1312 widenLoad(
MI, V4S32, S32);
1313 else if (DstTy == V6S16)
1314 widenLoad(
MI, V8S16, V2S16);
1317 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1324 return lowerUnpackAExt(
MI);
1329 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1335 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1337 B.setInstrAndDebugLoc(
MI);
1338 for (
unsigned i =
MI.getNumDefs(); i <
MI.getNumOperands(); ++i) {
1339 MachineOperand &
Op =
MI.getOperand(i);
1343 if (MRI.getRegBank(
Reg) != VgprRB) {
1344 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
1345 Op.setReg(
Copy.getReg(0));
1355 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1360 B.setInstrAndDebugLoc(
MI);
1363 B.buildUnmerge({SgprRB, V2S16}, Unmerge->
getSourceReg());
1364 for (
unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1365 auto [Dst0S32, Dst1S32] =
1366 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1367 B.buildTrunc(
MI.getOperand(i * 2).getReg(), Dst0S32);
1368 B.buildTrunc(
MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1371 auto [Dst0S32, Dst1S32] = unpackAExt(
MI.getOperand(2).getReg());
1372 B.buildTrunc(
MI.getOperand(0).getReg(), Dst0S32);
1373 B.buildTrunc(
MI.getOperand(1).getReg(), Dst1S32);
1376 MI.eraseFromParent();
1381 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1382 B.setInsertPt(*
MI.getParent(),
MI.getParent()->getFirstNonPHI());
1383 MI.getOperand(0).setReg(NewDst);
1384 B.buildTrunc(Dst, NewDst);
1386 for (
unsigned i = 1; i <
MI.getNumOperands(); i += 2) {
1394 auto NewUse = B.buildAnyExt(SgprRB_S32,
UseReg);
1395 MI.getOperand(i).setReg(NewUse.getReg(0));
1403 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1408 assert(MRI.getRegBankOrNull(
MI.getOperand(0).getReg()) == VgprRB);
1412 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1413 return RB == VgprRB || RB == SgprRB;
1418 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1423 unsigned RsrcIdx = RSrcIntrin->
RsrcArg +
MI.getNumExplicitDefs() + 1;
1424 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1430 unsigned RsrcIdx =
MI.getNumOperands();
1431 while (RsrcIdx-- >
MI.getNumExplicitDefs()) {
1432 const MachineOperand &
Op =
MI.getOperand(RsrcIdx);
1433 if (
Op.isReg() &&
Op.getReg().isVirtual())
1436 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1439 return lowerSplitBitCount64To32(
MI);
1441 return lowerExtrVecEltToSel(
MI);
1443 return lowerExtrVecEltTo32(
MI);
1445 return lowerInsVecEltToSel(
MI);
1447 return lowerInsVecEltTo32(
MI);
1449 return lowerAbsToNegMax(
MI);
1451 return lowerAbsToS32(
MI);
1557 return isAnyPtr(Ty, 32) ? Ty : LLT();
1560 return isAnyPtr(Ty, 64) ? Ty : LLT();
1563 return isAnyPtr(Ty, 128) ? Ty : LLT();
1607 const SIRegisterInfo *
TRI =
1608 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1610 if (LLTSize >= 32 &&
TRI->getSGPRClassForBitWidth(LLTSize))
1615 const SIRegisterInfo *
TRI =
1616 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1725bool RegBankLegalizeHelper::applyMappingDst(
1726 MachineInstr &
MI,
unsigned &
OpIdx,
1727 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1732 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1734 LLT Ty = MRI.getType(
Reg);
1735 [[maybe_unused]]
const RegisterBank *RB = MRI.getRegBank(
Reg);
1737 switch (MethodIDs[
OpIdx]) {
1809 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
1810 Op.setReg(NewAgprDst);
1811 if (!MRI.use_nodbg_empty(
Reg))
1812 B.buildCopy(
Reg, NewAgprDst);
1819 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1821 if (!MRI.use_empty(
Reg)) {
1823 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1824 B.buildTrunc(
Reg, CopyS32_Vcc);
1831 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1832 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1833 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1834 Op.setReg(NewVgprDstS16);
1835 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1837 B.buildTrunc(
Reg, NewSgprDstS32);
1848 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1849 Op.setReg(NewVgprDst);
1862 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1863 Op.setReg(NewVgprDst);
1871 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1873 if (!MRI.use_empty(
Reg))
1874 B.buildTrunc(
Reg, NewDst);
1881 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
1882 B.buildCopy(
Reg,
Op.getReg());
1887 MF, MORE,
"amdgpu-regbanklegalize",
1888 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for",
MI);
1893 MF, MORE,
"amdgpu-regbanklegalize",
1894 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported",
MI);
1902bool RegBankLegalizeHelper::applyMappingSrc(
1903 MachineInstr &
MI,
unsigned &
OpIdx,
1904 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1906 for (
unsigned i = 0; i < MethodIDs.
size(); ++
OpIdx, ++i) {
1907 if (MethodIDs[i] ==
None || MethodIDs[i] ==
IntrId || MethodIDs[i] ==
Imm)
1910 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1912 LLT Ty = MRI.getType(
Reg);
1913 const RegisterBank *RB = MRI.getRegBank(
Reg);
1915 switch (MethodIDs[i]) {
1918 assert(RB == VccRB || RB == SgprRB);
1920 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
1922 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1923 Op.setReg(CopyVcc_Scc.getReg(0));
1941 assert(Ty == getTyFromID(MethodIDs[i]));
1942 assert(RB == getRegBankFromID(MethodIDs[i]));
1956 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1957 assert(RB == getRegBankFromID(MethodIDs[i]));
1981 assert(Ty == getTyFromID(MethodIDs[i]));
1983 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
1984 Op.setReg(CopyToVgpr.getReg(0));
2000 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2002 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2003 Op.setReg(CopyToVgpr.getReg(0));
2009 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2010 Op.setReg(CopyToVgpr.getReg(0));
2016 auto CopyToAgpr = B.buildCopy({AgprRB, Ty},
Reg);
2017 Op.setReg(CopyToAgpr.getReg(0));
2024 assert(Ty == getTyFromID(MethodIDs[i]));
2029 WFI.
End = std::next(
MI.getIterator());
2036 assert(Ty == getTyFromID(MethodIDs[i]));
2042 while (
Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2047 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2059 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2063 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2069 assert(Ty == getTyFromID(MethodIDs[i]));
2073 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2083 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2084 Op.setReg(Aext.getReg(0));
2091 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2094 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2095 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2096 Op.setReg(BoolInReg.getReg(0));
2102 auto Sext = B.buildSExt(SgprRB_S32,
Reg);
2103 Op.setReg(Sext.getReg(0));
2109 auto Zext = B.buildZExt({SgprRB, S32},
Reg);
2110 Op.setReg(Zext.getReg(0));
2116 auto Aext = B.buildAnyExt({VgprRB, S32},
Reg);
2117 Op.setReg(Aext.getReg(0));
2124 auto Sext = B.buildSExt({VgprRB, S32},
Reg);
2125 Op.setReg(Sext.getReg(0));
2132 auto Zext = B.buildZExt({VgprRB, S32},
Reg);
2133 Op.setReg(Zext.getReg(0));
2138 MF, MORE,
"amdgpu-regbanklegalize",
2139 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported",
MI);
2149 unsigned StartOpIdx,
2150 unsigned EndOpIdx) {
2151 for (
unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2158bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2159 MachineInstr &
MI,
unsigned RsrcIdx) {
2160 const unsigned NumDefs =
MI.getNumExplicitDefs();
2162 MachineBasicBlock *
MBB =
MI.getParent();
2166 for (
unsigned i = 0; i < NumDefs; ++i) {
2168 if (MRI.getRegBank(
Reg) == VgprRB)
2171 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(
Reg)});
2172 MI.getOperand(i).setReg(NewVgprDst);
2176 B.setInstrAndDebugLoc(
MI);
2179 for (
unsigned i = NumDefs; i < RsrcIdx; ++i) {
2180 MachineOperand &
Op =
MI.getOperand(i);
2188 if (MRI.getRegBank(
Reg) == VgprRB)
2191 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
2192 Op.setReg(
Copy.getReg(0));
2195 SmallSet<Register, 4> OpsToWaterfall;
2198 for (
unsigned i = RsrcIdx; i <
MI.getNumOperands(); ++i) {
2199 MachineOperand &
Op =
MI.getOperand(i);
2204 if (MRI.getRegBank(
Reg) != SgprRB)
2208 if (!OpsToWaterfall.
empty()) {
2210 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic operations.
static Register UseReg(const MachineOperand &MO)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
const SmallVectorImpl< MachineOperand > & Cond
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned MovTermOpc
const unsigned AndSaveExecOpc
bool findRuleAndApplyMapping(MachineInstr &MI)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor blocks which refer to FromMBB to refer to this.
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
bool isValid() const
Check for null.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank * getRegBank(Register Reg) const
Return the register bank of Reg.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified register class.
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been assigned a register class.
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ SgprV4S32_ReadFirstLane
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
@ VerifyAllSgprOrVgprGPHI
@ AextToS32InIncomingBlockGPHI
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands to the instruction's register class.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its APInt value and def register.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LoweringMethodID LoweringMethod
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs
MachineBasicBlock::iterator End