#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-regbanklegalize"
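//
// RegBankLegalizeHelper implements the lowering machinery behind the
// amdgpu-regbanklegalize pass: based on machine uniformity info it assigns
// SGPR/VGPR/VCC register banks and rewrites instructions whose operands have
// no directly legal bank assignment. This listing is an excerpt; elided
// regions are marked with "// ...".
//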
RegBankLegalizeHelper::RegBankLegalizeHelper(
    MachineIRBuilder &B, const MachineUniformityInfo &MUI,
    const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
    : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
      MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
      RBLRules(RBLRules), IsWave32(ST.isWave32()),
      SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
      VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
      VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
47 "No AMDGPU RegBankLegalize rules defined for opcode",
55 "AMDGPU RegBankLegalize: none of the rules defined with "
56 "'Any' for MI's opcode matched MI",
64 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
74 if (!lower(
MI, *Mapping, WaterfallSgprs))
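// executeInWaterfallLoop serializes an instruction that needs uniform (SGPR)
// operands but received divergent values: it loops until every active lane
// has executed with its own operand value. Roughly (a sketch, not verbatim
// output):
//
//   loop:
//     CurrentLane = readfirstlane(Op)            ; scalar candidate
//     Cond = (Op == CurrentLane)                 ; AND-reduced over 32/64-bit
//                                                ; parts for wide operands
//     SavedExec = s_and_saveexec(ballot(Cond))   ; run matching lanes only
//   body:
//     ... original instruction, Op rewritten to CurrentLane ...
//     exec ^= SavedExec                          ; retire finished lanes
//     loop again while any lane remains
//   restore:
//     exec = saved exec mask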
bool RegBankLegalizeHelper::executeInWaterfallLoop(
    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
    SmallSet<Register, 4> &SGPROperandRegs) {
  // Pick the wave-size-dependent exec-mask opcodes.
  unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
  if (IsWave32) {
    MovExecOpc = AMDGPU::S_MOV_B32;
    MovExecTermOpc = AMDGPU::S_MOV_B32_term;
    XorTermOpc = AMDGPU::S_XOR_B32_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    ExecReg = AMDGPU::EXEC_LO;
  } else {
    MovExecOpc = AMDGPU::S_MOV_B64;
    MovExecTermOpc = AMDGPU::S_MOV_B64_term;
    XorTermOpc = AMDGPU::S_XOR_B64_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    ExecReg = AMDGPU::EXEC;
  }
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());

  // ... (WaveRC: the subtarget's wave-mask register class, definition elided)
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
  B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
  Register SavedExec = MRI.createVirtualRegister(WaveRC);
  // ... (create LoopBB, BodyBB, RestoreExecBB and RemainderBB, splice the
  //      instructions in Range into BodyBB; NewBegin marks the spliced range)
  MBB.addSuccessor(LoopBB);

  B.setInsertPt(*LoopBB, LoopBB->end());
  // ...
  auto NewEnd = BodyBB->end();
  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
  // Rewrite the divergent SGPR operands of every instruction in the range to
  // the value of the lane currently being processed.
  DenseMap<Register, Register> WaterfalledRegMap;
  Register CondReg;
  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if this register was already processed by an earlier instruction
      // in the range.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }
      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);
      // ...
      assert(MRI.getRegBank(OpReg) == VgprRB);
      Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
      buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);

      // Build the comparison(s), splitting wide operands into 32- or 64-bit
      // pieces.
      unsigned OpSize = OpTy.getSizeInBits();
      unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
        auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
        }
      }

      // AND together the per-part "is this the current lane's value" tests.
      for (unsigned i = 0; i < NumParts; ++i) {
        Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
        B.buildICmp(CmpInst::ICMP_EQ, CmpReg, OpParts[i], CurrentLaneParts[i]);
        if (!CondReg.isValid())
          CondReg = CmpReg;
        else
          CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }
  // Turn the lane-mask condition into a wave mask and activate only the
  // matching lanes for this trip through the loop.
  Register CondRegLM =
      MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
  B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);

  // Update EXEC, saving the original EXEC value to SavedExec.
  B.buildInstr(AndSaveExecOpc)
      .addDef(SavedExec)
      .addReg(CondRegLM, RegState::Kill);
  MRI.setSimpleHint(SavedExec, CondRegLM);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC: switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);

  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  B.setInsertPt(MBB, MBB.end());
  B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
  B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions land in the remainder block.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}
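// splitLoad breaks one load into several smaller loads according to
// LLTBreakdown, adding a byte offset to the base pointer and a narrowed
// MachineMemOperand for each part, then reassembles the original destination.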
bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
                                      ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(Base);
  const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
  LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
  SmallVector<Register, 4> LoadPartRegs;

  unsigned ByteOffset = 0;
  for (LLT PartTy : LLTBreakdown) {
    Register BasePlusOffset;
    if (ByteOffset == 0) {
      BasePlusOffset = Base;
    } else {
      auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
      BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0);
    }
    auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
    auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
    LoadPartRegs.push_back(LoadPart.getReg(0));
    ByteOffset += PartTy.getSizeInBytes();
  }
  // When every loaded part already has the right piece type, concatenate the
  // parts directly into Dst; otherwise unmerge each part into MergeTy pieces
  // first (exact condition elided).
  // ...
    B.buildMergeLikeInstr(Dst, LoadPartRegs);
  // ...
    SmallVector<Register, 4> MergeTyParts;
    for (Register Reg : LoadPartRegs) {
      if (MRI.getType(Reg) == MergeTy) {
        MergeTyParts.push_back(Reg);
      } else {
        auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
        for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
          MergeTyParts.push_back(Unmerge.getReg(i));
      }
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  // ...
  MI.eraseFromParent();
  return true;
}
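// widenLoad replaces a load with a wider one (e.g. 96 -> 128 bits) and then
// truncates or unmerges the wide result back to the original destination
// type. Callers should only request this when the extra bytes are known to be
// dereferenceable.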
bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
                                      LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();

  MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
  auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);

  if (WideTy.isScalar()) {
    B.buildTrunc(Dst, WideLoad);
  } else {
    SmallVector<Register, 4> MergeTyParts;
    auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);

    LLT DstTy = MRI.getType(Dst);
    unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
    for (unsigned i = 0; i < NumElts; ++i) {
      MergeTyParts.push_back(Unmerge.getReg(i));
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
  return true;
}
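// widenMMOToS32 widens sub-32-bit uniform loads to a full 32-bit load.
// G_ZEXTLOAD masks the loaded value down to the original memory size and
// G_SEXTLOAD sign-extends in-register from it, so the observable result is
// unchanged.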
bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
  Register Dst = MI.getDstReg();
  Register Ptr = MI.getPointerReg();
  MachineMemOperand &MMO = MI.getMMO();
  // Original in-memory size in bits.
  unsigned MemSize = 8 * MMO.getSize().getValue();
  MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);

  if (MI.getOpcode() == G_LOAD) {
    B.buildLoad(Dst, Ptr, *WideMMO);
  } else {
    auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
    if (MI.getOpcode() == G_ZEXTLOAD) {
      // Mask out the bits beyond the original memory size.
      APInt Mask = APInt::getLowBitsSet(32, MemSize);
      auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
      B.buildAnd(Dst, Load, MaskCst);
    } else {
      assert(MI.getOpcode() == G_SEXTLOAD);
      B.buildSExtInReg(Dst, Load, MemSize);
    }
  }

  MI.eraseFromParent();
  return true;
}
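// lowerVccExtToSel turns G_SEXT/G_ZEXT/G_ANYEXT of a VCC (divergent S1) value
// into a select between constants: -1 for sext, 1 otherwise, with the S64
// high half derived from the extend opcode.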
bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  Register Src = MI.getOperand(1).getReg();
  unsigned Opc = MI.getOpcode();
  int TrueExtCst = Opc == G_SEXT ? -1 : 1;
  if (Ty == S32 || Ty == S16) {
    auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
    auto False = B.buildConstant({VgprRB, Ty}, 0);
    B.buildSelect(Dst, Src, True, False);
  } else if (Ty == S64) {
    auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
    auto False = B.buildConstant({VgprRB_S32}, 0);
    auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
    MachineInstrBuilder Hi;
    switch (Opc) {
    case G_SEXT:
      // Sign bit fills the whole value: high half equals the low half.
      Hi = Lo;
      break;
    case G_ZEXT:
      Hi = False;
      break;
    case G_ANYEXT:
      Hi = B.buildUndef({VgprRB_S32});
      break;
    default:
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported",
                         MI);
      return false;
    }
    B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
  } else {
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported",
                       MI);
    return false;
  }

  MI.eraseFromParent();
  return true;
}
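// The unpack* helpers split a packed V2S16 SGPR value into two S32 halves by
// bitcasting to S32 and extracting the low and high 16 bits with zero-, sign-
// or any-extend semantics. The SALU has no packed 16-bit instructions, so
// V2S16 operations are performed on unpacked S32 halves and repacked
// afterwards.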
std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
  auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}
std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
  auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}
std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  // Low half: the high bits are don't-care for an any-extending unpack, so
  // the packed value can be reused directly.
  auto Lo = PackedS32;
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}
std::pair<Register, Register>
RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
  auto [Lo32, Hi32] = unpackAExt(Reg);
  return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
          B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
}
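// lowerUnpackBitShift picks the unpack flavor that preserves the shift's
// semantics on 32-bit halves: shl only needs the low 16 bits valid (anyext),
// lshr needs zero-filled high bits (zext), and ashr needs sign-filled high
// bits (sext).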
bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SHL: {
    auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_LSHR: {
    auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_ASHR: {
    auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
    Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
    break;
  }
  default:
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
                       MI);
    return false;
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
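// lowerUnpackMinMax applies the same idea to 16-bit min/max: signed variants
// compare sign-extended halves, unsigned variants compare zero-extended
// halves, and the two S32 results are repacked with build_vector_trunc.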
bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX: {
    auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  default:
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented",
                       MI);
    return false;
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
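// lowerUnpackAExt handles operations where garbage high bits are harmless
// (the result of each half is truncated back to 16 bits anyway), so a plain
// any-extend unpack suffices for both operands.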
bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
  auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
  auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
  auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
  auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
                          {ResLo.getReg(0), ResHi.getReg(0)});
  MI.eraseFromParent();
  return true;
}
static bool isSignedBFE(MachineInstr &MI) {
  if (auto *GI = dyn_cast<GIntrinsic>(&MI))
    return (GI->is(Intrinsic::amdgcn_sbfe));
  return MI.getOpcode() == AMDGPU::G_SBFX;
}
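// lowerV_BFE lowers a divergent 64-bit bitfield extract (G_SBFX/G_UBFX or the
// amdgcn.sbfe/ubfe intrinsics). The field is first shifted down to bit 0;
// with a constant width the remainder maps onto 32-bit bitfield-extract
// instructions, otherwise a shl/shr pair extends from the variable width.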
bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  bool Signed = isSignedBFE(MI);
  // Intrinsics carry an extra operand for the intrinsic ID.
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // ...
  // Shift the field down to bit 0.
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
  if (!ConstWidth) {
    // Variable width: shift left so the field's top bit lands in bit 63, then
    // shift back right (arithmetic or logical) to extend it.
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return true;
  }

  // Constant width: use 32-bit bitfield extracts on the 64-bit halves.
  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // The field fits in the low half: extract it there and extend into the
    // high half.
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed)
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    else
      Hi = Zero;
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    // The field spans both halves: keep the low half, extract the rest from
    // the high half.
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }

  MI.eraseFromParent();
  return true;
}
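// lowerS_BFE selects the scalar S_BFE_{I|U}{32|64} machine instruction
// directly. S_BFE takes a packed src1, with the field offset in the low bits
// and the width in bits [22:16], so the offset and width registers are packed
// before use.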
bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Pack the field offset and width into the S_BFE src1 layout.
  auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
  auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
  auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
  auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
  unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
  unsigned Opc = Ty == S32 ? Opc32 : Opc64;

  // Select the machine instruction directly; constraining inserts copies
  // between register classes and register banks as needed.
  auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
                            {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
  if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
                                        *ST.getRegisterInfo(), RBI)) {
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "AMDGPU RegBankLegalize: lowerS_BFE, failed to constrain BFE",
                       MI);
    return false;
  }

  B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}
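// lowerSplitTo32 scalarizes a 64-bit (or V4S16/V2S32) VALU operation into two
// 32-bit (or V2S16) halves and merges the results back together.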
bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  unsigned Flags = MI.getFlags();
  auto Lo =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
  auto Hi =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
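// lowerSplitTo32Mul expands a 64-bit multiply into 32-bit pieces using the
// schoolbook identity:
//   lo(a*b) = lo(a)*lo(b)
//   hi(a*b) = umulh(lo(a), lo(b)) + lo(a)*hi(b) + hi(a)*lo(b)
// (terms that only affect bits above 63 are dropped).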
bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == S64);
  auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());

  auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
  auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
  auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
  auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
  auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
  auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
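// lowerSplitTo16 rewrites a uniform V2S16 operation as two scalar S16
// operations on the unpacked halves; unary, binary and ternary opcodes are
// distinguished by operand count.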
bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == V2S16);
  unsigned Opc = MI.getOpcode();
  unsigned NumOps = MI.getNumOperands();
  unsigned Flags = MI.getFlags();
  auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
  if (NumOps == 2) {
    // Unary operation.
    auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
    auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
    MI.eraseFromParent();
    return true;
  }

  auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
  if (NumOps == 3) {
    // Binary operation.
    auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
    auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
    MI.eraseFromParent();
    return true;
  }

  // Ternary operation.
  auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
  auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
  auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
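// lowerUniMAD64 lowers a uniform G_AMDGPU_MAD_U64_U32: the 32x32->64 multiply
// is done on the SALU (falling back to a VALU mul-hi plus read-any-lane when
// the subtarget has no scalar mul-hi), and the 64-bit addend is added with an
// add/add-with-carry pair unless it is known to be zero.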
bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
  // Operand layout: Dst0(s64) = Src0(s32) * Src1(s32) + Src2(s64), with Dst1
  // holding the carry-out.
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register Src1 = MI.getOperand(3).getReg();
  Register Src2 = MI.getOperand(4).getReg();
  const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();

  Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
  Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
  if (ST.hasScalarMulHiInsts()) {
    B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
  } else {
    // No scalar mul-hi: compute it on the VALU and read the result back.
    auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
    auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
    auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
    buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
  }

  // If the addend is known zero, skip the 64-bit add.
  if (mi_match(Src2, MRI, m_ZeroInt())) {
    B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
    B.buildConstant(Dst1, 0);
  } else {
    Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
    Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
    B.buildUnmerge({Src2Lo, Src2Hi}, Src2);

    auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
    auto AddHi =
        B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
    B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
    B.buildCopy(Dst1, AddHi.getReg(1));
  }

  MI.eraseFromParent();
  return true;
}
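// lowerSplitTo32Select splits a 64-bit (or V4S16/V2S32/64-bit pointer) select
// into two 32-bit selects that share the same condition.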
bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
         isAnyPtr(DstTy, 64));
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
  Register Cond = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  auto Lo =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
  auto Hi =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
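// lowerSplitTo32SExtInReg splits G_SEXT_INREG on 64 bits: when the field is
// entirely in the low half the high half becomes a broadcast of the sign bit
// (ashr by 31); when it crosses into the high half only the high half needs
// an in-register sign-extend.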
bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  int Amt = MI.getOperand(2).getImm();
  Register Lo, Hi;
  if (Amt <= 32) {
    // The sign bit lives in the low half: sign-extend within the low 32 bits
    // if needed, then fill the high half with copies of the sign bit.
    auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
    if (Amt == 32)
      Lo = Freeze.getReg(0);
    else
      Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);

    auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
    Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
  } else {
    // The sign bit lives in the high half: the low half is unchanged.
    Lo = Op1.getReg(0);
    Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
  }
  B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
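// lower() dispatches on the LoweringMethod chosen by the matched rule. Small
// self-contained lowerings are inlined here; larger ones call the helpers
// above. Instructions whose SGPR-only operands turned out divergent finish by
// running inside a waterfall loop.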
bool RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  SmallSet<Register, 4> &WaterfallSgprs) {
  switch (Mapping.LoweringMethod) {
  // ... (case labels elided; each block below handles one LoweringMethodID)
    return lowerVccExtToSel(MI);

  // Uniform S1 {s|z}ext: select between two constants on the SGPR bank.
  {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // The S1 input is 'Legalizer legal', already extended and cleaned up by
    // the source-operand mapping.
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    return true;
  }

  // ...
    return lowerUnpackBitShift(MI);
  // ...
    return lowerUnpackMinMax(MI);
  // ...
    return lowerSplitTo16(MI);

  // 32-to-64-bit extend: materialize the high half and merge.
  {
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate the sign bit of the low half.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef({RB, S32});
      break;
    }
    default:
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: Ext32To64, unsupported opcode",
                         MI);
      return false;
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    return true;
  }

  // Uniform constant extend: rebuild the constant at the wider type.
  {
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
    MI.eraseFromParent();
    return true;
  }

  // Copy an S1 value into VCC: clear the undefined high bits first.
  {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // ...
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    // ... (compare BoolSrc against Zero into the VCC def, elided)
    MI.eraseFromParent();
    return true;
  }

  // ...
    return lowerV_BFE(MI);
  // ...
    return lowerS_BFE(MI);
  // ...
    return lowerUniMAD64(MI);

  // Divergent 64-bit scalar multiply: a plain multiply.
  {
    B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
    MI.eraseFromParent();
    return true;
  }

  // S_MUL_U64_U32 / S_MUL_I64_I32: rewrite as MAD with a zero addend.
  {
    auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
    auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
    auto Zero = B.buildConstant({VgprRB, S64}, 0);
    unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
                          ? AMDGPU::G_AMDGPU_MAD_U64_U32
                          : AMDGPU::G_AMDGPU_MAD_I64_I32;
    B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
                 {Op1, Op2, Zero});
    MI.eraseFromParent();
    return true;
  }

  // ...
    return lowerSplitTo32(MI);
  // ...
    return lowerSplitTo32Mul(MI);
  // ...
    return lowerSplitTo32Select(MI);
  // ...
    return lowerSplitTo32SExtInReg(MI);

  // Split wide loads based on the destination type.
  {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split into 128-bit pieces.
    if (Size > 128) {
      // ... (B128: a 128-bit scalar or vector piece type, construction elided)
      if (Size / 128 == 2)
        splitLoad(MI, {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, {B128, B128, B128, B128});
      else {
        reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                           "AMDGPU RegBankLegalize: SplitLoad, unsupported type",
                           MI);
        return false;
      }
      // 96-bit loads split into 64 + 32 bits.
    } else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: SplitLoad, unsupported type",
                         MI);
      return false;
    }
    break;
  }

  // Widen 96-bit loads to 128 bits.
  {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == S96)
      widenLoad(MI, S128);
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: WidenLoad, unsupported type",
                         MI);
      return false;
    }
    break;
  }

  // ...
    return lowerUnpackAExt(MI);
  // ... (remaining cases, including the do-nothing case, elided)
  }

  if (!WaterfallSgprs.empty()) {
    MachineBasicBlock::iterator I = MI.getIterator();
    if (!executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                WaterfallSgprs))
      return false;
  }
  return true;
}
// getTyFromID / getBTyFromID map RegBankLLTMappingApplyID values to LLTs; the
// 'B' variants also accept any pointer type of the matching width.
// ...
    return isAnyPtr(Ty, 32) ? Ty : LLT();
// ...
    return isAnyPtr(Ty, 64) ? Ty : LLT();
// ...
    return isAnyPtr(Ty, 128) ? Ty : LLT();
// ...
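// applyMappingDst rewrites def operands to match the mapping: exact matches
// are only asserted, while the remaining cases insert repair code (VCC defs
// for divergent S1, read-any-lane for uniform values computed on the VALU,
// S32 widening for small uniform defs).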
bool RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0; walk them in parallel with MethodIDs,
  // advancing OpIdx (loop header elided).
  // ...
  MachineOperand &Op = MI.getOperand(OpIdx);
  Register Reg = Op.getReg();
  LLT Ty = MRI.getType(Reg);
  [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

  switch (MethodIDs[OpIdx]) {
  // ... (exact-match cases only assert Ty/RB against the mapping, elided)

  // Divergent S1 def: define a fresh VCC register; if the old S32-typed def
  // has uses, reach them through a VCC->SCC copy plus trunc.
  {
    Register NewDst = MRI.createVirtualRegister(VccRB_S1);
    Op.setReg(NewDst);
    if (!MRI.use_empty(Reg)) {
      auto CopyS32_Vcc =
          B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
      B.buildTrunc(Reg, CopyS32_Vcc);
    }
    break;
  }

  // Uniform S16 def computed on the VALU: any-extend to S32, read the value
  // back to an SGPR, then truncate to the original def.
  {
    Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
    Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
    Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
    Op.setReg(NewVgprDstS16);
    B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
    buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
    B.buildTrunc(Reg, NewSgprDstS32);
    break;
  }

  // Uniform def computed on the VALU: read-any-lane back to the SGPR def.
  {
    Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
    Op.setReg(NewVgprDst);
    buildReadAnyLane(B, Reg, NewVgprDst, RBI);
    break;
  }
  {
    Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
    Op.setReg(NewVgprDst);
    // ... (read-any-lane of the value, elided)
    break;
  }

  // Uniform S1/S16 def: define at S32 and truncate back for the old uses.
  {
    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    Op.setReg(NewDst);
    if (!MRI.use_empty(Reg))
      B.buildTrunc(Reg, NewDst);
    break;
  }

  // ...
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for",
                       MI);
    return false;
  // ...
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "AMDGPU RegBankLegalize: applyMappingDst, ID not supported",
                       MI);
    return false;
  }
  // ...
  return true;
}
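// applyMappingSrc does the same for use operands: SGPR->VGPR copies when a
// VGPR is required, any/sign/zero-extension of sub-32-bit scalars, VCC
// repairs for S1 uses, and collection of divergent SGPR-only operands for a
// waterfall loop.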
bool RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    // VCC use: a uniform S1 (SGPR) feeding an instruction that wants VCC is
    // any-extended and converted with an SCC->VCC copy.
    // ...
      assert(RB == VccRB || RB == SgprRB);
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
        auto CopyVcc_Scc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
        Op.setReg(CopyVcc_Scc.getReg(0));
      }
      break;

    // Exact-match cases: nothing to do but verify.
    // ...
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    // ...
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;

    // VGPR uses: insert an SGPR->VGPR copy when the value is uniform.
    // ...
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    // ...
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;

    // SGPR-only operand: if it is divergent, record it so the caller wraps
    // the instruction in a waterfall loop.
    // ...
      assert(Ty == getTyFromID(MethodIDs[i]));
      // ... (collect Reg into SgprWaterfallOperandRegs, elided)
      break;

    // Sub-32-bit uniform scalars used as S32: any-extend...
    // ...
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      Op.setReg(Aext.getReg(0));
      break;

    // ...any-extend and mask down to one bit ('bool in reg')...
    {
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      // Zext of SgprS1 is not legal: accept SgprRB_S32 with only the low bit
      // defined instead.
      auto Cst1 = B.buildConstant(SgprRB_S32, 1);
      auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
      Op.setReg(BoolInReg.getReg(0));
      break;
    }

    // ...sign-extend...
    {
      auto Sext = B.buildSExt(SgprRB_S32, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    // ...or zero-extend.
    {
      auto Zext = B.buildZExt({SgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }

    // The same three extends for VGPR operands.
    {
      auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    {
      auto Sext = B.buildSExt({VgprRB, S32}, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    {
      auto Zext = B.buildZExt({VgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }

    // ...
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported",
                         MI);
      return false;
    }
  }
  return true;
}
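// applyMappingPHI fixes phi types/banks. Only uniform S1 phis need work here:
// they are widened to S32, with incoming values extended at the end of their
// defining blocks. Divergent S1 phis must already have been lowered to lane
// masks by AMDGPUGlobalISelDivergenceLowering.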
bool RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  // Uniform S1 phi: widen to S32, any-extending each incoming value at the
  // end of its defining block.
  if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) {
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());

    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }

    return true;
  }

  if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "AMDGPU RegBankLegalize: Can't lower divergent S1 G_PHI",
                       MI);
    return false;
  }

  // ... (S32/S64 and pointer phis need no changes, condition elided)
    return true;

  reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                     "AMDGPU RegBankLegalize: type not supported for G_PHI",
                     MI);
  return false;
}
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB,
                                    MachineRegisterInfo &MRI,
                                    unsigned StartOpIdx, unsigned EndOpIdx) {
  for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
    if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
      return false;
  }
  return true;
}
void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
  const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
  // Put RB on all registers.
  unsigned NumDefs = MI.getNumDefs();
  unsigned NumOperands = MI.getNumOperands();

  // ... (verify the defs' banks and, for SGPR, their uniformity, elided)

  for (unsigned i = NumDefs; i < NumOperands; ++i) {
    Register Reg = MI.getOperand(i).getReg();
    if (MRI.getRegBank(Reg) != RB) {
      auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
      MI.getOperand(i).setReg(Copy.getReg(0));
    }
  }
}