25#include "llvm/IR/IntrinsicsAMDGPU.h"
27#define DEBUG_TYPE "amdgpu-regbanklegalize"
35 : ST(B.getMF().getSubtarget<
GCNSubtarget>()), B(B), MRI(*B.getMRI()),
36 MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
37 SgprRB(&RBI.getRegBank(
AMDGPU::SGPRRegBankID)),
38 VgprRB(&RBI.getRegBank(
AMDGPU::VGPRRegBankID)),
39 VccRB(&RBI.getRegBank(
AMDGPU::VCCRegBankID)) {}
48 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
56 lower(
MI, Mapping, WaterfallSgprs);
59bool RegBankLegalizeHelper::executeInWaterfallLoop(
71 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
73 MovExecOpc = AMDGPU::S_MOV_B32;
74 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
75 XorTermOpc = AMDGPU::S_XOR_B32_term;
76 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
77 ExecReg = AMDGPU::EXEC_LO;
79 MovExecOpc = AMDGPU::S_MOV_B64;
80 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
81 XorTermOpc = AMDGPU::S_XOR_B64_term;
82 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
83 ExecReg = AMDGPU::EXEC;
87 const int OrigRangeSize = std::distance(
Range.begin(),
Range.end());
91 Register SaveExecReg =
MRI.createVirtualRegister(WaveRC);
92 Register InitSaveExecReg =
MRI.createVirtualRegister(WaveRC);
95 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
97 Register SavedExec =
MRI.createVirtualRegister(WaveRC);
121 MBB.addSuccessor(LoopBB);
124 B.setInsertPt(*LoopBB, LoopBB->
end());
175 auto NewEnd = BodyBB->
end();
176 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
184 if (!SGPROperandRegs.
count(OldReg))
189 auto OldVal = WaterfalledRegMap.
find(OldReg);
190 if (OldVal != WaterfalledRegMap.
end()) {
191 Op.setReg(OldVal->second);
196 LLT OpTy = MRI.getType(OpReg);
199 assert(MRI.getRegBank(OpReg) == VgprRB);
200 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
205 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
207 unsigned NumParts = OpSize / PartSize;
213 CurrentLaneParts.
push_back(CurrentLaneReg);
215 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
216 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
217 for (
unsigned i = 0; i < NumParts; ++i) {
219 CurrentLaneParts.
push_back(UnmergeCurrLane.getReg(i));
223 for (
unsigned i = 0; i < NumParts; ++i) {
224 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
230 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
233 Op.setReg(CurrentLaneReg);
236 WaterfalledRegMap.
insert(std::pair(OldReg,
Op.getReg()));
242 MRI.createVirtualRegister({WaveRC,
LLT::scalar(IsWave32 ? 32 : 64)});
243 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
246 B.buildInstr(AndSaveExecOpc)
249 MRI.setSimpleHint(SavedExec, CondRegLM);
251 B.setInsertPt(*BodyBB, BodyBB->
end());
254 B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
260 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
264 B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
267 B.setInsertPt(*RestoreExecBB, RestoreExecBB->
begin());
268 B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
272 B.setInsertPt(*RemainderBB, RemainderBB->
begin());
277void RegBankLegalizeHelper::splitLoad(MachineInstr &
MI,
279 MachineFunction &MF = B.getMF();
280 assert(
MI.getNumMemOperands() == 1);
281 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
283 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
285 LLT PtrTy = MRI.getType(
Base);
286 const RegisterBank *PtrRB = MRI.getRegBankOrNull(
Base);
290 unsigned ByteOffset = 0;
291 for (LLT PartTy : LLTBreakdown) {
293 if (ByteOffset == 0) {
294 BasePlusOffset =
Base;
296 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
301 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
302 LoadPartRegs.
push_back(LoadPart.getReg(0));
308 B.buildMergeLikeInstr(Dst, LoadPartRegs);
314 if (MRI.getType(
Reg) == MergeTy) {
317 auto Unmerge = B.buildUnmerge({DstRB, MergeTy},
Reg);
318 for (
unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
319 MergeTyParts.
push_back(Unmerge.getReg(i));
322 B.buildMergeLikeInstr(Dst, MergeTyParts);
324 MI.eraseFromParent();
327void RegBankLegalizeHelper::widenLoad(MachineInstr &
MI, LLT WideTy,
329 MachineFunction &MF = B.getMF();
330 assert(
MI.getNumMemOperands() == 1);
331 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
333 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
337 auto WideLoad = B.buildLoad({DstRB, WideTy},
Base, *WideMMO);
340 B.buildTrunc(Dst, WideLoad);
343 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
345 LLT DstTy = MRI.getType(Dst);
347 for (
unsigned i = 0; i < NumElts; ++i) {
348 MergeTyParts.
push_back(Unmerge.getReg(i));
350 B.buildMergeLikeInstr(Dst, MergeTyParts);
352 MI.eraseFromParent();
355void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &
MI)
const {
358 MachineMemOperand &MMO =
MI.getMMO();
361 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
363 if (
MI.getOpcode() == G_LOAD) {
364 B.buildLoad(Dst, Ptr, *WideMMO);
366 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
368 if (
MI.getOpcode() == G_ZEXTLOAD) {
370 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
371 B.buildAnd(Dst, Load, MaskCst);
373 assert(
MI.getOpcode() == G_SEXTLOAD);
374 B.buildSExtInReg(Dst, Load, MemSize);
378 MI.eraseFromParent();
381void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &
MI) {
383 LLT Ty = MRI.getType(Dst);
385 unsigned Opc =
MI.getOpcode();
386 int TrueExtCst =
Opc == G_SEXT ? -1 : 1;
387 if (Ty == S32 || Ty == S16) {
388 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
389 auto False = B.buildConstant({VgprRB, Ty}, 0);
390 B.buildSelect(Dst, Src, True, False);
391 }
else if (Ty == S64) {
392 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
393 auto False = B.buildConstant({VgprRB_S32}, 0);
394 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
395 MachineInstrBuilder
Hi;
404 Hi = B.buildUndef({VgprRB_S32});
410 B.buildMergeValues(Dst, {
Lo.getReg(0),
Hi.getReg(0)});
415 MI.eraseFromParent();
418std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(
Register Reg) {
419 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
420 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
421 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
422 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
423 return {
Lo.getReg(0),
Hi.getReg(0)};
426std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(
Register Reg) {
427 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
428 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
429 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
430 return {
Lo.getReg(0),
Hi.getReg(0)};
433std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(
Register Reg) {
434 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
436 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
437 return {
Lo.getReg(0),
Hi.getReg(0)};
440std::pair<Register, Register>
441RegBankLegalizeHelper::unpackAExtTruncS16(
Register Reg) {
442 auto [Lo32, Hi32] = unpackAExt(
Reg);
443 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
444 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
447void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &
MI) {
449 switch (
MI.getOpcode()) {
450 case AMDGPU::G_SHL: {
451 auto [Val0, Val1] = unpackAExt(
MI.getOperand(1).getReg());
452 auto [Amt0, Amt1] = unpackAExt(
MI.getOperand(2).getReg());
453 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
454 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
457 case AMDGPU::G_LSHR: {
458 auto [Val0, Val1] = unpackZExt(
MI.getOperand(1).getReg());
459 auto [Amt0, Amt1] = unpackZExt(
MI.getOperand(2).getReg());
460 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
461 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
464 case AMDGPU::G_ASHR: {
465 auto [Val0, Val1] = unpackSExt(
MI.getOperand(1).getReg());
466 auto [Amt0, Amt1] = unpackSExt(
MI.getOperand(2).getReg());
467 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
468 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
474 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
475 MI.eraseFromParent();
478void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &
MI) {
480 switch (
MI.getOpcode()) {
482 case AMDGPU::G_SMAX: {
484 auto [Val0_Lo, Val0_Hi] = unpackSExt(
MI.getOperand(1).getReg());
485 auto [Val1_Lo, Val1_Hi] = unpackSExt(
MI.getOperand(2).getReg());
486 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
488 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
493 case AMDGPU::G_UMAX: {
495 auto [Val0_Lo, Val0_Hi] = unpackZExt(
MI.getOperand(1).getReg());
496 auto [Val1_Lo, Val1_Hi] = unpackZExt(
MI.getOperand(2).getReg());
497 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
499 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
506 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
507 MI.eraseFromParent();
510void RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &
MI) {
511 auto [Op1Lo, Op1Hi] = unpackAExt(
MI.getOperand(1).getReg());
512 auto [Op2Lo, Op2Hi] = unpackAExt(
MI.getOperand(2).getReg());
513 auto ResLo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
514 auto ResHi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
515 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
516 {ResLo.getReg(0), ResHi.getReg(0)});
517 MI.eraseFromParent();
522 return (GI->is(Intrinsic::amdgcn_sbfe));
524 return MI.getOpcode() == AMDGPU::G_SBFX;
527void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &
MI) {
534 Register Src =
MI.getOperand(FirstOpnd).getReg();
535 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
536 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
541 unsigned SHROpc =
Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
542 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
550 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
551 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
552 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
553 MI.eraseFromParent();
557 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
558 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
559 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
560 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
561 auto Zero = B.buildConstant({VgprRB, S32}, 0);
562 unsigned BFXOpc =
Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
564 if (WidthImm <= 32) {
566 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo,
Zero, Width});
567 MachineInstrBuilder
Hi;
570 Hi = B.buildAShr(VgprRB_S32,
Lo, B.buildConstant(VgprRB_S32, 31));
575 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
577 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
579 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi,
Zero, Amt});
580 B.buildMergeLikeInstr(Dst, {SHRSrcLo,
Hi});
583 MI.eraseFromParent();
586void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &
MI) {
588 LLT Ty = MRI.getType(DstReg);
591 Register Src =
MI.getOperand(FirstOpnd).getReg();
592 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
593 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
600 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
601 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
602 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset,
Size);
603 unsigned Opc32 =
Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
604 unsigned Opc64 =
Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
605 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
609 auto S_BFE = B.buildInstr(
Opc, {{SgprRB, Ty}},
610 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
612 *ST.getRegisterInfo(), RBI))
615 B.buildCopy(DstReg,
S_BFE->getOperand(0).getReg());
616 MI.eraseFromParent();
619void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &
MI) {
621 LLT DstTy = MRI.getType(Dst);
622 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
623 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
624 auto Op1 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(1).
getReg());
625 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
626 unsigned Opc =
MI.getOpcode();
629 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)},
Flags);
631 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)},
Flags);
632 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
633 MI.eraseFromParent();
636void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &
MI) {
638 assert(MRI.getType(Dst) == V2S16);
639 unsigned Opc =
MI.getOpcode();
642 if (
MI.getNumOperands() == 2) {
643 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(
MI.getOperand(1).getReg());
644 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo},
Flags);
645 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi},
Flags);
646 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
647 MI.eraseFromParent();
652 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(
MI.getOperand(1).getReg());
653 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(
MI.getOperand(2).getReg());
654 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo},
Flags);
655 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi},
Flags);
656 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
657 MI.eraseFromParent();
660void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &
MI) {
662 LLT DstTy = MRI.getType(Dst);
663 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
665 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
666 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
667 auto Op3 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(3).
getReg());
671 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(0), Op3.getReg(0), Flags);
673 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(1), Op3.getReg(1), Flags);
675 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
676 MI.eraseFromParent();
679void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &
MI) {
680 auto Op1 = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
681 int Amt =
MI.getOperand(2).getImm();
685 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
688 Lo = Freeze.getReg(0);
691 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
694 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
695 Hi = B.buildAShr(VgprRB_S32,
Lo, SignExtCst).getReg(0);
699 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
702 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(), {Lo, Hi});
703 MI.eraseFromParent();
706void RegBankLegalizeHelper::lower(MachineInstr &
MI,
708 SmallSet<Register, 4> &WaterfallSgprs) {
714 return lowerVccExtToSel(
MI);
716 LLT Ty = MRI.getType(
MI.getOperand(0).getReg());
717 auto True = B.buildConstant({SgprRB, Ty},
718 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
719 auto False = B.buildConstant({SgprRB, Ty}, 0);
723 B.buildSelect(
MI.getOperand(0).getReg(),
MI.getOperand(1).getReg(), True,
725 MI.eraseFromParent();
729 return lowerUnpackBitShift(
MI);
731 return lowerUnpackMinMax(
MI);
733 return lowerSplitTo16(
MI);
735 const RegisterBank *RB = MRI.getRegBank(
MI.getOperand(0).getReg());
736 MachineInstrBuilder
Hi;
737 switch (
MI.getOpcode()) {
738 case AMDGPU::G_ZEXT: {
739 Hi = B.buildConstant({RB, S32}, 0);
742 case AMDGPU::G_SEXT: {
744 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
745 Hi = B.buildAShr({RB, S32},
MI.getOperand(1).
getReg(), ShiftAmt);
748 case AMDGPU::G_ANYEXT: {
749 Hi = B.buildUndef({RB, S32});
756 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(),
757 {MI.getOperand(1).getReg(), Hi});
758 MI.eraseFromParent();
762 uint64_t ConstVal =
MI.getOperand(1).getCImm()->getZExtValue();
763 B.buildConstant(
MI.getOperand(0).getReg(), ConstVal);
765 MI.eraseFromParent();
770 LLT Ty = MRI.getType(Src);
774 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
776 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
777 auto One = B.buildConstant(VgprRB_S32, 1);
778 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
779 auto Zero = B.buildConstant(VgprRB_S32, 0);
780 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
781 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
783 assert(Ty == S32 || Ty == S16);
784 auto One = B.buildConstant({VgprRB, Ty}, 1);
785 B.buildAnd(BoolSrc, Src, One);
787 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
789 MI.eraseFromParent();
793 return lowerV_BFE(
MI);
795 return lowerS_BFE(
MI);
797 return lowerSplitTo32(
MI);
799 return lowerSplitTo32Select(
MI);
801 return lowerSplitTo32SExtInReg(
MI);
803 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
816 else if (
Size / 128 == 4)
824 else if (DstTy == S96)
825 splitLoad(
MI, {S64, S32}, S32);
826 else if (DstTy == V3S32)
827 splitLoad(
MI, {V2S32, S32}, S32);
828 else if (DstTy == V6S16)
829 splitLoad(
MI, {V4S16, V2S16}, V2S16);
837 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
840 else if (DstTy == V3S32)
841 widenLoad(
MI, V4S32, S32);
842 else if (DstTy == V6S16)
843 widenLoad(
MI, V8S16, V2S16);
851 return lowerUnpackAExt(
MI);
856 if (!WaterfallSgprs.
empty()) {
858 executeInWaterfallLoop(B,
make_range(
I, std::next(
I)), WaterfallSgprs);
935 return isAnyPtr(Ty, 32) ? Ty : LLT();
938 return isAnyPtr(Ty, 64) ? Ty : LLT();
941 return isAnyPtr(Ty, 128) ? Ty : LLT();
1058void RegBankLegalizeHelper::applyMappingDst(
1059 MachineInstr &
MI,
unsigned &
OpIdx,
1060 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1065 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1067 LLT Ty = MRI.getType(
Reg);
1068 [[maybe_unused]]
const RegisterBank *RB = MRI.getRegBank(
Reg);
1070 switch (MethodIDs[
OpIdx]) {
1129 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1132 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1133 B.buildTrunc(
Reg, CopyS32_Vcc);
1139 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1140 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1141 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1142 Op.setReg(NewVgprDstS16);
1143 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1145 B.buildTrunc(
Reg, NewSgprDstS32);
1154 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1155 Op.setReg(NewVgprDst);
1167 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1168 Op.setReg(NewVgprDst);
1176 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1178 if (!MRI.use_empty(
Reg))
1179 B.buildTrunc(
Reg, NewDst);
1192void RegBankLegalizeHelper::applyMappingSrc(
1193 MachineInstr &
MI,
unsigned &
OpIdx,
1194 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1195 SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
1196 for (
unsigned i = 0; i < MethodIDs.
size(); ++
OpIdx, ++i) {
1197 if (MethodIDs[i] ==
None || MethodIDs[i] ==
IntrId || MethodIDs[i] ==
Imm)
1200 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1202 LLT Ty = MRI.getType(
Reg);
1203 const RegisterBank *RB = MRI.getRegBank(
Reg);
1205 switch (MethodIDs[i]) {
1208 assert(RB == VccRB || RB == SgprRB);
1210 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
1212 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1213 Op.setReg(CopyVcc_Scc.getReg(0));
1231 assert(Ty == getTyFromID(MethodIDs[i]));
1232 assert(RB == getRegBankFromID(MethodIDs[i]));
1245 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1246 assert(RB == getRegBankFromID(MethodIDs[i]));
1262 assert(Ty == getTyFromID(MethodIDs[i]));
1264 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
1265 Op.setReg(CopyToVgpr.getReg(0));
1279 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1281 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
1282 Op.setReg(CopyToVgpr.getReg(0));
1289 assert(Ty == getTyFromID(MethodIDs[i]));
1299 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
1300 Op.setReg(Aext.getReg(0));
1307 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
1310 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
1311 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
1312 Op.setReg(BoolInReg.getReg(0));
1318 auto Sext = B.buildSExt(SgprRB_S32,
Reg);
1319 Op.setReg(Sext.getReg(0));
1325 auto Zext = B.buildZExt({SgprRB, S32},
Reg);
1326 Op.setReg(Zext.getReg(0));
1333 auto Sext = B.buildSExt({VgprRB, S32},
Reg);
1334 Op.setReg(Sext.getReg(0));
1341 auto Zext = B.buildZExt({VgprRB, S32},
Reg);
1342 Op.setReg(Zext.getReg(0));
1353 LLT Ty = MRI.getType(Dst);
1356 B.setInsertPt(*
MI.getParent(),
MI.getParent()->getFirstNonPHI());
1358 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1359 MI.getOperand(0).setReg(NewDst);
1360 B.buildTrunc(Dst, NewDst);
1362 for (
unsigned i = 1; i <
MI.getNumOperands(); i += 2) {
1365 auto DefMI = MRI.getVRegDef(
UseReg)->getIterator();
1370 auto NewUse = B.buildAnyExt(SgprRB_S32,
UseReg);
1371 MI.getOperand(i).setReg(NewUse.getReg(0));
1380 if (Ty ==
LLT::scalar(1) && MUI.isDivergent(Dst)) {
1383 "before RegBankLegalize to lower lane mask(vcc) phis");
1401 unsigned StartOpIdx,
1402 unsigned EndOpIdx) {
1403 for (
unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1404 if (
MRI.getRegBankOrNull(
MI.getOperand(i).getReg()) != RB)
1411 const RegisterBank *RB = MRI.getRegBank(
MI.getOperand(0).getReg());
1413 unsigned NumDefs =
MI.getNumDefs();
1414 unsigned NumOperands =
MI.getNumOperands();
1422 for (
unsigned i = NumDefs; i < NumOperands; ++i) {
1424 if (MRI.getRegBank(Reg) != RB) {
1425 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1426 MI.getOperand(i).setReg(Copy.getReg(0));
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic operations.
static Register UseReg(const MachineOperand &MO)
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
const SmallVectorImpl< MachineOperand > & Cond
void applyMappingTrivial(MachineInstr &MI)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
void findRuleAndApplyMapping(MachineInstr &MI)
void applyMappingPHI(MachineInstr &MI)
const RegBankLLTMapping & findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
Representation of each machine instruction.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
void push_back(const T &Elt)
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isAnyPtr(LLT Ty, unsigned Width)
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Kill
The last use of a register.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LoweringMethodID LoweringMethod
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping