26#include "llvm/IR/IntrinsicsAMDGPU.h"
28#define DEBUG_TYPE "amdgpu-regbanklegalize"
36 : MF(B.getMF()), ST(MF.getSubtarget<
GCNSubtarget>()), B(B),
37 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
38 RBLRules(RBLRules), IsWave32(ST.isWave32()),
39 SgprRB(&RBI.getRegBank(
AMDGPU::SGPRRegBankID)),
40 VgprRB(&RBI.getRegBank(
AMDGPU::VGPRRegBankID)),
41 VccRB(&RBI.getRegBank(
AMDGPU::VCCRegBankID)) {}
47 "No AMDGPU RegBankLegalize rules defined for opcode",
55 "AMDGPU RegBankLegalize: none of the rules defined with "
56 "'Any' for MI's opcode matched MI",
64 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
74 if (!lower(
MI, *Mapping, WFI))
83 "Waterfall range not initialized");
97 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
99 MovExecOpc = AMDGPU::S_MOV_B32;
100 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
101 XorTermOpc = AMDGPU::S_XOR_B32_term;
102 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
103 ExecReg = AMDGPU::EXEC_LO;
105 MovExecOpc = AMDGPU::S_MOV_B64;
106 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
107 XorTermOpc = AMDGPU::S_XOR_B64_term;
108 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
109 ExecReg = AMDGPU::EXEC;
113 const int OrigRangeSize = std::distance(BeginIt, EndIt);
121 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
147 MBB.addSuccessor(LoopBB);
150 B.setInsertPt(*LoopBB, LoopBB->
end());
201 auto NewEnd = BodyBB->
end();
202 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
215 auto OldVal = WaterfalledRegMap.
find(OldReg);
216 if (OldVal != WaterfalledRegMap.
end()) {
217 Op.setReg(OldVal->second);
222 LLT OpTy = MRI.getType(OpReg);
225 assert(MRI.getRegBank(OpReg) == VgprRB);
226 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
231 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
233 unsigned NumParts = OpSize / PartSize;
239 CurrentLaneParts.
push_back(CurrentLaneReg);
241 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
242 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
243 for (
unsigned i = 0; i < NumParts; ++i) {
245 CurrentLaneParts.
push_back(UnmergeCurrLane.getReg(i));
249 for (
unsigned i = 0; i < NumParts; ++i) {
250 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
256 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
259 Op.setReg(CurrentLaneReg);
262 WaterfalledRegMap.
insert(std::pair(OldReg,
Op.getReg()));
268 MRI.createVirtualRegister({WaveRC,
LLT::scalar(IsWave32 ? 32 : 64)});
269 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
272 B.buildInstr(AndSaveExecOpc)
275 MRI.setSimpleHint(SavedExec, CondRegLM);
277 B.setInsertPt(*BodyBB, BodyBB->
end());
280 B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
286 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
290 B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
293 B.setInsertPt(*RestoreExecBB, RestoreExecBB->
begin());
294 B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
298 B.setInsertPt(*RemainderBB, RemainderBB->
begin());
303bool RegBankLegalizeHelper::splitLoad(MachineInstr &
MI,
305 MachineFunction &MF = B.getMF();
306 assert(
MI.getNumMemOperands() == 1);
307 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
309 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
311 LLT PtrTy = MRI.getType(
Base);
312 const RegisterBank *PtrRB = MRI.getRegBankOrNull(
Base);
316 unsigned ByteOffset = 0;
317 for (LLT PartTy : LLTBreakdown) {
319 if (ByteOffset == 0) {
320 BasePlusOffset =
Base;
322 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
326 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
327 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
328 LoadPartRegs.
push_back(LoadPart.getReg(0));
334 B.buildMergeLikeInstr(Dst, LoadPartRegs);
340 if (MRI.getType(
Reg) == MergeTy) {
343 auto Unmerge = B.buildUnmerge({DstRB, MergeTy},
Reg);
344 for (
unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
345 MergeTyParts.
push_back(Unmerge.getReg(i));
348 B.buildMergeLikeInstr(Dst, MergeTyParts);
350 MI.eraseFromParent();
354bool RegBankLegalizeHelper::widenLoad(MachineInstr &
MI, LLT WideTy,
356 MachineFunction &MF = B.getMF();
357 assert(
MI.getNumMemOperands() == 1);
358 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
360 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
363 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
364 auto WideLoad = B.buildLoad({DstRB, WideTy},
Base, *WideMMO);
367 B.buildTrunc(Dst, WideLoad);
370 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
372 LLT DstTy = MRI.getType(Dst);
374 for (
unsigned i = 0; i < NumElts; ++i) {
375 MergeTyParts.
push_back(Unmerge.getReg(i));
377 B.buildMergeLikeInstr(Dst, MergeTyParts);
379 MI.eraseFromParent();
383bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &
MI)
const {
386 MachineMemOperand &MMO =
MI.getMMO();
389 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
391 if (
MI.getOpcode() == G_LOAD) {
392 B.buildLoad(Dst, Ptr, *WideMMO);
394 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
396 if (
MI.getOpcode() == G_ZEXTLOAD) {
398 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
399 B.buildAnd(Dst, Load, MaskCst);
401 assert(
MI.getOpcode() == G_SEXTLOAD);
402 B.buildSExtInReg(Dst, Load, MemSize);
406 MI.eraseFromParent();
410bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &
MI) {
412 LLT Ty = MRI.getType(Dst);
414 unsigned Opc =
MI.getOpcode();
415 int TrueExtCst =
Opc == G_SEXT ? -1 : 1;
416 if (Ty == S32 || Ty == S16) {
417 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
418 auto False = B.buildConstant({VgprRB, Ty}, 0);
419 B.buildSelect(Dst, Src, True, False);
420 }
else if (Ty == S64) {
421 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
422 auto False = B.buildConstant({VgprRB_S32}, 0);
423 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
424 MachineInstrBuilder
Hi;
433 Hi = B.buildUndef({VgprRB_S32});
437 MF, MORE,
"amdgpu-regbanklegalize",
438 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported",
MI);
442 B.buildMergeValues(Dst, {
Lo.getReg(0),
Hi.getReg(0)});
445 MF, MORE,
"amdgpu-regbanklegalize",
446 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported",
MI);
450 MI.eraseFromParent();
454std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(
Register Reg) {
455 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
456 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
457 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
458 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
459 return {
Lo.getReg(0),
Hi.getReg(0)};
462std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(
Register Reg) {
463 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
464 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
465 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
466 return {
Lo.getReg(0),
Hi.getReg(0)};
// Treat \p Reg as a packed pair of 16-bit halves and return both halves as
// 32-bit values whose upper 16 bits are "any-extended" (unspecified).
469std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(
Register Reg) {
// Reinterpret the packed value as a single s32 in an SGPR.
470 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
// NOTE(review): the statement defining `Lo` (original line 471) is not
// visible in this excerpt; `Lo` is consumed by the return below — confirm
// against the full source.
// High half: logical shift right by 16 brings the top 16 bits down.
472 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
473 return {
Lo.getReg(0),
Hi.getReg(0)};
476std::pair<Register, Register>
477RegBankLegalizeHelper::unpackAExtTruncS16(
Register Reg) {
478 auto [Lo32, Hi32] = unpackAExt(
Reg);
479 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
480 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
483bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &
MI) {
485 switch (
MI.getOpcode()) {
486 case AMDGPU::G_SHL: {
487 auto [Val0, Val1] = unpackAExt(
MI.getOperand(1).getReg());
488 auto [Amt0, Amt1] = unpackAExt(
MI.getOperand(2).getReg());
489 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
490 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
493 case AMDGPU::G_LSHR: {
494 auto [Val0, Val1] = unpackZExt(
MI.getOperand(1).getReg());
495 auto [Amt0, Amt1] = unpackZExt(
MI.getOperand(2).getReg());
496 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
497 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
500 case AMDGPU::G_ASHR: {
501 auto [Val0, Val1] = unpackSExt(
MI.getOperand(1).getReg());
502 auto [Amt0, Amt1] = unpackSExt(
MI.getOperand(2).getReg());
503 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
504 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
509 MF, MORE,
"amdgpu-regbanklegalize",
510 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
514 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
515 MI.eraseFromParent();
519bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &
MI) {
521 switch (
MI.getOpcode()) {
523 case AMDGPU::G_SMAX: {
525 auto [Val0_Lo, Val0_Hi] = unpackSExt(
MI.getOperand(1).getReg());
526 auto [Val1_Lo, Val1_Hi] = unpackSExt(
MI.getOperand(2).getReg());
527 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
529 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
534 case AMDGPU::G_UMAX: {
536 auto [Val0_Lo, Val0_Hi] = unpackZExt(
MI.getOperand(1).getReg());
537 auto [Val1_Lo, Val1_Hi] = unpackZExt(
MI.getOperand(2).getReg());
538 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
540 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
546 MF, MORE,
"amdgpu-regbanklegalize",
547 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented",
MI);
550 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
551 MI.eraseFromParent();
555bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &
MI) {
556 auto [Op1Lo, Op1Hi] = unpackAExt(
MI.getOperand(1).getReg());
557 auto [Op2Lo, Op2Hi] = unpackAExt(
MI.getOperand(2).getReg());
558 auto ResLo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
559 auto ResHi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
560 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
561 {ResLo.getReg(0), ResHi.getReg(0)});
562 MI.eraseFromParent();
568 return (GI->is(Intrinsic::amdgcn_sbfe));
570 return MI.getOpcode() == AMDGPU::G_SBFX;
573bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &
MI) {
580 Register Src =
MI.getOperand(FirstOpnd).getReg();
581 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
582 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
587 unsigned SHROpc =
Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
588 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
596 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
597 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
598 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
599 MI.eraseFromParent();
603 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
604 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
605 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
606 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
607 auto Zero = B.buildConstant({VgprRB, S32}, 0);
608 unsigned BFXOpc =
Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
610 if (WidthImm <= 32) {
612 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo,
Zero, Width});
613 MachineInstrBuilder
Hi;
616 Hi = B.buildAShr(VgprRB_S32,
Lo, B.buildConstant(VgprRB_S32, 31));
621 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
623 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
625 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi,
Zero, Amt});
626 B.buildMergeLikeInstr(Dst, {SHRSrcLo,
Hi});
629 MI.eraseFromParent();
633bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &
MI) {
635 LLT Ty = MRI.getType(DstReg);
638 Register Src =
MI.getOperand(FirstOpnd).getReg();
639 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
640 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
647 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
648 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
649 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset,
Size);
650 unsigned Opc32 =
Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
651 unsigned Opc64 =
Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
652 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
656 auto S_BFE = B.buildInstr(
Opc, {{SgprRB, Ty}},
657 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
659 *ST.getRegisterInfo(), RBI);
661 B.buildCopy(DstReg,
S_BFE->getOperand(0).getReg());
662 MI.eraseFromParent();
666bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &
MI) {
668 LLT DstTy = MRI.getType(Dst);
669 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
670 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
671 auto Op1 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(1).
getReg());
672 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
673 unsigned Opc =
MI.getOpcode();
676 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)},
Flags);
678 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)},
Flags);
679 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
680 MI.eraseFromParent();
// Lower a 64-bit multiply on VGPRs by splitting both operands into 32-bit
// halves (schoolbook multiply):
//   result.lo = a.lo * b.lo
//   result.hi = umulh(a.lo, b.lo) + a.lo * b.hi + a.hi * b.lo
684bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &
MI) {
// NOTE(review): the definition of `Dst` (original line 685, presumably the
// def operand of MI) is not visible in this excerpt — confirm against the
// full source.
686 assert(MRI.getType(Dst) == S64);
// Unmerge each 64-bit source operand into a {lo, hi} pair of 32-bit VGPRs.
687 auto Op1 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(1).
getReg());
688 auto Op2 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(2).
getReg());
// Low 32 bits of the full product.
692 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
// Carry out of the lo*lo partial product into the upper half.
693 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
// Cross terms: only their low 32 bits contribute to result.hi (their own
// overflow would land in bits >= 64 and is discarded).
694 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
695 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
696 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
697 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
// Recombine the two halves into the 64-bit destination and delete the
// original instruction.
699 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
700 MI.eraseFromParent();
704bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &
MI) {
706 assert(MRI.getType(Dst) == V2S16);
707 unsigned Opc =
MI.getOpcode();
708 unsigned NumOps =
MI.getNumOperands();
711 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(
MI.getOperand(1).getReg());
714 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo},
Flags);
715 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi},
Flags);
716 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
717 MI.eraseFromParent();
721 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(
MI.getOperand(2).getReg());
724 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo},
Flags);
725 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi},
Flags);
726 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
727 MI.eraseFromParent();
732 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(
MI.getOperand(3).getReg());
733 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo},
Flags);
734 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi},
Flags);
735 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
736 MI.eraseFromParent();
740bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &
MI) {
747 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
750 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
751 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
752 if (ST.hasScalarMulHiInsts()) {
753 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
755 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
756 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
757 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
768 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
769 B.buildConstant(Dst1, 0);
772 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
773 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
774 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
776 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
778 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
779 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
780 B.buildCopy(Dst1, AddHi.getReg(1));
783 MI.eraseFromParent();
787bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &
MI) {
789 LLT DstTy = MRI.getType(Dst);
790 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
792 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
793 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
794 auto Op3 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(3).
getReg());
798 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(0), Op3.getReg(0), Flags);
800 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(1), Op3.getReg(1), Flags);
802 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
803 MI.eraseFromParent();
807bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &
MI) {
808 auto Op1 = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
809 int Amt =
MI.getOperand(2).getImm();
813 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
816 Lo = Freeze.getReg(0);
819 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
822 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
823 Hi = B.buildAShr(VgprRB_S32,
Lo, SignExtCst).getReg(0);
827 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
830 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(), {Lo, Hi});
831 MI.eraseFromParent();
835bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &
MI) {
841 unsigned Opc =
MI.getOpcode();
850 case AMDGPU::G_AMDGPU_FFBH_U32:
852 AddOpc = AMDGPU::G_UADDSAT;
853 SearchFromMSB =
true;
855 case AMDGPU::G_AMDGPU_FFBL_B32:
857 AddOpc = AMDGPU::G_UADDSAT;
858 SearchFromMSB =
false;
860 case AMDGPU::G_CTLZ_ZERO_UNDEF:
861 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
862 AddOpc = AMDGPU::G_ADD;
863 SearchFromMSB =
true;
865 case AMDGPU::G_CTTZ_ZERO_UNDEF:
866 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
867 AddOpc = AMDGPU::G_ADD;
868 SearchFromMSB =
false;
874 auto Unmerge = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
881 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Hi :
Lo});
883 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Lo :
Hi});
885 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
886 {Secondary, B.buildConstant(VgprRB_S32, 32)});
887 B.buildUMin(
MI.getOperand(0).getReg(), Primary, Adjusted);
889 MI.eraseFromParent();
893bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &
MI) {
905 LLT VecTy = MRI.getType(Src);
908 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
910 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
913 Register PrevSelect = Unmerge.getReg(0);
914 for (
unsigned I = 1;
I < NumElts; ++
I) {
915 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
918 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(
I), PrevSelect)
921 B.buildCopy(Dst, PrevSelect);
923 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
924 Register PrevLo = InitUnmerge.getReg(0);
925 Register PrevHi = InitUnmerge.getReg(1);
926 for (
unsigned I = 1;
I < NumElts; ++
I) {
927 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
929 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(
I));
930 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
932 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
935 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
938 MF, MORE,
"amdgpu-regbanklegalize",
939 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type",
MI);
943 MI.eraseFromParent();
// Lower an extract-vector-element with a 64-bit element type by bitcasting
// the source to a vector with twice as many 32-bit elements and extracting
// the two adjacent 32-bit halves at indices 2*Idx and 2*Idx+1.
947bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &
MI) {
// NOTE(review): the definitions of `Src`, `Idx`, `Dst` and `Vec32Ty`
// (elided original lines) are not visible in this excerpt — confirm against
// the full source.
960 LLT SrcTy = MRI.getType(Src);
963 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
964 "expected VGPR src and SGPR idx");
// Reinterpret the source as a vector of 32-bit elements.
966 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
// IdxLo = Idx * 2 (shift left by one), IdxHi = IdxLo + 1.
969 auto One = B.buildConstant(SgprRB_S32, 1);
970 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
971 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
// Extract the two 32-bit halves of the requested element.
973 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
974 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
// Reassemble them into the original (64-bit) destination.
976 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
978 MI.eraseFromParent();
982bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &
MI) {
995 LLT VecTy = MRI.getType(Src);
998 const RegisterBank *SrcRB = MRI.getRegBank(Src);
999 bool IsSGPR = (SrcRB == SgprRB);
1000 SmallVector<Register, 16> Selects;
1004 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
1005 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1006 Register EltLo = EltUnmerge.getReg(0);
1007 Register EltHi = EltUnmerge.getReg(1);
1008 for (
unsigned I = 0;
I < NumElts; ++
I) {
1009 auto IdxConst = B.buildConstant(VgprRB_S32,
I);
1012 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 *
I))
1015 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 *
I + 1))
1019 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1020 B.buildBitcast(Dst, Vec32);
1023 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1024 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1025 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1026 for (
unsigned I = 0;
I < NumElts; ++
I) {
1027 auto IdxConst = B.buildConstant(SgprRB_S32,
I);
1030 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(
I)).getReg(0));
1032 B.buildMergeLikeInstr(Dst, Selects);
1035 MF, MORE,
"amdgpu-regbanklegalize",
1036 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type",
MI);
1040 MI.eraseFromParent();
// Lower an insert-vector-element with a 64-bit element type by bitcasting
// the vector to one with twice as many 32-bit elements and inserting the
// element's two 32-bit halves at indices 2*Idx and 2*Idx+1.
1044bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &
MI) {
// NOTE(review): the definitions of `Src`, `Idx`, `Elt`, `Dst` and `Vec32Ty`
// (elided original lines) are not visible in this excerpt — confirm against
// the full source.
1059 LLT SrcTy = MRI.getType(Src);
1062 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1063 "expected VGPR src and SGPR idx");
1065 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
// Reinterpret the vector as 32-bit elements and split the element to insert
// into its two 32-bit halves.
1067 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1068 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
// IdxLo = Idx * 2 (shift left by one), IdxHi = IdxLo + 1.
1071 auto One = B.buildConstant(SgprRB_S32, 1);
1072 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1073 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
// Chain two 32-bit inserts: low half first, then high half into the result
// of the first insert.
1075 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1076 EltUnmerge.getReg(0), IdxLo);
1077 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1078 EltUnmerge.getReg(1), IdxHi);
// Cast back to the original vector type and delete the original MI.
1080 B.buildBitcast(Dst, InsHi);
1082 MI.eraseFromParent();
1086bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &
MI) {
1096 LLT Ty = MRI.getType(DstReg);
1102 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).
getReg(0);
1104 assert((Ty == S32 || Ty == S16) &&
"unexpected type for AbsToNegMax");
1105 Zero = B.buildConstant({VgprRB, Ty}, 0).
getReg(0);
1108 auto Neg = B.buildSub({VgprRB, Ty},
Zero, SrcReg);
1109 B.buildSMax(DstReg, SrcReg, Neg);
1110 MI.eraseFromParent();
1114bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &
MI) {
1124 auto Bitcast = B.buildBitcast({SgprRB_S32},
MI.getOperand(1).
getReg());
1125 auto SextInReg = B.buildSExtInReg({SgprRB_S32},
Bitcast, 16);
1127 B.buildAShr({SgprRB_S32},
Bitcast, B.buildConstant({SgprRB_S32}, 16));
1129 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1130 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1131 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
1132 {AbsLo.getReg(0), AbsHi.getReg(0)});
1134 MI.eraseFromParent();
1138bool RegBankLegalizeHelper::lower(MachineInstr &
MI,
1146 return lowerVccExtToSel(
MI);
1148 LLT Ty = MRI.getType(
MI.getOperand(0).getReg());
1149 auto True = B.buildConstant({SgprRB, Ty},
1150 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1151 auto False = B.buildConstant({SgprRB, Ty}, 0);
1155 B.buildSelect(
MI.getOperand(0).getReg(),
MI.getOperand(1).getReg(), True,
1157 MI.eraseFromParent();
1161 return lowerUnpackBitShift(
MI);
1163 return lowerUnpackMinMax(
MI);
1165 return lowerSplitTo16(
MI);
1167 const RegisterBank *RB = MRI.getRegBank(
MI.getOperand(0).getReg());
1168 MachineInstrBuilder
Hi;
1169 switch (
MI.getOpcode()) {
1170 case AMDGPU::G_ZEXT: {
1171 Hi = B.buildConstant({RB, S32}, 0);
1174 case AMDGPU::G_SEXT: {
1176 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1177 Hi = B.buildAShr({RB, S32},
MI.getOperand(1).
getReg(), ShiftAmt);
1180 case AMDGPU::G_ANYEXT: {
1181 Hi = B.buildUndef({RB, S32});
1186 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1191 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(),
1192 {MI.getOperand(1).getReg(), Hi});
1193 MI.eraseFromParent();
1197 uint64_t ConstVal =
MI.getOperand(1).getCImm()->getZExtValue();
1198 B.buildConstant(
MI.getOperand(0).getReg(), ConstVal);
1200 MI.eraseFromParent();
1205 LLT Ty = MRI.getType(Src);
1209 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1211 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1212 auto One = B.buildConstant(VgprRB_S32, 1);
1213 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1214 auto Zero = B.buildConstant(VgprRB_S32, 0);
1215 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1216 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1218 assert(Ty == S32 || Ty == S16);
1219 auto One = B.buildConstant({VgprRB, Ty}, 1);
1220 B.buildAnd(BoolSrc, Src, One);
1222 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1224 MI.eraseFromParent();
1228 return lowerV_BFE(
MI);
1230 return lowerS_BFE(
MI);
1232 return lowerUniMAD64(
MI);
1234 B.buildMul(
MI.getOperand(0),
MI.getOperand(1),
MI.getOperand(2));
1235 MI.eraseFromParent();
1239 auto Op1 = B.buildTrunc(VgprRB_S32,
MI.getOperand(1));
1240 auto Op2 = B.buildTrunc(VgprRB_S32,
MI.getOperand(2));
1241 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1243 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1244 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1245 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1247 B.buildInstr(NewOpc, {
MI.getOperand(0).getReg(), {SgprRB, S32}},
1249 MI.eraseFromParent();
1253 return lowerSplitTo32(
MI);
1255 return lowerSplitTo32Mul(
MI);
1257 return lowerSplitTo32Select(
MI);
1259 return lowerSplitTo32SExtInReg(
MI);
1261 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1272 if (
Size / 128 == 2)
1274 else if (
Size / 128 == 4)
1278 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1284 else if (DstTy == S96)
1285 splitLoad(
MI, {S64, S32}, S32);
1286 else if (DstTy == V3S32)
1287 splitLoad(
MI, {V2S32, S32}, S32);
1288 else if (DstTy == V6S16)
1289 splitLoad(
MI, {V4S16, V2S16}, V2S16);
1292 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1299 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1301 widenLoad(
MI, S128);
1302 else if (DstTy == V3S32)
1303 widenLoad(
MI, V4S32, S32);
1304 else if (DstTy == V6S16)
1305 widenLoad(
MI, V8S16, V2S16);
1308 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1315 return lowerUnpackAExt(
MI);
1320 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1326 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1328 B.setInstrAndDebugLoc(
MI);
1329 for (
unsigned i =
MI.getNumDefs(); i <
MI.getNumOperands(); ++i) {
1330 MachineOperand &
Op =
MI.getOperand(i);
1334 if (MRI.getRegBank(
Reg) != VgprRB) {
1335 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
1336 Op.setReg(
Copy.getReg(0));
1346 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1351 B.setInstrAndDebugLoc(
MI);
1354 B.buildUnmerge({SgprRB, V2S16}, Unmerge->
getSourceReg());
1355 for (
unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1356 auto [Dst0S32, Dst1S32] =
1357 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1358 B.buildTrunc(
MI.getOperand(i * 2).getReg(), Dst0S32);
1359 B.buildTrunc(
MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1362 auto [Dst0S32, Dst1S32] = unpackAExt(
MI.getOperand(2).getReg());
1363 B.buildTrunc(
MI.getOperand(0).getReg(), Dst0S32);
1364 B.buildTrunc(
MI.getOperand(1).getReg(), Dst1S32);
1367 MI.eraseFromParent();
1372 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1373 B.setInsertPt(*
MI.getParent(),
MI.getParent()->getFirstNonPHI());
1374 MI.getOperand(0).setReg(NewDst);
1375 B.buildTrunc(Dst, NewDst);
1377 for (
unsigned i = 1; i <
MI.getNumOperands(); i += 2) {
1385 auto NewUse = B.buildAnyExt(SgprRB_S32,
UseReg);
1386 MI.getOperand(i).setReg(NewUse.getReg(0));
1394 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1399 assert(MRI.getRegBankOrNull(
MI.getOperand(0).getReg()) == VgprRB);
1403 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1404 return RB == VgprRB || RB == SgprRB;
1409 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1414 unsigned RsrcIdx = RSrcIntrin->
RsrcArg +
MI.getNumExplicitDefs() + 1;
1415 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1421 unsigned RsrcIdx =
MI.getNumOperands();
1422 while (RsrcIdx-- >
MI.getNumExplicitDefs()) {
1423 const MachineOperand &
Op =
MI.getOperand(RsrcIdx);
1424 if (
Op.isReg() &&
Op.getReg().isVirtual())
1427 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1430 return lowerSplitBitCount64To32(
MI);
1432 return lowerExtrVecEltToSel(
MI);
1434 return lowerExtrVecEltTo32(
MI);
1436 return lowerInsVecEltToSel(
MI);
1438 return lowerInsVecEltTo32(
MI);
1440 return lowerAbsToNegMax(
MI);
1442 return lowerAbsToS32(
MI);
1446 if (!executeInWaterfallLoop(B, WFI))
1552 return isAnyPtr(Ty, 32) ? Ty : LLT();
1555 return isAnyPtr(Ty, 64) ? Ty : LLT();
1558 return isAnyPtr(Ty, 128) ? Ty : LLT();
1602 const SIRegisterInfo *
TRI =
1603 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1605 if (LLTSize >= 32 &&
TRI->getSGPRClassForBitWidth(LLTSize))
1610 const SIRegisterInfo *
TRI =
1611 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1717bool RegBankLegalizeHelper::applyMappingDst(
1718 MachineInstr &
MI,
unsigned &
OpIdx,
1719 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1724 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1726 LLT Ty = MRI.getType(
Reg);
1727 [[maybe_unused]]
const RegisterBank *RB = MRI.getRegBank(
Reg);
1729 switch (MethodIDs[
OpIdx]) {
1798 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1800 if (!MRI.use_empty(
Reg)) {
1802 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1803 B.buildTrunc(
Reg, CopyS32_Vcc);
1810 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1811 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1812 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1813 Op.setReg(NewVgprDstS16);
1814 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1816 B.buildTrunc(
Reg, NewSgprDstS32);
1827 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1828 Op.setReg(NewVgprDst);
1841 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1842 Op.setReg(NewVgprDst);
1850 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1852 if (!MRI.use_empty(
Reg))
1853 B.buildTrunc(
Reg, NewDst);
1860 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
1861 B.buildCopy(
Reg,
Op.getReg());
1866 MF, MORE,
"amdgpu-regbanklegalize",
1867 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for",
MI);
1872 MF, MORE,
"amdgpu-regbanklegalize",
1873 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported",
MI);
1881bool RegBankLegalizeHelper::applyMappingSrc(
1882 MachineInstr &
MI,
unsigned &
OpIdx,
1883 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1885 for (
unsigned i = 0; i < MethodIDs.
size(); ++
OpIdx, ++i) {
1886 if (MethodIDs[i] ==
None || MethodIDs[i] ==
IntrId || MethodIDs[i] ==
Imm)
1889 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1891 LLT Ty = MRI.getType(
Reg);
1892 const RegisterBank *RB = MRI.getRegBank(
Reg);
1894 switch (MethodIDs[i]) {
1897 assert(RB == VccRB || RB == SgprRB);
1899 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
1901 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1902 Op.setReg(CopyVcc_Scc.getReg(0));
1920 assert(Ty == getTyFromID(MethodIDs[i]));
1921 assert(RB == getRegBankFromID(MethodIDs[i]));
1935 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1936 assert(RB == getRegBankFromID(MethodIDs[i]));
1960 assert(Ty == getTyFromID(MethodIDs[i]));
1962 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
1963 Op.setReg(CopyToVgpr.getReg(0));
1979 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1981 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
1982 Op.setReg(CopyToVgpr.getReg(0));
1989 assert(Ty == getTyFromID(MethodIDs[i]));
1994 WFI.
End = std::next(
MI.getIterator());
2001 assert(Ty == getTyFromID(MethodIDs[i]));
2007 while (
Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2012 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2016 B.setInsertPt(*
MI.getParent(), Start);
2025 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2029 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2035 assert(Ty == getTyFromID(MethodIDs[i]));
2039 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2049 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2050 Op.setReg(Aext.getReg(0));
2057 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2060 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2061 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2062 Op.setReg(BoolInReg.getReg(0));
2068 auto Sext = B.buildSExt(SgprRB_S32,
Reg);
2069 Op.setReg(Sext.getReg(0));
2075 auto Zext = B.buildZExt({SgprRB, S32},
Reg);
2076 Op.setReg(Zext.getReg(0));
2082 auto Aext = B.buildAnyExt({VgprRB, S32},
Reg);
2083 Op.setReg(Aext.getReg(0));
2090 auto Sext = B.buildSExt({VgprRB, S32},
Reg);
2091 Op.setReg(Sext.getReg(0));
2098 auto Zext = B.buildZExt({VgprRB, S32},
Reg);
2099 Op.setReg(Zext.getReg(0));
2104 MF, MORE,
"amdgpu-regbanklegalize",
2105 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported",
MI);
2115 unsigned StartOpIdx,
2116 unsigned EndOpIdx) {
2117 for (
unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2124bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2125 MachineInstr &
MI,
unsigned RsrcIdx) {
2126 const unsigned NumDefs =
MI.getNumExplicitDefs();
2128 MachineBasicBlock *
MBB =
MI.getParent();
2132 for (
unsigned i = 0; i < NumDefs; ++i) {
2134 if (MRI.getRegBank(
Reg) == VgprRB)
2137 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(
Reg)});
2138 MI.getOperand(i).setReg(NewVgprDst);
2142 B.setInstrAndDebugLoc(
MI);
2145 for (
unsigned i = NumDefs; i < RsrcIdx; ++i) {
2146 MachineOperand &
Op =
MI.getOperand(i);
2154 if (MRI.getRegBank(
Reg) == VgprRB)
2157 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
2158 Op.setReg(
Copy.getReg(0));
2161 SmallSet<Register, 4> OpsToWaterfall;
2164 for (
unsigned i = RsrcIdx; i <
MI.getNumOperands(); ++i) {
2165 MachineOperand &
Op =
MI.getOperand(i);
2170 if (MRI.getRegBank(
Reg) != SgprRB)
2174 if (!OpsToWaterfall.
empty()) {
2176 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
const SmallVectorImpl< MachineOperand > & Cond
bool findRuleAndApplyMapping(MachineInstr &MI)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
bool isValid() const
Check for null.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
void push_back(const T &Elt)
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ SgprV4S32_ReadFirstLane
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
@ VerifyAllSgprOrVgprGPHI
@ AextToS32InIncomingBlockGPHI
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LoweringMethodID LoweringMethod
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs
MachineBasicBlock::iterator End