26#include "llvm/IR/IntrinsicsAMDGPU.h"
28#define DEBUG_TYPE "amdgpu-regbanklegalize"
36 : MF(B.getMF()), ST(MF.getSubtarget<
GCNSubtarget>()), B(B),
37 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
38 RBLRules(RBLRules), IsWave32(ST.isWave32()),
39 SgprRB(&RBI.getRegBank(
AMDGPU::SGPRRegBankID)),
40 VgprRB(&RBI.getRegBank(
AMDGPU::VGPRRegBankID)),
41 AgprRB(&RBI.getRegBank(
AMDGPU::AGPRRegBankID)),
42 VccRB(&RBI.getRegBank(
AMDGPU::VCCRegBankID)) {}
48 "No AMDGPU RegBankLegalize rules defined for opcode",
56 "AMDGPU RegBankLegalize: none of the rules defined with "
57 "'Any' for MI's opcode matched MI",
65 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
75 if (!lower(
MI, *Mapping, WFI))
84 "Waterfall range not initialized");
98 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
100 MovExecOpc = AMDGPU::S_MOV_B32;
101 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
102 XorTermOpc = AMDGPU::S_XOR_B32_term;
103 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
104 ExecReg = AMDGPU::EXEC_LO;
106 MovExecOpc = AMDGPU::S_MOV_B64;
107 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
108 XorTermOpc = AMDGPU::S_XOR_B64_term;
109 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
110 ExecReg = AMDGPU::EXEC;
114 const int OrigRangeSize = std::distance(BeginIt, EndIt);
122 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
148 MBB.addSuccessor(LoopBB);
151 B.setInsertPt(*LoopBB, LoopBB->
end());
202 auto NewEnd = BodyBB->
end();
203 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
216 auto OldVal = WaterfalledRegMap.
find(OldReg);
217 if (OldVal != WaterfalledRegMap.
end()) {
218 Op.setReg(OldVal->second);
223 LLT OpTy = MRI.getType(OpReg);
226 assert(MRI.getRegBank(OpReg) == VgprRB);
227 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
232 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
234 unsigned NumParts = OpSize / PartSize;
240 CurrentLaneParts.
push_back(CurrentLaneReg);
242 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
243 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
244 for (
unsigned i = 0; i < NumParts; ++i) {
246 CurrentLaneParts.
push_back(UnmergeCurrLane.getReg(i));
250 for (
unsigned i = 0; i < NumParts; ++i) {
251 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
257 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
260 Op.setReg(CurrentLaneReg);
263 WaterfalledRegMap.
insert(std::pair(OldReg,
Op.getReg()));
269 MRI.createVirtualRegister({WaveRC,
LLT::scalar(IsWave32 ? 32 : 64)});
270 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
273 B.buildInstr(AndSaveExecOpc)
276 MRI.setSimpleHint(SavedExec, CondRegLM);
278 B.setInsertPt(*BodyBB, BodyBB->
end());
281 B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
287 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
291 B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
294 B.setInsertPt(*RestoreExecBB, RestoreExecBB->
begin());
295 B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
299 B.setInsertPt(*RemainderBB, RemainderBB->
begin());
// Lowers the load MI into one load per type in LLTBreakdown, recombines the
// loaded parts into Dst and erases MI.
// NOTE(review): several original lines (including the remaining parameters)
// are absent from this listing; only the visible statements are described.
304bool RegBankLegalizeHelper::splitLoad(MachineInstr &
MI,
306 MachineFunction &MF = B.getMF();
307 assert(
MI.getNumMemOperands() == 1);
308 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
310 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
312 LLT PtrTy = MRI.getType(
Base);
313 const RegisterBank *PtrRB = MRI.getRegBankOrNull(
Base);
// One partial load per breakdown type; at offset 0 the address is Base itself,
// otherwise an offset constant is materialized in the pointer's bank.
317 unsigned ByteOffset = 0;
318 for (LLT PartTy : LLTBreakdown) {
320 if (ByteOffset == 0) {
321 BasePlusOffset =
Base;
323 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
// Each part gets its own memory operand derived from BaseMMO at ByteOffset.
327 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
328 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
329 LoadPartRegs.
push_back(LoadPart.getReg(0));
335 B.buildMergeLikeInstr(Dst, LoadPartRegs);
// Parts are compared against MergeTy; the unmerge below splits a part into
// MergeTy pieces (presumably for parts that are not MergeTy already - the
// branch structure is not visible here).
341 if (MRI.getType(
Reg) == MergeTy) {
344 auto Unmerge = B.buildUnmerge({DstRB, MergeTy},
Reg);
345 for (
unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
346 MergeTyParts.
push_back(Unmerge.getReg(i));
349 B.buildMergeLikeInstr(Dst, MergeTyParts);
351 MI.eraseFromParent();
// Replaces the load MI with a single load of WideTy (same base memory operand,
// offset 0) and produces Dst from it, then erases MI.
// NOTE(review): lines are missing from this listing; the conditions selecting
// between the trunc path and the unmerge/merge path are not visible.
355bool RegBankLegalizeHelper::widenLoad(MachineInstr &
MI, LLT WideTy,
357 MachineFunction &MF = B.getMF();
358 assert(
MI.getNumMemOperands() == 1);
359 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
361 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
364 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
365 auto WideLoad = B.buildLoad({DstRB, WideTy},
Base, *WideMMO);
// Path 1: Dst is a truncation of the wide value.
368 B.buildTrunc(Dst, WideLoad);
// Path 2: split the wide value into MergeTy pieces and merge the first
// NumElts of them into Dst.
371 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
373 LLT DstTy = MRI.getType(Dst);
375 for (
unsigned i = 0; i < NumElts; ++i) {
376 MergeTyParts.
push_back(Unmerge.getReg(i));
378 B.buildMergeLikeInstr(Dst, MergeTyParts);
380 MI.eraseFromParent();
// Rebuilds the load MI with a memory operand widened to S32 and erases MI.
// A plain G_LOAD is re-emitted directly into Dst; for the extending loads the
// value is loaded into an SGPR S32 and the extension is redone explicitly
// (AND with a mask for G_ZEXTLOAD, sign-extend-in-register for G_SEXTLOAD).
// NOTE(review): lines are missing from this listing (e.g. where Dst, Ptr,
// Mask and MemSize are defined).
384bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &
MI)
const {
387 MachineMemOperand &MMO =
MI.getMMO();
390 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
392 if (
MI.getOpcode() == G_LOAD) {
393 B.buildLoad(Dst, Ptr, *WideMMO);
395 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
397 if (
MI.getOpcode() == G_ZEXTLOAD) {
399 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
400 B.buildAnd(Dst, Load, MaskCst);
// The only remaining opcode accepted here is G_SEXTLOAD.
402 assert(
MI.getOpcode() == G_SEXTLOAD);
403 B.buildSExtInReg(Dst, Load, MemSize);
407 MI.eraseFromParent();
// Lowers an extension of Src into a select between two VGPR constants: the
// "true" constant is -1 for G_SEXT and 1 otherwise, the "false" constant is 0.
// S32/S16 results use a single select; S64 results select the low 32 bits and
// build the high half separately before merging. Unsupported opcodes or types
// are reported with the messages below. MI is erased at the end.
// NOTE(review): lines are missing from this listing; only the undef form of
// the high half is visible.
411bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &
MI) {
413 LLT Ty = MRI.getType(Dst);
415 unsigned Opc =
MI.getOpcode();
416 int TrueExtCst =
Opc == G_SEXT ? -1 : 1;
417 if (Ty == S32 || Ty == S16) {
418 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
419 auto False = B.buildConstant({VgprRB, Ty}, 0);
420 B.buildSelect(Dst, Src, True, False);
421 }
else if (Ty == S64) {
422 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
423 auto False = B.buildConstant({VgprRB_S32}, 0);
424 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
425 MachineInstrBuilder
Hi;
434 Hi = B.buildUndef({VgprRB_S32});
438 MF, MORE,
"amdgpu-regbanklegalize",
439 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported",
MI);
443 B.buildMergeValues(Dst, {
Lo.getReg(0),
Hi.getReg(0)});
446 MF, MORE,
"amdgpu-regbanklegalize",
447 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported",
MI);
451 MI.eraseFromParent();
// Unpacks Reg into two SGPR S32 values with zero-extended halves.
// Reg is bitcast to S32; Lo is the value masked with 0x0000ffff and Hi is the
// value logically shifted right by 16. Returns {Lo, Hi}.
455std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(
Register Reg) {
456 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
457 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
458 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
459 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
460 return {
Lo.getReg(0),
Hi.getReg(0)};
// Unpacks Reg into two SGPR S32 values with sign-extended halves.
// Reg is bitcast to S32; Lo is the value sign-extended in register from 16
// bits and Hi is the value arithmetically shifted right by 16.
// Returns {Lo, Hi}.
463std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(
Register Reg) {
464 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
465 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
466 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
467 return {
Lo.getReg(0),
Hi.getReg(0)};
// Unpacks Reg into two SGPR S32 values; Hi is the S32 bitcast of Reg
// logically shifted right by 16. Returns {Lo, Hi}.
// NOTE(review): the line defining Lo is missing from this listing, so how the
// low half is formed cannot be confirmed here.
470std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(
Register Reg) {
471 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
473 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
474 return {
Lo.getReg(0),
Hi.getReg(0)};
// Unpacks Reg with unpackAExt and truncates each resulting 32-bit half to an
// SGPR S16 value. Returns the pair {low S16, high S16}.
477std::pair<Register, Register>
478RegBankLegalizeHelper::unpackAExtTruncS16(
Register Reg) {
479 auto [Lo32, Hi32] = unpackAExt(
Reg);
480 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
481 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
// Lowers a packed shift by unpacking both the value (operand 1) and the shift
// amount (operand 2) into 32-bit SGPR halves, shifting each half separately
// and rebuilding operand 0 with a truncating build-vector. MI is then erased.
// The unpack flavour follows the opcode: unpackAExt for G_SHL, unpackZExt for
// G_LSHR, unpackSExt for G_ASHR. Other opcodes are reported as not implemented.
// NOTE(review): lines are missing from this listing (e.g. the declarations of
// Lo/Hi and the default label).
484bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &
MI) {
486 switch (
MI.getOpcode()) {
487 case AMDGPU::G_SHL: {
488 auto [Val0, Val1] = unpackAExt(
MI.getOperand(1).getReg());
489 auto [Amt0, Amt1] = unpackAExt(
MI.getOperand(2).getReg());
490 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
491 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
494 case AMDGPU::G_LSHR: {
495 auto [Val0, Val1] = unpackZExt(
MI.getOperand(1).getReg());
496 auto [Amt0, Amt1] = unpackZExt(
MI.getOperand(2).getReg());
497 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
498 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
501 case AMDGPU::G_ASHR: {
502 auto [Val0, Val1] = unpackSExt(
MI.getOperand(1).getReg());
503 auto [Amt0, Amt1] = unpackSExt(
MI.getOperand(2).getReg());
504 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
505 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
510 MF, MORE,
"amdgpu-regbanklegalize",
511 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
515 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
516 MI.eraseFromParent();
// Lowers a packed min/max by unpacking operands 1 and 2 into 32-bit SGPR
// halves, applying MI's opcode to each pair of halves and rebuilding operand 0
// with a truncating build-vector. MI is then erased.
// The G_SMAX group uses sign-extending unpack, the G_UMAX group uses
// zero-extending unpack; other opcodes are reported as not implemented.
// NOTE(review): lines are missing from this listing; additional case labels
// may precede G_SMAX / G_UMAX.
520bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &
MI) {
522 switch (
MI.getOpcode()) {
524 case AMDGPU::G_SMAX: {
526 auto [Val0_Lo, Val0_Hi] = unpackSExt(
MI.getOperand(1).getReg());
527 auto [Val1_Lo, Val1_Hi] = unpackSExt(
MI.getOperand(2).getReg());
528 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
530 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
535 case AMDGPU::G_UMAX: {
537 auto [Val0_Lo, Val0_Hi] = unpackZExt(
MI.getOperand(1).getReg());
538 auto [Val1_Lo, Val1_Hi] = unpackZExt(
MI.getOperand(2).getReg());
539 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
541 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
547 MF, MORE,
"amdgpu-regbanklegalize",
548 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented",
MI);
551 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
552 MI.eraseFromParent();
// Lowers a packed two-source operation: operands 1 and 2 are unpacked with
// unpackAExt, MI's opcode is applied to the low halves and to the high halves
// as SGPR S32 operations, and operand 0 is rebuilt from the two results with a
// truncating build-vector. MI is then erased.
556bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &
MI) {
557 auto [Op1Lo, Op1Hi] = unpackAExt(
MI.getOperand(1).getReg());
558 auto [Op2Lo, Op2Hi] = unpackAExt(
MI.getOperand(2).getReg());
559 auto ResLo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
560 auto ResHi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
561 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
562 {ResLo.getReg(0), ResHi.getReg(0)});
563 MI.eraseFromParent();
569 return (GI->is(Intrinsic::amdgcn_sbfe));
571 return MI.getOpcode() == AMDGPU::G_SBFX;
// Lowers a bitfield extract on a 64-bit VGPR value. Src is first shifted right
// by LSBit (arithmetic shift when Signed, logical otherwise). The field is
// then isolated in one of two ways:
//  - shift left by (64 - Width) and shift right again by the same amount, or
//  - when the width is a known constant (ConstWidth), 32-bit G_SBFX/G_UBFX on
//    the low or high half of the shifted value, merged back into Dst.
// MI is erased on each visible path.
// NOTE(review): lines are missing from this listing; the conditions choosing
// between the paths and the non-sign-extended form of Hi are not visible.
574bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &
MI) {
581 Register Src =
MI.getOperand(FirstOpnd).getReg();
582 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
583 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
588 unsigned SHROpc =
Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
589 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
// Shift pair: Amt = 64 - Width, then shl followed by the same right shift.
597 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
598 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
599 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
600 MI.eraseFromParent();
// Constant-width path: operate on the 32-bit halves of the shifted source.
604 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
605 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
606 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
607 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
608 auto Zero = B.buildConstant({VgprRB, S32}, 0);
609 unsigned BFXOpc =
Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
// Width fits in the low half: extract from the low half only.
611 if (WidthImm <= 32) {
613 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo,
Zero, Width});
614 MachineInstrBuilder
Hi;
617 Hi = B.buildAShr(VgprRB_S32,
Lo, B.buildConstant(VgprRB_S32, 31));
622 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
// Width exceeds 32: keep the low half and extract WidthImm - 32 bits from the
// high half.
624 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
626 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi,
Zero, Amt});
627 B.buildMergeLikeInstr(Dst, {SHRSrcLo,
Hi});
630 MI.eraseFromParent();
// Lowers a bitfield extract to a scalar S_BFE instruction. The second source
// operand is assembled as (LSBit & Mask) | (Width << 16); the opcode is chosen
// from S_BFE_{I,U}{32,64} by Signed and by whether Ty is S32. The S_BFE result
// is copied into DstReg and MI is erased.
// NOTE(review): lines are missing from this listing (e.g. the definition of
// Mask and the call that takes the register info and RBI arguments).
634bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &
MI) {
636 LLT Ty = MRI.getType(DstReg);
639 Register Src =
MI.getOperand(FirstOpnd).getReg();
640 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
641 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
648 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
649 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
650 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset,
Size);
651 unsigned Opc32 =
Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
652 unsigned Opc64 =
Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
653 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
657 auto S_BFE = B.buildInstr(
Opc, {{SgprRB, Ty}},
658 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
660 *ST.getRegisterInfo(), RBI);
662 B.buildCopy(DstReg,
S_BFE->getOperand(0).getReg());
663 MI.eraseFromParent();
// Splits a two-source VGPR operation whose result type is V4S16, V2S32 or S64
// into two half-width operations (V2S16 halves for V4S16, S32 halves
// otherwise). Operands 1 and 2 are unmerged, MI's opcode is applied to the
// matching halves with Flags, the results are merged into Dst and MI is erased.
// NOTE(review): lines are missing from this listing (e.g. the definitions of
// Dst, Flags, Lo and Hi).
667bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &
MI) {
669 LLT DstTy = MRI.getType(Dst);
670 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
671 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
672 auto Op1 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(1).
getReg());
673 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
674 unsigned Opc =
MI.getOpcode();
677 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)},
Flags);
679 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)},
Flags);
680 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
681 MI.eraseFromParent();
// Expands a 64-bit VGPR multiply (Dst must be S64) into 32-bit pieces:
//   Lo = lo(Op1) * lo(Op2)
//   Hi = (lo(Op1) * hi(Op2) + hi(Op1) * lo(Op2)) + umulh(lo(Op1), lo(Op2))
// The two halves are merged into Dst and MI is erased.
// NOTE(review): some lines are missing from this listing.
685bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &
MI) {
687 assert(MRI.getType(Dst) == S64);
688 auto Op1 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(1).
getReg());
689 auto Op2 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(2).
getReg());
693 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
// High 32 bits of the low-by-low product feed into the upper half.
694 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
695 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
696 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
697 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
698 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
700 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
701 MI.eraseFromParent();
// Splits a V2S16 SGPR operation into two S16 operations. Each source operand
// is unpacked with unpackAExtTruncS16, MI's opcode is applied to the low
// elements and to the high elements with Flags, the two S16 results are merged
// into Dst and MI is erased. Three groups are visible, for one, two and three
// source operands.
// NOTE(review): lines are missing from this listing; the conditions (presumably
// on NumOps) that select between the three groups are not visible.
705bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &
MI) {
707 assert(MRI.getType(Dst) == V2S16);
708 unsigned Opc =
MI.getOpcode();
709 unsigned NumOps =
MI.getNumOperands();
712 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(
MI.getOperand(1).getReg());
// One source operand.
715 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo},
Flags);
716 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi},
Flags);
717 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
718 MI.eraseFromParent();
// Two source operands.
722 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(
MI.getOperand(2).getReg());
725 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo},
Flags);
726 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi},
Flags);
727 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
728 MI.eraseFromParent();
// Three source operands.
733 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(
MI.getOperand(3).getReg());
734 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo},
Flags);
735 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi},
Flags);
736 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
737 MI.eraseFromParent();
// Expands a 64-bit multiply-add with SGPR results. The 64-bit product of Src0
// and Src1 is formed from a 32-bit multiply (low half) and an unsigned
// multiply-high (high half). The multiply-high is emitted directly on SGPRs
// when the subtarget reports hasScalarMulHiInsts(); otherwise the sources are
// copied to VGPRs and the multiply-high is built there. Two result forms are
// visible: the product merged into Dst0 with Dst1 set to 0, and the product
// added to Src2 with carry (uaddo / uadde), with the final carry copied to
// Dst1. MI is erased at the end.
// NOTE(review): lines are missing from this listing (operand extraction, the
// transfer of the VGPR multiply-high result into DstHi, and the condition that
// selects between the two result forms).
741bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &
MI) {
748 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
751 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
752 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
753 if (ST.hasScalarMulHiInsts()) {
754 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
756 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
757 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
758 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
// Form without an addend: result is the product, second result is 0.
769 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
770 B.buildConstant(Dst1, 0);
// Form with an addend: 64-bit add of Src2 done as two 32-bit adds with carry.
773 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
774 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
775 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
777 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
779 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
780 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
781 B.buildCopy(Dst1, AddHi.getReg(1));
784 MI.eraseFromParent();
// Splits a VGPR select into two half-width selects that share the condition
// Cond. Operands 2 and 3 are unmerged into halves (V2S16 halves when the
// result is V4S16, S32 halves otherwise), each pair of halves is selected with
// Flags, the results are merged into Dst and MI is erased.
// NOTE(review): lines are missing from this listing (the tail of the assert
// and the definitions of Dst, Cond, Flags, Lo and Hi).
788bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &
MI) {
790 LLT DstTy = MRI.getType(Dst);
791 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
793 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
794 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
795 auto Op3 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(3).
getReg());
799 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(0), Op3.getReg(0), Flags);
801 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(1), Op3.getReg(1), Flags);
803 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
804 MI.eraseFromParent();
// Splits a sign-extend-in-register on a VGPR value into 32-bit halves.
// Operand 1 is unmerged into S32 pieces and the low piece is frozen. Visible
// forms: Lo is either the frozen low piece itself or its sign extension from
// Amt bits; Hi is either Lo shifted arithmetically right by 31 or the high
// piece sign-extended from (Amt - 32) bits. Lo and Hi are merged into
// operand 0 and MI is erased.
// NOTE(review): lines are missing from this listing; the conditions on Amt
// that select between these forms are not visible.
808bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &
MI) {
809 auto Op1 = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
810 int Amt =
MI.getOperand(2).getImm();
814 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
817 Lo = Freeze.getReg(0);
820 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
823 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
824 Hi = B.buildAShr(VgprRB_S32,
Lo, SignExtCst).getReg(0);
828 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
831 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(), {Lo, Hi});
832 MI.eraseFromParent();
// Lowers a bit-count style operation on a 64-bit VGPR source to 32-bit
// operations. The opcode selects the 32-bit find-first-bit opcode (FFBOpc),
// the add opcode (G_UADDSAT or G_ADD) and the search direction. The source is
// unmerged into halves; the "primary" count is taken on the half searched
// first (Hi when searching from the MSB, Lo otherwise), the "secondary" count
// on the other half is increased by 32, and the result written to operand 0 is
// the unsigned minimum of the two. MI is then erased.
// NOTE(review): lines are missing from this listing (e.g. FFBOpc for the first
// two cases and the definitions of Lo, Hi and Secondary).
836bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &
MI) {
842 unsigned Opc =
MI.getOpcode();
851 case AMDGPU::G_AMDGPU_FFBH_U32:
853 AddOpc = AMDGPU::G_UADDSAT;
854 SearchFromMSB =
true;
856 case AMDGPU::G_AMDGPU_FFBL_B32:
858 AddOpc = AMDGPU::G_UADDSAT;
859 SearchFromMSB =
false;
861 case AMDGPU::G_CTLZ_ZERO_UNDEF:
862 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
863 AddOpc = AMDGPU::G_ADD;
864 SearchFromMSB =
true;
866 case AMDGPU::G_CTTZ_ZERO_UNDEF:
867 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
868 AddOpc = AMDGPU::G_ADD;
869 SearchFromMSB =
false;
875 auto Unmerge = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
882 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Hi :
Lo});
884 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Lo :
Hi});
// The secondary half sits 32 bit positions further along the search.
886 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
887 {Secondary, B.buildConstant(VgprRB_S32, 32)});
888 B.buildUMin(
MI.getOperand(0).getReg(), Primary, Adjusted);
890 MI.eraseFromParent();
// Lowers an extract-vector-element into a chain of selects. The source vector
// is unmerged into VGPR elements; starting from element 0, each index
// I in [1, NumElts) is materialized as an SGPR constant and a select keyed on
// Cmp picks element I over the running result. Two forms are visible: a
// whole-element chain copied into Dst, and a form where each element is split
// into two S32 halves that are selected separately and merged into Dst.
// Unsupported element types are reported with the message below. MI is erased.
// NOTE(review): lines are missing from this listing (operand extraction, how
// Cmp is built, and the conditions choosing between the forms).
894bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &
MI) {
906 LLT VecTy = MRI.getType(Src);
909 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
911 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
914 Register PrevSelect = Unmerge.getReg(0);
915 for (
unsigned I = 1;
I < NumElts; ++
I) {
916 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
919 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(
I), PrevSelect)
922 B.buildCopy(Dst, PrevSelect);
// Split form: run the select chain on the low and high 32-bit halves.
924 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
925 Register PrevLo = InitUnmerge.getReg(0);
926 Register PrevHi = InitUnmerge.getReg(1);
927 for (
unsigned I = 1;
I < NumElts; ++
I) {
928 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
930 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(
I));
931 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
933 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
936 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
939 MF, MORE,
"amdgpu-regbanklegalize",
940 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type",
MI);
944 MI.eraseFromParent();
// Lowers an extract-vector-element into two 32-bit extracts. The VGPR source
// vector is bitcast to Vec32Ty, the SGPR index is doubled (Idx << 1) to
// address the low 32-bit piece and incremented by one for the high piece, and
// the two extracted S32 values are merged into Dst. MI is then erased.
// NOTE(review): lines are missing from this listing (operand extraction and
// the definition of Vec32Ty).
948bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &
MI) {
961 LLT SrcTy = MRI.getType(Src);
964 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
965 "expected VGPR src and SGPR idx");
967 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
970 auto One = B.buildConstant(SgprRB_S32, 1);
971 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
972 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
974 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
975 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
977 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
979 MI.eraseFromParent();
// Lowers an insert-vector-element into per-element selects: for every index
// I in [0, NumElts) a select keyed on Cmp chooses between the new element Elt
// and the existing element I. Two forms are visible:
//  - a form working on S32 pieces (two per element), rebuilt with a
//    build-vector of Vec32Ty and bitcast to Dst;
//  - a whole-element form in the source's register bank, merged into Dst.
// Unsupported element types are reported with the message below. MI is erased.
// NOTE(review): lines are missing from this listing (operand extraction, how
// Cmp is built, where the selects are appended to Selects in the first form,
// and the conditions choosing between the forms).
983bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &
MI) {
996 LLT VecTy = MRI.getType(Src);
999 const RegisterBank *SrcRB = MRI.getRegBank(Src);
1000 bool IsSGPR = (SrcRB == SgprRB);
1001 SmallVector<Register, 16> Selects;
1005 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
1006 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1007 Register EltLo = EltUnmerge.getReg(0);
1008 Register EltHi = EltUnmerge.getReg(1);
1009 for (
unsigned I = 0;
I < NumElts; ++
I) {
1010 auto IdxConst = B.buildConstant(VgprRB_S32,
I);
// Element I occupies 32-bit pieces 2*I (low) and 2*I + 1 (high).
1013 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 *
I))
1016 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 *
I + 1))
1020 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1021 B.buildBitcast(Dst, Vec32);
// Whole-element form; the compare result type depends on the source bank.
1024 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1025 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1026 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1027 for (
unsigned I = 0;
I < NumElts; ++
I) {
1028 auto IdxConst = B.buildConstant(SgprRB_S32,
I);
1031 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(
I)).getReg(0));
1033 B.buildMergeLikeInstr(Dst, Selects);
1036 MF, MORE,
"amdgpu-regbanklegalize",
1037 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type",
MI);
1041 MI.eraseFromParent();
// Lowers an insert-vector-element into two 32-bit inserts. The VGPR source
// vector is bitcast to Vec32Ty and the element is unmerged into two S32
// pieces; the SGPR index is doubled (Idx << 1) for the low piece and
// incremented by one for the high piece. The low piece is inserted first, the
// high piece into that result, and the final vector is bitcast to Dst.
// MI is then erased.
// NOTE(review): lines are missing from this listing (operand extraction and
// the definition of Vec32Ty).
1045bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &
MI) {
1060 LLT SrcTy = MRI.getType(Src);
1063 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1064 "expected VGPR src and SGPR idx");
1066 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1068 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1069 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1072 auto One = B.buildConstant(SgprRB_S32, 1);
1073 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1074 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1076 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1077 EltUnmerge.getReg(0), IdxLo);
1078 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1079 EltUnmerge.getReg(1), IdxHi);
1081 B.buildBitcast(Dst, InsHi);
1083 MI.eraseFromParent();
// Lowers an absolute value on VGPRs to smax(Src, 0 - Src). The zero operand is
// either a two-element build-vector of Zero16 or, for S32/S16, a scalar
// constant. MI is erased afterwards.
// NOTE(review): lines are missing from this listing (operand extraction, the
// definition of Zero16 and the condition selecting the vector form).
1087bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &
MI) {
1097 LLT Ty = MRI.getType(DstReg);
1103 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).
getReg(0);
1105 assert((Ty == S32 || Ty == S16) &&
"unexpected type for AbsToNegMax");
1106 Zero = B.buildConstant({VgprRB, Ty}, 0).
getReg(0);
1109 auto Neg = B.buildSub({VgprRB, Ty},
Zero, SrcReg);
1110 B.buildSMax(DstReg, SrcReg, Neg);
1111 MI.eraseFromParent();
// Lowers a packed absolute value on SGPRs to two 32-bit G_ABS operations.
// Operand 1 is bitcast to S32; the low element is sign-extended in register
// from 16 bits and the high element is obtained with an arithmetic shift right
// by 16. G_ABS is applied to each, and operand 0 is rebuilt from the two
// results with a truncating build-vector. MI is then erased.
// NOTE(review): the line that names the shift result (ShiftHi) is missing
// from this listing.
1115bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &
MI) {
1125 auto Bitcast = B.buildBitcast({SgprRB_S32},
MI.getOperand(1).
getReg());
1126 auto SextInReg = B.buildSExtInReg({SgprRB_S32},
Bitcast, 16);
1128 B.buildAShr({SgprRB_S32},
Bitcast, B.buildConstant({SgprRB_S32}, 16));
1130 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1131 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1132 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
1133 {AbsLo.getReg(0), AbsHi.getReg(0)});
1135 MI.eraseFromParent();
// Carries out the lowering selected for MI. Many cases simply forward to a
// dedicated lower* helper of this class; the rest are expanded inline below.
// NOTE(review): the switch statement and its case labels, the remaining
// parameters, and many other lines are absent from this listing, so the inline
// comments describe only what the visible statements do.
1139bool RegBankLegalizeHelper::lower(MachineInstr &
MI,
1147 return lowerVccExtToSel(
MI);
// Extension expanded as an SGPR select between constants: "true" is -1 for
// G_SEXT and 1 otherwise, "false" is 0.
1149 LLT Ty = MRI.getType(
MI.getOperand(0).getReg());
1150 auto True = B.buildConstant({SgprRB, Ty},
1151 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1152 auto False = B.buildConstant({SgprRB, Ty}, 0);
1156 B.buildSelect(
MI.getOperand(0).getReg(),
MI.getOperand(1).getReg(), True,
1158 MI.eraseFromParent();
1162 return lowerUnpackBitShift(
MI);
1164 return lowerUnpackMinMax(
MI);
1166 return lowerSplitTo16(
MI);
// Extension built as merge(src, Hi) in the destination's bank: Hi is 0 for
// G_ZEXT, the source shifted arithmetically right by 31 for G_SEXT, and undef
// for G_ANYEXT.
1168 const RegisterBank *RB = MRI.getRegBank(
MI.getOperand(0).getReg());
1169 MachineInstrBuilder
Hi;
1170 switch (
MI.getOpcode()) {
1171 case AMDGPU::G_ZEXT: {
1172 Hi = B.buildConstant({RB, S32}, 0);
1175 case AMDGPU::G_SEXT: {
1177 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1178 Hi = B.buildAShr({RB, S32},
MI.getOperand(1).
getReg(), ShiftAmt);
1181 case AMDGPU::G_ANYEXT: {
1182 Hi = B.buildUndef({RB, S32});
1187 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1192 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(),
1193 {MI.getOperand(1).getReg(), Hi});
1194 MI.eraseFromParent();
// Re-emit the constant from its zero-extended value.
1198 uint64_t ConstVal =
MI.getOperand(1).getCImm()->getZExtValue();
1199 B.buildConstant(
MI.getOperand(0).getReg(), ConstVal);
1201 MI.eraseFromParent();
// Build BoolSrc by masking Src with 1. For the split form the low half is
// ANDed with 1 and the high half with 0.
1206 LLT Ty = MRI.getType(Src);
1210 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1212 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1213 auto One = B.buildConstant(VgprRB_S32, 1);
1214 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1215 auto Zero = B.buildConstant(VgprRB_S32, 0);
1216 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1217 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1219 assert(Ty == S32 || Ty == S16);
1220 auto One = B.buildConstant({VgprRB, Ty}, 1);
1221 B.buildAnd(BoolSrc, Src, One);
1223 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1225 MI.eraseFromParent();
1229 return lowerV_BFE(
MI);
1231 return lowerS_BFE(
MI);
1233 return lowerUniMAD64(
MI);
// Replace MI with a generic multiply of the same operands.
1235 B.buildMul(
MI.getOperand(0),
MI.getOperand(1),
MI.getOperand(2));
1236 MI.eraseFromParent();
// Rewrite the S_MUL_{U64_U32,I64_I32} pseudo as the matching MAD pseudo on
// truncated 32-bit VGPR operands; a 64-bit zero constant is built alongside.
1240 auto Op1 = B.buildTrunc(VgprRB_S32,
MI.getOperand(1));
1241 auto Op2 = B.buildTrunc(VgprRB_S32,
MI.getOperand(2));
1242 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1244 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1245 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1246 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1248 B.buildInstr(NewOpc, {
MI.getOperand(0).getReg(), {SgprRB, S32}},
1250 MI.eraseFromParent();
1254 return lowerSplitTo32(
MI);
1256 return lowerSplitTo32Mul(
MI);
1258 return lowerSplitTo32Select(
MI);
1260 return lowerSplitTo32SExtInReg(
MI);
// Split-load dispatch: the breakdown passed to splitLoad depends on the
// destination type.
1262 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1273 if (
Size / 128 == 2)
1275 else if (
Size / 128 == 4)
1279 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1285 else if (DstTy == S96)
1286 splitLoad(
MI, {S64, S32}, S32);
1287 else if (DstTy == V3S32)
1288 splitLoad(
MI, {V2S32, S32}, S32);
1289 else if (DstTy == V6S16)
1290 splitLoad(
MI, {V4S16, V2S16}, V2S16);
1293 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
// Widen-load dispatch: the wide type passed to widenLoad depends on the
// destination type.
1300 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1302 widenLoad(
MI, S128);
1303 else if (DstTy == V3S32)
1304 widenLoad(
MI, V4S32, S32);
1305 else if (DstTy == V6S16)
1306 widenLoad(
MI, V8S16, V2S16);
1309 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1316 return lowerUnpackAExt(
MI);
1321 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1327 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
// Walk the operands after the defs and replace any register that is not in
// the VGPR bank with a VGPR copy of it.
1329 B.setInstrAndDebugLoc(
MI);
1330 for (
unsigned i =
MI.getNumDefs(); i <
MI.getNumOperands(); ++i) {
1331 MachineOperand &
Op =
MI.getOperand(i);
1335 if (MRI.getRegBank(
Reg) != VgprRB) {
1336 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
1337 Op.setReg(
Copy.getReg(0));
1347 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
// Unmerge expansion: the source is unpacked (via V2S16 pieces in the first
// form, directly in the second) and each 32-bit half is truncated into the
// corresponding result operand of MI.
1352 B.setInstrAndDebugLoc(
MI);
1355 B.buildUnmerge({SgprRB, V2S16}, Unmerge->
getSourceReg());
1356 for (
unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1357 auto [Dst0S32, Dst1S32] =
1358 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1359 B.buildTrunc(
MI.getOperand(i * 2).getReg(), Dst0S32);
1360 B.buildTrunc(
MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1363 auto [Dst0S32, Dst1S32] = unpackAExt(
MI.getOperand(2).getReg());
1364 B.buildTrunc(
MI.getOperand(0).getReg(), Dst0S32);
1365 B.buildTrunc(
MI.getOperand(1).getReg(), Dst1S32);
1368 MI.eraseFromParent();
// Give MI a new SGPR S32 result and truncate it back into the original Dst at
// the first non-PHI position of the block; the register operands visited by
// the loop below (every second operand starting at 1) are replaced with
// any-extended S32 values.
1373 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1374 B.setInsertPt(*
MI.getParent(),
MI.getParent()->getFirstNonPHI());
1375 MI.getOperand(0).setReg(NewDst);
1376 B.buildTrunc(Dst, NewDst);
1378 for (
unsigned i = 1; i <
MI.getNumOperands(); i += 2) {
1386 auto NewUse = B.buildAnyExt(SgprRB_S32,
UseReg);
1387 MI.getOperand(i).setReg(NewUse.getReg(0));
1395 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1400 assert(MRI.getRegBankOrNull(
MI.getOperand(0).getReg()) == VgprRB);
1404 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1405 return RB == VgprRB || RB == SgprRB;
// Resource-operand handling: the resource index is either derived from the
// intrinsic's RsrcArg or found by scanning the operands from the end for a
// virtual register, and is then passed to applyRegisterBanksVgprWithSgprRsrc.
1410 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1415 unsigned RsrcIdx = RSrcIntrin->
RsrcArg +
MI.getNumExplicitDefs() + 1;
1416 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1422 unsigned RsrcIdx =
MI.getNumOperands();
1423 while (RsrcIdx-- >
MI.getNumExplicitDefs()) {
1424 const MachineOperand &
Op =
MI.getOperand(RsrcIdx);
1425 if (
Op.isReg() &&
Op.getReg().isVirtual())
1428 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1431 return lowerSplitBitCount64To32(
MI);
1433 return lowerExtrVecEltToSel(
MI);
1435 return lowerExtrVecEltTo32(
MI);
1437 return lowerInsVecEltToSel(
MI);
1439 return lowerInsVecEltTo32(
MI);
1441 return lowerAbsToNegMax(
MI);
1443 return lowerAbsToS32(
MI);
1447 if (!executeInWaterfallLoop(B, WFI))
1553 return isAnyPtr(Ty, 32) ? Ty : LLT();
1556 return isAnyPtr(Ty, 64) ? Ty : LLT();
1559 return isAnyPtr(Ty, 128) ? Ty : LLT();
1603 const SIRegisterInfo *
TRI =
1604 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1606 if (LLTSize >= 32 &&
TRI->getSGPRClassForBitWidth(LLTSize))
1611 const SIRegisterInfo *
TRI =
1612 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
// Applies the mapping selected by MethodIDs[OpIdx] to the destination operand
// of MI at OpIdx. The visible cases replace the def with a freshly created
// virtual register in another bank and/or type and emit the instruction
// (copy, trunc, ...) that produces the original register from it. Missing or
// unsupported IDs are reported with the messages below.
// NOTE(review): the case labels, the loop over destination operands and many
// other lines are absent from this listing; OpIdx is taken by reference, so
// the caller presumably observes how far it was advanced - confirm against
// the full source.
1721bool RegBankLegalizeHelper::applyMappingDst(
1722 MachineInstr &
MI,
unsigned &
OpIdx,
1723 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1728 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1730 LLT Ty = MRI.getType(
Reg);
1731 [[maybe_unused]]
const RegisterBank *RB = MRI.getRegBank(
Reg);
1733 switch (MethodIDs[
OpIdx]) {
// Def moved to a new AGPR register; the original is a copy of it, emitted
// only when the original has non-debug uses.
1805 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
1806 Op.setReg(NewAgprDst);
1807 if (!MRI.use_nodbg_empty(
Reg))
1808 B.buildCopy(
Reg, NewAgprDst);
// Def moved to a new VCC S1 register; when the original has uses it is
// rebuilt via G_AMDGPU_COPY_SCC_VCC into an SGPR S32 followed by a trunc.
1815 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1817 if (!MRI.use_empty(
Reg)) {
1819 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1820 B.buildTrunc(
Reg, CopyS32_Vcc);
// Def moved to a new VGPR S16 register, any-extended to a VGPR S32; the
// original is a trunc of an SGPR S32 (how the SGPR S32 is produced is on a
// line not shown here).
1827 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1828 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1829 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1830 Op.setReg(NewVgprDstS16);
1831 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1833 B.buildTrunc(
Reg, NewSgprDstS32);
1844 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1845 Op.setReg(NewVgprDst);
1858 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1859 Op.setReg(NewVgprDst);
// New SGPR S32 def; the original register is a trunc of it when it has uses.
1867 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1869 if (!MRI.use_empty(
Reg))
1870 B.buildTrunc(
Reg, NewDst);
// Def moved to a new SGPR register of the same type; the original is a copy.
1877 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
1878 B.buildCopy(
Reg,
Op.getReg());
1883 MF, MORE,
"amdgpu-regbanklegalize",
1884 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for",
MI);
1889 MF, MORE,
"amdgpu-regbanklegalize",
1890 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported",
MI);
1898bool RegBankLegalizeHelper::applyMappingSrc(
1899 MachineInstr &
MI,
unsigned &
OpIdx,
1900 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1902 for (
unsigned i = 0; i < MethodIDs.
size(); ++
OpIdx, ++i) {
1903 if (MethodIDs[i] ==
None || MethodIDs[i] ==
IntrId || MethodIDs[i] ==
Imm)
1906 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1908 LLT Ty = MRI.getType(
Reg);
1909 const RegisterBank *RB = MRI.getRegBank(
Reg);
1911 switch (MethodIDs[i]) {
1914 assert(RB == VccRB || RB == SgprRB);
1916 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
1918 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1919 Op.setReg(CopyVcc_Scc.getReg(0));
1937 assert(Ty == getTyFromID(MethodIDs[i]));
1938 assert(RB == getRegBankFromID(MethodIDs[i]));
1952 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1953 assert(RB == getRegBankFromID(MethodIDs[i]));
1977 assert(Ty == getTyFromID(MethodIDs[i]));
1979 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
1980 Op.setReg(CopyToVgpr.getReg(0));
1996 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1998 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
1999 Op.setReg(CopyToVgpr.getReg(0));
2005 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2006 Op.setReg(CopyToVgpr.getReg(0));
2012 auto CopyToAgpr = B.buildCopy({AgprRB, Ty},
Reg);
2013 Op.setReg(CopyToAgpr.getReg(0));
2020 assert(Ty == getTyFromID(MethodIDs[i]));
2025 WFI.
End = std::next(
MI.getIterator());
2032 assert(Ty == getTyFromID(MethodIDs[i]));
2038 while (
Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2043 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2047 B.setInsertPt(*
MI.getParent(), Start);
2056 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2060 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2066 assert(Ty == getTyFromID(MethodIDs[i]));
2070 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2080 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2081 Op.setReg(Aext.getReg(0));
2088 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2091 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2092 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2093 Op.setReg(BoolInReg.getReg(0));
2099 auto Sext = B.buildSExt(SgprRB_S32,
Reg);
2100 Op.setReg(Sext.getReg(0));
2106 auto Zext = B.buildZExt({SgprRB, S32},
Reg);
2107 Op.setReg(Zext.getReg(0));
2113 auto Aext = B.buildAnyExt({VgprRB, S32},
Reg);
2114 Op.setReg(Aext.getReg(0));
2121 auto Sext = B.buildSExt({VgprRB, S32},
Reg);
2122 Op.setReg(Sext.getReg(0));
2129 auto Zext = B.buildZExt({VgprRB, S32},
Reg);
2130 Op.setReg(Zext.getReg(0));
2135 MF, MORE,
"amdgpu-regbanklegalize",
2136 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported",
MI);
2146 unsigned StartOpIdx,
2147 unsigned EndOpIdx) {
2148 for (
unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2155bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2156 MachineInstr &
MI,
unsigned RsrcIdx) {
2157 const unsigned NumDefs =
MI.getNumExplicitDefs();
2159 MachineBasicBlock *
MBB =
MI.getParent();
2163 for (
unsigned i = 0; i < NumDefs; ++i) {
2165 if (MRI.getRegBank(
Reg) == VgprRB)
2168 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(
Reg)});
2169 MI.getOperand(i).setReg(NewVgprDst);
2173 B.setInstrAndDebugLoc(
MI);
2176 for (
unsigned i = NumDefs; i < RsrcIdx; ++i) {
2177 MachineOperand &
Op =
MI.getOperand(i);
2185 if (MRI.getRegBank(
Reg) == VgprRB)
2188 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
2189 Op.setReg(
Copy.getReg(0));
2192 SmallSet<Register, 4> OpsToWaterfall;
2195 for (
unsigned i = RsrcIdx; i <
MI.getNumOperands(); ++i) {
2196 MachineOperand &
Op =
MI.getOperand(i);
2201 if (MRI.getRegBank(
Reg) != SgprRB)
2205 if (!OpsToWaterfall.
empty()) {
2207 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
const SmallVectorImpl< MachineOperand > & Cond
bool findRuleAndApplyMapping(MachineInstr &MI)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
bool isValid() const
Check for null.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
void push_back(const T &Elt)
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ SgprV4S32_ReadFirstLane
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
@ VerifyAllSgprOrVgprGPHI
@ AextToS32InIncomingBlockGPHI
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LoweringMethodID LoweringMethod
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs
MachineBasicBlock::iterator End