#define DEBUG_TYPE "si-fold-operands"
struct FoldCandidate {

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false, int ShrinkOp = -1)
      : UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
        Kind(FoldOp->getType()), Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
  bool isCommuted() const { return Commuted; }

  bool needsShrink() const { return ShrinkOpcode != -1; }

  int getShrinkOpcode() const { return ShrinkOpcode; }
  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  StringRef getPassName() const override { return "SI Fold Operands"; }
INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)
char SIFoldOperands::ID = 0;
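// Maps a MAC/FMAC opcode to the corresponding MAD/FMA opcode, so that the
// tied src2 operand can later be untied and an immediate folded into it.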
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
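// Checks whether a frame-index operand may be folded directly into the vaddr
// or saddr address operand of a scratch/MUBUF-style memory access.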
  if (!OpToFold.isFI())
    return false;

                                            AMDGPU::OpName::vaddr);

                                        AMDGPU::OpName::saddr);

                                        AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
  return new SIFoldOperands();
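// Fragments of the operand-rewriting logic: once a fold has been decided on,
// the use operand is updated in place. Packed-f16 immediates are checked
// against op_sel, shrinkable VOP3 instructions are rewritten to their 32-bit
// encoding, MFMA descriptors are swapped, and immediate, frame-index and
// global-address folds each take their own path.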
        (!ST.hasDOTOpSelHazard() ||

        ST.hasInv2PiInlineImm())) {

      unsigned Opcode = MI->getOpcode();
      int OpNo = MI->getOperandNo(&Old);

        ModIdx = AMDGPU::OpName::src0_modifiers;

        ModIdx = AMDGPU::OpName::src1_modifiers;

        ModIdx = AMDGPU::OpName::src2_modifiers;

      unsigned Val = Mod.getImm();

      switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {

        if (!(Fold.ImmToFold & 0xffff)) {

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {

    int Op32 = Fold.getShrinkOpcode();

    if (HaveNonDbgCarryUse) {

    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));

    if (Fold.isCommuted())
      TII.commuteInstruction(*Inst32, false);

  assert(!Fold.needsShrink() && "not handled");

  if (NewMFMAOpc == -1)

    MI->setDesc(TII.get(NewMFMAOpc));
    MI->untieRegOperand(0);

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
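// Bookkeeping for pending folds: a use already present in FoldList is not
// queued twice, and new candidates are appended with a debug trace noting
// whether the fold requires the instruction to be commuted.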
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)

  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)

  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n " << *MI);
  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
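// When the operand is not legal in place, several fallbacks are tried before
// giving up: converting MAC/FMAC to its MAD/FMA form, switching S_SETREG_B32
// to its IMM32 variant for immediate folds, or commuting the instruction
// (with special handling for V_ADD/SUB/SUBREV_CO_U32) so the fold becomes
// legal.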
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {

    unsigned Opc = MI->getOpcode();

    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {

      MI->setDesc(TII->get(NewOpc));

      MI->untieRegOperand(OpNo);

      MI->setDesc(TII->get(Opc));

    if (OpToFold->isImm()) {

      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;

        MI->setDesc(TII->get(ImmOpc));

    unsigned CommuteOpNo = OpNo;

    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

      if (CommuteIdx0 == OpNo)
        CommuteOpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        CommuteOpNo = CommuteIdx0;

    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))

        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))

    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc == AMDGPU::V_ADD_CO_U32_e64 ||
           Opc == AMDGPU::V_SUB_CO_U32_e64 ||
           Opc == AMDGPU::V_SUBREV_CO_U32_e64) &&

        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;

        if (!OtherOp.isReg() ||

        unsigned MaybeCommutedOpc = MI->getOpcode();

      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);

  if (TII->isSALU(MI->getOpcode())) {

    if (TII->isLiteralConstantLike(*OpToFold, OpInfo)) {

        !TII->isInlineConstant(*OpToFold, OpInfo)) {

        auto &Op = MI->getOperand(i);

            TII->isLiteralConstantLike(Op, OpInfo)) {
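// The V_MOV cases below recognize plain move opcodes; the fragments after
// them walk a REG_SEQUENCE definition and collect its (sub-register, value)
// initializers, likely feeding the inline-constant folding into AGPR/MAI
// operands that follows.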
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64:

  if (!Def || !Def->isRegSequence())

  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {

    if (TII->isInlineConstant(*Op, OpTy))

    if (!Op->isReg() || Op->getReg().isPhysical())

    Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());

  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {

  if (!OpToFold.isReg())

  if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {

  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {

    auto SubImm = Op->getImm();

    if (!TII->isInlineConstant(*Op, OpTy) ||
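// foldOperand(): the central per-use folding routine. It recurses through
// REG_SEQUENCE users, folds frame indexes into scratch accesses, propagates
// through SGPR-to-VGPR copies, rewrites AGPR copies with V_ACCVGPR_WRITE, and
// handles immediate folds into readfirstlane/readlane.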
void SIFoldOperands::foldOperand(

      if (RSUse.getSubReg() != RegSeqDstSubReg)

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,

    if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
        MFI->getScratchRSrcReg())

        *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);

        AMDGPU::OpName::vaddr) != -1 &&
        AMDGPU::OpName::saddr) == -1) {

  bool FoldingImmLike =

    if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {

        if (Use.isImplicit())

                             Use.getParent()->getOperandNo(&Use),

      for (auto &F : CopyUses) {
        foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);

    if (DestRC == &AMDGPU::AGPR_32RegClass &&

      CopiesToReplace.push_back(UseMI);

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)

      while (ImpOpI != ImpOpE) {

      CopiesToReplace.push_back(UseMI);

      CopiesToReplace.push_back(UseMI);

      for (unsigned I = 0; I < Size / 4; ++I) {

          int64_t Imm = Def->getImm();

                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
        } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {

          Def->setIsKill(false);
          if (!SeenAGPRs.insert(Src)) {

          Def->setIsKill(false);

          if (TRI->isSGPRReg(*MRI, Src.Reg)) {

        if (CopyToVGPR.Reg) {

          if (VGPRCopies.count(CopyToVGPR)) {
            Vgpr = VGPRCopies[CopyToVGPR];

            VGPRCopies[CopyToVGPR] = Vgpr;

                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);

        B.addImm(Defs[I].second);

  } else if (ST->hasGFX90AInsts() &&
  if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
      (UseOpc == AMDGPU::V_READLANE_B32 &&

    if (FoldingImmLike) {

      if (OpToFold.isImm())

  if (!FoldingImmLike) {
    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {

      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {

        RC = TRI->getCompatibleSubRegClass(RC, SubRC, SubReg);

      if (!RC || !TRI->isProperlyAlignedRC(*RC))

      Imm = Imm.getLoBits(32);

      Imm = Imm.getHiBits(32);
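// Constant evaluation for the simple bitwise and shift opcodes, used when
// both source operands of an instruction have become known immediates.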
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:

  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:

  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:

  case AMDGPU::S_XNOR_B32:

  case AMDGPU::S_NAND_B32:

  case AMDGPU::S_NOR_B32:

  case AMDGPU::S_ANDN2_B32:

  case AMDGPU::S_ORN2_B32:

  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    Result = LHS << (RHS & 31);

  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);

  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);

  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);

  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);

  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);

  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
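// Constant folding proper: look through move-immediate definitions, trim an
// instruction's operand list when it is rewritten into a plain move, and
// simplify identities such as or/and/xor with 0 or -1 and V_NOT of an
// immediate.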
  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);

  MI.setDesc(NewDesc);

  if (Op.getSubReg() != AMDGPU::NoSubRegister || !Op.getReg().isVirtual())

  if (Def && Def->isMoveImmediate()) {

  unsigned Opc = MI->getOpcode();

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&

    MI->getOperand(1).ChangeToImmediate(~Src0->getImm());

    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);

  if (!MI->isCommutable())

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {

      MI->removeOperand(Src1Idx);

    } else if (Src1Val == -1) {

      MI->removeOperand(Src1Idx);

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {

      MI->removeOperand(Src0Idx);

    } else if (Src1Val == -1) {

      MI->removeOperand(Src1Idx);

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {

      MI->removeOperand(Src1Idx);
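// tryFoldCndMask(): a V_CNDMASK whose two sources are identical immediates
// ignores the condition entirely, so it is rewritten into a plain move and
// the condition and modifier operands are dropped.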
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)

  if (!Src1Imm->isIdenticalTo(*Src0Imm))

  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))

  MI.removeOperand(Src2Idx);

  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
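// tryFoldZeroHighBits(): an AND that only keeps the low 16 bits is redundant
// when the defining instruction already zeroes the high 16 bits of its
// result, so the AND is erased.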
bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)

  if (ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode())) {

    MI.eraseFromParent();
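// foldInstOperand(): collect every use of a foldable definition into
// FoldList, then apply the folds, re-legalizing rewritten copies and undoing
// speculative commutes that did not lead to a successful fold.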
  bool Changed = false;

  if (OpToFold.isImm()) {

    UsesToProcess.push_back(&Use);
  for (auto U : UsesToProcess) {

  if (CopiesToReplace.empty() && FoldList.empty())

    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.OpToFold);
    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {

      assert(Fold.OpToFold && Fold.OpToFold->isReg());

                        << static_cast<int>(Fold.UseOpNo) << " of "

    } else if (Fold.isCommuted()) {
      TII->commuteInstruction(*Fold.UseMI, false);
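// tryFoldClamp(): a V_MAX-style instruction whose sources are the same value
// and that only carries the clamp bit can be removed by setting clamp on the
// defining instruction instead.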
  unsigned Op = MI.getOpcode();

  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_PK_MAX_F16: {

        Src0->getSubReg() != AMDGPU::NoSubRegister)

    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))

        = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();

        = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)

  if (TII->getClampMask(*Def) != TII->getClampMask(MI))

  MI.eraseFromParent();

  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();
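// Output-modifier folding: isOMod() recognizes multiplies by 0.5, 2.0 or 4.0
// (and adds of a value to itself), which can be folded into the omod field of
// the defining instruction, provided output denormals are disabled and no
// other modifiers are set.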
  case AMDGPU::V_MUL_F64_e64: {

    case 0x3fe0000000000000:

    case 0x4000000000000000:

    case 0x4010000000000000:

  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {

  case AMDGPU::V_MUL_F16_e64: {
    switch (static_cast<uint16_t>(Val)) {

std::pair<const MachineOperand *, int>

  unsigned Op = MI.getOpcode();

  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_e64: {

    if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64) &&
         MFI->getMode().FP64FP16OutputDenormals))

    if (Src0->isImm()) {

    } else if (Src1->isImm()) {

        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||

    return std::make_pair(RegOp, OMod);

  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64: {

    if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64) &&
         MFI->getMode().FP64FP16OutputDenormals))

        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))

  std::tie(RegOp, OMod) = isOMod(MI);

      RegOp->getSubReg() != AMDGPU::NoSubRegister ||

  MI.eraseFromParent();

  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();
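// tryFoldRegSequence(): when a REG_SEQUENCE feeding an AGPR-capable user is
// built entirely from AGPR inputs, rebuild it in an AGPR register class so
// the intervening copies can be removed.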
  auto Reg = MI.getOperand(0).getReg();

  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||

  for (auto &Def : Defs) {
    const auto *Op = Def.first;

    if (TRI->isAGPR(*MRI, Op->getReg()))

    if (Op->getSubReg())

      TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))

                     TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (unsigned I = 0; I < Defs.size(); ++I) {

    Def->setIsKill(false);

    RS.addImm(Defs[I].second);

  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {

    RS->eraseFromParent();

  MI.eraseFromParent();
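// tryFoldLCSSAPhi(): fold an AGPR-to-VGPR copy across a single-input (LCSSA)
// phi so the phi operates on the AGPR directly and the copy can be erased.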
bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) {

      !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut))

  if (!Copy || !Copy->isCopy())

  if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg())

          TII->get(AMDGPU::COPY), PhiOut)

  Copy->eraseFromParent();
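// tryFoldLoad(): on gfx90a, a VGPR load whose only users are copies (or
// reg_sequences) into AGPRs can have its destination switched to an AGPR
// class, removing the copies.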
  if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)

  while (!Users.empty()) {

    if (!I->isCopy() && !I->isRegSequence())

    Register DstReg = I->getOperand(0).getReg();
    if (TRI->isAGPR(*MRI, DstReg))

    MoveRegs.push_back(DstReg);

      Users.push_back(&U);

  if (!TII->isOperandLegal(MI, 0, &Def)) {

  while (!MoveRegs.empty()) {
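// The pass entry point: walk every block, track the last value written to m0
// so redundant m0 writes can be deleted, dispatch each instruction to the
// cndmask / zero-high-bits / reg_sequence / phi / load fold routines, and
// fold the operands of foldable copies into their uses (including the clamp
// and omod folds above).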
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  bool Changed = false;

      Changed |= tryFoldCndMask(MI);

      if (tryFoldZeroHighBits(MI)) {

      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {

      if (MI.isPHI() && tryFoldLCSSAPhi(MI)) {

      if (MI.mayLoad() && tryFoldLoad(MI)) {

      if (!TII->isFoldableCopy(MI)) {

          CurrentKnownM0Val = nullptr;

        Changed |= tryFoldClamp(MI);

        if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
          MI.eraseFromParent();

            nullptr : &NewM0Val;

      if (!FoldingImm && !OpToFold.isReg())

        if (!MI.getOperand(0).getReg().isVirtual())

      Changed |= foldInstOperand(MI, OpToFold);

      auto *InstToErase = &MI;

        auto &SrcOp = InstToErase->getOperand(1);

          InstToErase->eraseFromParent();

          InstToErase = nullptr;

        if (!InstToErase || !TII->isFoldableCopy(*InstToErase))

      if (InstToErase && InstToErase->isRegSequence() &&