#define DEBUG_TYPE "si-fold-operands"
  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false, int ShrinkOp = -1)
      : UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
        Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
  bool needsShrink() const { return ShrinkOpcode != -1; }

  bool getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
                     Register UseReg, uint8_t OpTy) const;

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE, "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

          TRI.getSubRegisterClass(RC, MO.getSubReg()))
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}
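// The MAC/FMAC forms tie src2 to the destination register, which blocks folding an
// immediate or SGPR into that operand. tryAddToFoldList() therefore rewrites such
// instructions to the untied MAD/FMA form above before retrying the fold, and
// switches back (see the setDesc(TII->get(Opc)) / removeOperand calls further down)
// if the fold still does not succeed.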
  if (!OpToFold.isFI())
    return false;

  const unsigned Opc = UseMI.getOpcode();

  return OpNo == VIdx && SIdx == -1;

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}
bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
                                   ST->hasInv2PiInlineImm())) {
      unsigned Opcode = MI->getOpcode();
      int OpNo = MI->getOperandNo(&Old);
        ModIdx = AMDGPU::OpName::src0_modifiers;
        ModIdx = AMDGPU::OpName::src1_modifiers;
        ModIdx = AMDGPU::OpName::src2_modifiers;
      unsigned Val = Mod.getImm();
      switch (TII->get(Opcode).operands()[OpNo].OperandType) {
        if (!isUInt<16>(Fold.ImmToFold)) {
          if (!(Fold.ImmToFold & 0xffff)) {

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    int Op32 = Fold.ShrinkOpcode;
    bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
    Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
    if (HaveNonDbgCarryUse) {

    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

      TII->commuteInstruction(*Inst32, false);

  assert(!Fold.needsShrink() && "not handled");

    if (NewMFMAOpc == -1)
    MI->setDesc(TII->get(NewMFMAOpc));
    MI->untieRegOperand(0);

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
  return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });

  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)

  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n " << *MI);
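// Candidates appended here are only recorded; the actual rewrite happens later in
// foldInstOperand(), which walks FoldList, calls updateOperand() on each candidate,
// and re-commutes the instruction when a commuted fold could not be applied.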
  const unsigned Opc = MI->getOpcode();

  auto tryToFoldAsFMAAKorMK = [&]() {
    if (!OpToFold->isImm())

    const bool TryAK = OpNo == 3;
    const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
    MI->setDesc(TII->get(NewOpc));

    bool FoldAsFMAAKorMK =
        tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
    if (FoldAsFMAAKorMK) {
      MI->untieRegOperand(3);

    MI->setDesc(TII->get(Opc));

  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      MI->setDesc(TII->get(NewOpc));

      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
        MI->untieRegOperand(OpNo);

      MI->removeOperand(MI->getNumExplicitOperands() - 1);
      MI->setDesc(TII->get(Opc));

    if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
      if (tryToFoldAsFMAAKorMK())

    if (OpToFold->isImm()) {
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;

        MI->setDesc(TII->get(ImmOpc));
  unsigned CommuteOpNo = OpNo;

  bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

  if (CommuteIdx0 == OpNo)
    CommuteOpNo = CommuteIdx1;
  else if (CommuteIdx1 == OpNo)
    CommuteOpNo = CommuteIdx0;

  if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                     !MI->getOperand(CommuteIdx1).isReg()))

      !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))

  if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
    if ((Opc == AMDGPU::V_ADD_CO_U32_e64 ||
         Opc == AMDGPU::V_SUB_CO_U32_e64 ||
         Opc == AMDGPU::V_SUBREV_CO_U32_e64) &&

      unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
      if (!OtherOp.isReg() ||

      unsigned MaybeCommutedOpc = MI->getOpcode();

    TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);

  if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
      !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
    unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
    if (!OpImm.isReg() &&
        TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
      return tryToFoldAsFMAAKorMK();
  if (Opc == AMDGPU::S_FMAC_F32 &&
      (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
    if (tryToFoldAsFMAAKorMK())

  if (TII->isSALU(MI->getOpcode())) {
    if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
      for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
        auto &Op = MI->getOperand(i);
        if (OpNo != i && !Op.isReg() &&

  return !TII->isSDWA(MI);
bool SIFoldOperands::getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
    Register UseReg, uint8_t OpTy) const {
  if (!Def || !Def->isRegSequence())

  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
         SubDef = MRI->getVRegDef(Sub->getReg())) {
      if (TII->isInlineConstant(*Op, OpTy))

      if (!Op->isReg() || Op->getReg().isPhysical())

    Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
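// getRegSeqInit() walks the REG_SEQUENCE that defines UseReg and records each input
// as an (operand, sub-register index) pair, looking through foldable copies so that
// inputs that are really inline constants become visible to tryToFoldACImm() below.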
bool SIFoldOperands::tryToFoldACImm(
  if (UseOpIdx >= Desc.getNumOperands())

  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;

  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {

  if (!OpToFold.isReg())

  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
    if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
        TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {

  if (!getRegSeqInit(Defs, UseReg, OpTy))

  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    auto SubImm = Op->getImm();
    if (!TII->isInlineConstant(*Op, OpTy) ||
void SIFoldOperands::foldOperand(
  if (!isUseSafeToFold(*UseMI, UseOp))

      if (RSUse.getSubReg() != RegSeqDstSubReg)

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
    if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
        MFI->getScratchRSrcReg())

        *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);

  bool FoldingImmLike =

    if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
      for (auto &Use : MRI->use_nodbg_operands(DestReg)) {
        if (Use.isImplicit())

                               Use.getParent()->getOperandNo(&Use),

      for (auto &F : CopyUses) {
        foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList,

    if (DestRC == &AMDGPU::AGPR_32RegClass &&

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)

    while (ImpOpI != ImpOpE) {

      for (unsigned I = 0; I < Size / 4; ++I) {
          int64_t Imm = Def->getImm();
          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
                       TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
        } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
          Def->setIsKill(false);
          if (!SeenAGPRs.insert(Src)) {

          Def->setIsKill(false);

          if (TRI->isSGPRReg(*MRI, Src.Reg)) {

          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);

        if (CopyToVGPR.Reg) {
          if (VGPRCopies.count(CopyToVGPR)) {
            Vgpr = VGPRCopies[CopyToVGPR];
            Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
            VGPRCopies[CopyToVGPR] = Vgpr;
          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
                       TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);

        B.addImm(Defs[I].second);

    else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
    else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&

    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&

      if (FoldingImmLike) {

        if (OpToFold.isImm())

      UseDesc.operands()[UseOpIdx].RegClass == -1)

  if (!FoldingImmLike) {
    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
        if (!RC || !TRI->isProperlyAlignedRC(*RC))

    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);

    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}
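// Illustrative sketch (not part of SIFoldOperands.cpp): a standalone check of the
// folding rules above. It assumes the usual hardware semantics that 32-bit shifts
// use only the low five bits of the shift amount and that the "REV" opcodes swap
// the roles of the two sources; the helper names below are made up for the example.
#include <cassert>
#include <cstdint>

static uint32_t foldLshl(uint32_t LHS, uint32_t RHS) { return LHS << (RHS & 31); }
static uint32_t foldLshlRev(uint32_t LHS, uint32_t RHS) { return RHS << (LHS & 31); }
static int32_t foldAshr(uint32_t LHS, uint32_t RHS) {
  return static_cast<int32_t>(LHS) >> (RHS & 31);
}

int main() {
  assert(foldLshl(1, 36) == 16u);           // 36 & 31 == 4, so 1 << 4
  assert(foldLshlRev(36, 1) == 16u);        // REV form: RHS shifted by LHS
  assert(foldAshr(0x80000000u, 31) == -1);  // arithmetic shift keeps the sign bit
  return 0;
}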
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

  MI.setDesc(NewDesc);

  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);

  if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
      !Op.getReg().isVirtual())

  if (Def && Def->isMoveImmediate()) {
  if (!MI->allImplicitDefsAreDead())

  unsigned Opc = MI->getOpcode();

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
    MI->getOperand(1).ChangeToImmediate(~Src0->getImm());

    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);

  if (!MI->isCommutable())

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
      MI->removeOperand(Src1Idx);
    } else if (Src1Val == -1) {
      MI->removeOperand(Src1Idx);

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
      MI->removeOperand(Src0Idx);
    } else if (Src1Val == -1) {
      MI->removeOperand(Src1Idx);

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
      MI->removeOperand(Src1Idx);
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)

  auto *Src0Imm = getImmOrMaterializedImm(*Src0);
  auto *Src1Imm = getImmOrMaterializedImm(*Src1);
  if (!Src1Imm->isIdenticalTo(*Src0Imm))

  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))

  MI.removeOperand(Src2Idx);
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)

  if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))

  MI.eraseFromParent();
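// tryFoldZeroHighBits() looks for a V_AND_B32 with a low-16-bit (0xffff) mask whose
// other source is produced by an instruction that already writes zeros to the upper
// 16 bits; the AND is then redundant, its uses are rewritten to the unmasked source,
// and the AND is erased.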
  bool Changed = false;

  if (OpToFold.isImm()) {

      if (tryConstantFoldOp(&UseMI)) {

  for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))

  for (auto *U : UsesToProcess) {

  if (CopiesToReplace.empty() && FoldList.empty())

    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.OpToFold);
    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {

      assert(Fold.OpToFold && Fold.OpToFold->isReg());

      MRI->clearKillFlags(Fold.OpToFold->getReg());

                        << static_cast<int>(Fold.UseOpNo) << " of "
    } else if (Fold.Commuted) {
      TII->commuteInstruction(*Fold.UseMI, false);
bool SIFoldOperands::tryFoldFoldableCopy(
  if (MI.getOperand(0).getReg() == AMDGPU::M0) {
    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
      MI.eraseFromParent();

  if (!FoldingImm && !OpToFold.isReg())

  if (!MI.getOperand(0).getReg().isVirtual())

  bool Changed = foldInstOperand(MI, OpToFold);

  auto *InstToErase = &MI;
  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    auto &SrcOp = InstToErase->getOperand(1);
    InstToErase->eraseFromParent();
      InstToErase = nullptr;
    InstToErase = MRI->getVRegDef(SrcReg);
    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))

  if (InstToErase && InstToErase->isRegSequence() &&
      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    InstToErase->eraseFromParent();
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_t16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_PK_MAX_F16: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())

        Src0->getSubReg() != AMDGPU::NoSubRegister)

    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))

        = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
        = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)

  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))

  if (TII->getClampMask(*Def) != TII->getClampMask(MI))

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);

  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();
static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F64_e64: {
    switch (Val) {
    case 0x3fe0000000000000: // 0.5
    case 0x4000000000000000: // 2.0
    case 0x4010000000000000: // 4.0

  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {

  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_t16_e64: {
    switch (static_cast<uint16_t>(Val)) {
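// Illustrative sketch (not part of SIFoldOperands.cpp): the 64-bit case labels above
// are simply the IEEE-754 bit patterns of 0.5, 2.0 and 4.0, i.e. the multipliers the
// output modifier (omod: div:2 / mul:2 / mul:4) can apply for free; the F32/F16
// variants compare against the corresponding narrower bit patterns. A standalone
// check (assumes a C++20 compiler for std::bit_cast):
#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  assert(std::bit_cast<double>(UINT64_C(0x3fe0000000000000)) == 0.5);
  assert(std::bit_cast<double>(UINT64_C(0x4000000000000000)) == 2.0);
  assert(std::bit_cast<double>(UINT64_C(0x4010000000000000)) == 4.0);
  assert(std::bit_cast<float>(UINT32_C(0x3f000000)) == 0.5f);
  assert(std::bit_cast<float>(UINT32_C(0x40000000)) == 2.0f);
  assert(std::bit_cast<float>(UINT32_C(0x40800000)) == 4.0f);
  return 0;
}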
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_e64: {
    if ((Op == AMDGPU::V_MUL_F32_e64 &&
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
          Op == AMDGPU::V_MUL_F16_t16_e64) &&

    if (Src0->isImm()) {
    } else if (Src1->isImm()) {

        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))

    return std::pair(RegOp, OMod);

  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_t16_e64: {
    if ((Op == AMDGPU::V_ADD_F32_e64 &&
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
          Op == AMDGPU::V_ADD_F16_t16_e64) &&

        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))

  std::tie(RegOp, OMod) = isOMod(MI);
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !MRI->hasOneNonDBGUser(RegOp->getReg()))

  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))

  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();
  auto Reg = MI.getOperand(0).getReg();

  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg))

  for (auto &Def : Defs) {
    const auto *Op = Def.first;

    if (TRI->isAGPR(*MRI, Op->getReg()))

  if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))

  Op = &*MRI->use_nodbg_begin(Reg);

  if (Op->getSubReg())

      TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))

  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
                    TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (unsigned I = 0; I < Defs.size(); ++I) {
      Def->setIsKill(false);

    RS.addImm(Defs[I].second);

  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
    RS->eraseFromParent();

  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParent();
static bool isAGPRCopy(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI,
                       const MachineInstr &Copy, Register &OutReg,
                       unsigned &OutSubReg) {
  if (TRI.isAGPR(MRI, CopySrcReg)) {
    OutReg = CopySrcReg;

  if (!CopySrcDef || !CopySrcDef->isCopy())

      OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
      !TRI.isAGPR(MRI, OtherCopySrcReg))

  OutReg = OtherCopySrcReg;
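// isAGPRCopy() checks whether Copy is an AGPR -> VGPR copy, either directly or
// through one intermediate copy, and on success reports the AGPR source in
// OutReg/OutSubReg so that tryFoldPhiAGPR() below can rewrite the PHI to operate
// on AGPRs.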
  if (!TRI->isVGPR(*MRI, PhiOut))

  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    if (!Copy || !Copy->isCopy())

    unsigned AGPRRegMask = AMDGPU::NoSubRegister;

    if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))

  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);

  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {

    unsigned CopyOpc = AMDGPU::COPY;

    if (Def->isCopy()) {
      unsigned AGPRSubReg = AMDGPU::NoSubRegister;

    if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
      CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;

      InsertMBB = Def->getParent();

    Register NewReg = MRI->createVirtualRegister(ARC);
                 TII->get(CopyOpc), NewReg)

  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(0).setReg(NewReg);

               TII->get(AMDGPU::COPY), PhiOut)
  if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)

  while (!Users.empty()) {
    if (!I->isCopy() && !I->isRegSequence())
    Register DstReg = I->getOperand(0).getReg();

    if (TRI->isAGPR(*MRI, DstReg))

      Users.push_back(&U);

  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    MRI->setRegClass(DefReg, RC);

  while (!MoveRegs.empty()) {
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
  if (ST->hasGFX90AInsts())

  for (auto &MI : MBB) {
    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))

    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {

  bool Changed = false;
  for (const auto &[Entry, MOs] : RegToMO) {
    if (MOs.size() == 1)

        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)

    Register TempAGPR = MRI->createVirtualRegister(ARC);
                TII->get(AMDGPU::COPY), TempAGPR)
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  bool Changed = false;

      Changed |= tryFoldCndMask(MI);

      if (tryFoldZeroHighBits(MI)) {

      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {

      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {

      if (MI.mayLoad() && tryFoldLoad(MI)) {

      if (TII->isFoldableCopy(MI)) {
        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);

      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
        CurrentKnownM0Val = nullptr;

      Changed |= tryFoldClamp(MI);

    Changed |= tryOptimizeAGPRPhis(*MBB);