19#define DEBUG_TYPE "si-fold-operands"
37 bool Commuted_ =
false,
39 UseMI(
MI), OpToFold(
nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
42 if (FoldOp->
isImm()) {
43 ImmToFold = FoldOp->
getImm();
44 }
else if (FoldOp->
isFI()) {
45 FrameIndexToFold = FoldOp->
getIndex();
66 bool needsShrink()
const {
return ShrinkOpcode != -1; }
83 bool canUseImmWithOpSel(FoldCandidate &Fold)
const;
85 bool tryFoldImmWithOpSel(FoldCandidate &Fold)
const;
93 getRegSeqInit(
SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
115 std::pair<const MachineOperand *, int> isOMod(
const MachineInstr &
MI)
const;
141 "SI Fold Operands",
false,
false)
143char SIFoldOperands::
ID = 0;
152 TRI.getSubRegisterClass(RC, MO.getSubReg()))
160 case AMDGPU::V_MAC_F32_e64:
161 return AMDGPU::V_MAD_F32_e64;
162 case AMDGPU::V_MAC_F16_e64:
163 return AMDGPU::V_MAD_F16_e64;
164 case AMDGPU::V_FMAC_F32_e64:
165 return AMDGPU::V_FMA_F32_e64;
166 case AMDGPU::V_FMAC_F16_e64:
167 return AMDGPU::V_FMA_F16_gfx9_e64;
168 case AMDGPU::V_FMAC_F16_t16_e64:
169 return AMDGPU::V_FMA_F16_gfx9_e64;
170 case AMDGPU::V_FMAC_LEGACY_F32_e64:
171 return AMDGPU::V_FMA_LEGACY_F32_e64;
172 case AMDGPU::V_FMAC_F64_e64:
173 return AMDGPU::V_FMA_F64_e64;
175 return AMDGPU::INSTRUCTION_LIST_END;
182 if (!OpToFold.
isFI())
185 const unsigned Opc =
UseMI.getOpcode();
196 return OpNo == VIdx && SIdx == -1;
200 return new SIFoldOperands();
203bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold)
const {
206 const uint64_t TSFlags =
MI->getDesc().TSFlags;
215 unsigned Opcode =
MI->getOpcode();
216 int OpNo =
MI->getOperandNo(&Old);
217 uint8_t OpType =
TII->get(Opcode).operands()[OpNo].OperandType;
233bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold)
const {
236 unsigned Opcode =
MI->getOpcode();
237 int OpNo =
MI->getOperandNo(&Old);
238 uint8_t OpType =
TII->get(Opcode).operands()[OpNo].OperandType;
251 unsigned SrcIdx = ~0;
253 ModIdx = AMDGPU::OpName::src0_modifiers;
256 ModIdx = AMDGPU::OpName::src1_modifiers;
259 ModIdx = AMDGPU::OpName::src2_modifiers;
265 unsigned ModVal =
Mod.getImm();
289 Mod.setImm(NewModVal);
294 if (
static_cast<int16_t
>(
Lo) < 0) {
295 int32_t SExt =
static_cast<int16_t
>(
Lo);
297 Mod.setImm(NewModVal);
324 if (tryFoldToInline(Imm))
333 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
334 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
335 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
338 bool Clamp =
MI->getOperand(ClampIdx).getImm() != 0;
345 if (tryFoldToInline(NegImm)) {
347 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
348 MI->setDesc(
TII->get(NegOpcode));
357bool SIFoldOperands::updateOperand(FoldCandidate &Fold)
const {
362 if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
363 if (tryFoldImmWithOpSel(Fold))
369 int OpNo =
MI->getOperandNo(&Old);
370 if (!
TII->isOperandLegal(*
MI, OpNo, &New))
376 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
384 int Op32 = Fold.ShrinkOpcode;
389 bool HaveNonDbgCarryUse = !
MRI->use_nodbg_empty(Dst1.
getReg());
392 Register NewReg0 =
MRI->createVirtualRegister(Dst0RC);
396 if (HaveNonDbgCarryUse) {
409 for (
unsigned I =
MI->getNumOperands() - 1;
I > 0; --
I)
410 MI->removeOperand(
I);
411 MI->setDesc(
TII->get(AMDGPU::IMPLICIT_DEF));
414 TII->commuteInstruction(*Inst32,
false);
418 assert(!Fold.needsShrink() &&
"not handled");
423 if (NewMFMAOpc == -1)
425 MI->setDesc(
TII->get(NewMFMAOpc));
426 MI->untieRegOperand(0);
432 if (Fold.isGlobal()) {
433 Old.
ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
434 Fold.OpToFold->getTargetFlags());
451 return any_of(FoldList, [&](
const auto &
C) {
return C.UseMI ==
MI; });
459 for (FoldCandidate &Fold : FoldList)
460 if (Fold.UseMI ==
MI && Fold.UseOpNo == OpNo)
462 LLVM_DEBUG(
dbgs() <<
"Append " << (Commuted ?
"commuted" :
"normal")
463 <<
" operand " << OpNo <<
"\n " << *
MI);
470 const unsigned Opc =
MI->getOpcode();
472 auto tryToFoldAsFMAAKorMK = [&]() {
473 if (!OpToFold->
isImm())
476 const bool TryAK = OpNo == 3;
477 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
478 MI->setDesc(
TII->get(NewOpc));
481 bool FoldAsFMAAKorMK =
482 tryAddToFoldList(FoldList,
MI, TryAK ? 3 : 2, OpToFold);
483 if (FoldAsFMAAKorMK) {
485 MI->untieRegOperand(3);
502 MI->setDesc(
TII->get(Opc));
506 bool IsLegal =
TII->isOperandLegal(*
MI, OpNo, OpToFold);
507 if (!IsLegal && OpToFold->
isImm()) {
508 FoldCandidate Fold(
MI, OpNo, OpToFold);
509 IsLegal = canUseImmWithOpSel(Fold);
515 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
518 MI->setDesc(
TII->get(NewOpc));
523 bool FoldAsMAD = tryAddToFoldList(FoldList,
MI, OpNo, OpToFold);
525 MI->untieRegOperand(OpNo);
529 MI->removeOperand(
MI->getNumExplicitOperands() - 1);
530 MI->setDesc(
TII->get(Opc));
535 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
536 if (tryToFoldAsFMAAKorMK())
541 if (OpToFold->
isImm()) {
543 if (Opc == AMDGPU::S_SETREG_B32)
544 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
545 else if (Opc == AMDGPU::S_SETREG_B32_mode)
546 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
548 MI->setDesc(
TII->get(ImmOpc));
563 bool CanCommute =
TII->findCommutedOpIndices(*
MI, OpNo, CommuteOpNo);
571 if (!
MI->getOperand(OpNo).isReg() || !
MI->getOperand(CommuteOpNo).isReg())
574 if (!
TII->commuteInstruction(*
MI,
false, OpNo, CommuteOpNo))
578 if (!
TII->isOperandLegal(*
MI, CommuteOpNo, OpToFold)) {
579 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
580 Opc != AMDGPU::V_SUBREV_CO_U32_e64) ||
582 TII->commuteInstruction(*
MI,
false, OpNo, CommuteOpNo);
589 if (!OtherOp.
isReg() ||
596 unsigned MaybeCommutedOpc =
MI->getOpcode();
606 if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
607 !OpToFold->
isReg() && !
TII->isInlineConstant(*OpToFold)) {
608 unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
610 if (!OpImm.
isReg() &&
611 TII->isInlineConstant(*
MI,
MI->getOperand(OpNo), OpImm))
612 return tryToFoldAsFMAAKorMK();
620 if (Opc == AMDGPU::S_FMAC_F32 &&
621 (OpNo != 1 || !
MI->getOperand(1).isIdenticalTo(
MI->getOperand(2)))) {
622 if (tryToFoldAsFMAAKorMK())
628 if (
TII->isSALU(
MI->getOpcode())) {
633 if (!OpToFold->
isReg() && !
TII->isInlineConstant(*OpToFold, OpInfo)) {
635 for (
unsigned i = 0, e = InstDesc.
getNumOperands(); i != e; ++i) {
636 auto &
Op =
MI->getOperand(i);
637 if (OpNo != i && !
Op.isReg() &&
651 return !
TII->isSDWA(
MI);
657bool SIFoldOperands::getRegSeqInit(
661 if (!Def || !
Def->isRegSequence())
664 for (
unsigned I = 1, E =
Def->getNumExplicitOperands();
I < E;
I += 2) {
671 SubDef =
MRI->getVRegDef(Sub->
getReg())) {
674 if (
TII->isInlineConstant(*
Op, OpTy))
678 if (!
Op->isReg() ||
Op->getReg().isPhysical())
683 Defs.emplace_back(Sub,
Def->getOperand(
I + 1).getImm());
689bool SIFoldOperands::tryToFoldACImm(
693 if (UseOpIdx >=
Desc.getNumOperands())
699 uint8_t OpTy =
Desc.operands()[UseOpIdx].OperandType;
700 if (OpToFold.
isImm() &&
TII->isInlineConstant(OpToFold, OpTy) &&
701 TII->isOperandLegal(*
UseMI, UseOpIdx, &OpToFold)) {
706 if (!OpToFold.
isReg())
719 if (!UseOp.
getSubReg() && Def &&
TII->isFoldableCopy(*Def)) {
721 if (DefOp.
isImm() &&
TII->isInlineConstant(DefOp, OpTy) &&
722 TII->isOperandLegal(*
UseMI, UseOpIdx, &DefOp)) {
729 if (!getRegSeqInit(Defs,
UseReg, OpTy))
733 for (
unsigned I = 0, E = Defs.
size();
I != E; ++
I) {
738 auto SubImm =
Op->getImm();
741 if (!
TII->isInlineConstant(*
Op, OpTy) ||
755void SIFoldOperands::foldOperand(
763 if (!isUseSafeToFold(*
UseMI, *UseOp))
780 for (
auto &
Use :
MRI->use_nodbg_operands(RegSeqDstReg))
782 for (
auto *RSUse : UsesToProcess) {
789 if (RSUse->getSubReg() != RegSeqDstSubReg)
792 foldOperand(OpToFold, RSUseMI, RSUseMI->
getOperandNo(RSUse), FoldList,
798 if (tryToFoldACImm(OpToFold,
UseMI, UseOpIdx, FoldList))
801 if (frameIndexMayFold(*
UseMI, UseOpIdx, OpToFold)) {
806 if (
TII->getNamedOperand(*
UseMI, AMDGPU::OpName::srsrc)->getReg() !=
807 MFI->getScratchRSrcReg())
813 *
TII->getNamedOperand(*
UseMI, AMDGPU::OpName::soffset);
833 bool FoldingImmLike =
851 if (DestRC == &AMDGPU::AGPR_32RegClass &&
863 unsigned MovOp =
TII->getMovOpcode(DestRC);
864 if (MovOp == AMDGPU::COPY)
869 while (ImpOpI != ImpOpE) {
876 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
921 for (
unsigned I = 0;
I <
Size / 4; ++
I) {
926 int64_t
Imm =
Def->getImm();
928 auto Tmp =
MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
930 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).
addImm(Imm);
932 }
else if (
Def->isReg() &&
TRI->isAGPR(*
MRI,
Def->getReg())) {
934 Def->setIsKill(
false);
935 if (!SeenAGPRs.
insert(Src)) {
946 Def->setIsKill(
false);
952 if (
TRI->isSGPRReg(*
MRI, Src.Reg)) {
955 auto Tmp =
MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
961 if (CopyToVGPR.
Reg) {
963 if (VGPRCopies.
count(CopyToVGPR)) {
964 Vgpr = VGPRCopies[CopyToVGPR];
966 Vgpr =
MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
968 VGPRCopies[CopyToVGPR] = Vgpr;
970 auto Tmp =
MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
972 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).
addReg(Vgpr);
976 B.addImm(Defs[
I].second);
989 else if (
TRI->isVGPR(*
MRI, Reg0) &&
TRI->isAGPR(*
MRI, Reg1))
991 else if (
ST->hasGFX90AInsts() &&
TRI->isAGPR(*
MRI, Reg0) &&
998 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
999 (UseOpc == AMDGPU::V_READLANE_B32 &&
1006 if (FoldingImmLike) {
1015 if (OpToFold.
isImm())
1048 UseDesc.
operands()[UseOpIdx].RegClass == -1)
1052 if (!FoldingImmLike) {
1053 if (OpToFold.
isReg() &&
ST->needsAlignedVGPRs()) {
1058 if (
TRI->hasVectorRegisters(RC) && OpToFold.
getSubReg()) {
1065 if (!RC || !
TRI->isProperlyAlignedRC(*RC))
1069 tryAddToFoldList(FoldList,
UseMI, UseOpIdx, &OpToFold);
1080 TRI->getRegClass(FoldDesc.
operands()[0].RegClass);
1090 if (UseOp->
getSubReg() == AMDGPU::sub0) {
1098 tryAddToFoldList(FoldList,
UseMI, UseOpIdx, &ImmOp);
1102 tryAddToFoldList(FoldList,
UseMI, UseOpIdx, &OpToFold);
1108 case AMDGPU::V_AND_B32_e64:
1109 case AMDGPU::V_AND_B32_e32:
1110 case AMDGPU::S_AND_B32:
1113 case AMDGPU::V_OR_B32_e64:
1114 case AMDGPU::V_OR_B32_e32:
1115 case AMDGPU::S_OR_B32:
1118 case AMDGPU::V_XOR_B32_e64:
1119 case AMDGPU::V_XOR_B32_e32:
1120 case AMDGPU::S_XOR_B32:
1123 case AMDGPU::S_XNOR_B32:
1126 case AMDGPU::S_NAND_B32:
1129 case AMDGPU::S_NOR_B32:
1132 case AMDGPU::S_ANDN2_B32:
1133 Result =
LHS & ~RHS;
1135 case AMDGPU::S_ORN2_B32:
1136 Result =
LHS | ~RHS;
1138 case AMDGPU::V_LSHL_B32_e64:
1139 case AMDGPU::V_LSHL_B32_e32:
1140 case AMDGPU::S_LSHL_B32:
1142 Result =
LHS << (
RHS & 31);
1144 case AMDGPU::V_LSHLREV_B32_e64:
1145 case AMDGPU::V_LSHLREV_B32_e32:
1146 Result =
RHS << (
LHS & 31);
1148 case AMDGPU::V_LSHR_B32_e64:
1149 case AMDGPU::V_LSHR_B32_e32:
1150 case AMDGPU::S_LSHR_B32:
1151 Result =
LHS >> (
RHS & 31);
1153 case AMDGPU::V_LSHRREV_B32_e64:
1154 case AMDGPU::V_LSHRREV_B32_e32:
1155 Result =
RHS >> (
LHS & 31);
1157 case AMDGPU::V_ASHR_I32_e64:
1158 case AMDGPU::V_ASHR_I32_e32:
1159 case AMDGPU::S_ASHR_I32:
1160 Result =
static_cast<int32_t
>(
LHS) >> (
RHS & 31);
1162 case AMDGPU::V_ASHRREV_I32_e64:
1163 case AMDGPU::V_ASHRREV_I32_e32:
1164 Result =
static_cast<int32_t
>(
RHS) >> (
LHS & 31);
1172 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1176 MI.setDesc(NewDesc);
1182 unsigned NumOps =
Desc.getNumOperands() +
Desc.implicit_uses().size() +
1183 Desc.implicit_defs().size();
1185 for (
unsigned I =
MI.getNumOperands() - 1;
I >= NumOps; --
I)
1186 MI.removeOperand(
I);
1192 if (!
Op.isReg() ||
Op.getSubReg() != AMDGPU::NoSubRegister ||
1193 !
Op.getReg().isVirtual())
1197 if (Def &&
Def->isMoveImmediate()) {
1210 if (!
MI->allImplicitDefsAreDead())
1213 unsigned Opc =
MI->getOpcode();
1220 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1221 Opc == AMDGPU::S_NOT_B32) &&
1223 MI->getOperand(1).ChangeToImmediate(~Src0->
getImm());
1244 bool IsSGPR =
TRI->isSGPRReg(*
MRI,
MI->getOperand(0).getReg());
1248 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1249 MI->removeOperand(Src1Idx);
1254 if (!
MI->isCommutable())
1262 int32_t Src1Val =
static_cast<int32_t
>(Src1->
getImm());
1263 if (Opc == AMDGPU::V_OR_B32_e64 ||
1264 Opc == AMDGPU::V_OR_B32_e32 ||
1265 Opc == AMDGPU::S_OR_B32) {
1268 MI->removeOperand(Src1Idx);
1270 }
else if (Src1Val == -1) {
1272 MI->removeOperand(Src1Idx);
1280 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1281 Opc == AMDGPU::S_AND_B32) {
1284 MI->removeOperand(Src0Idx);
1286 }
else if (Src1Val == -1) {
1288 MI->removeOperand(Src1Idx);
1296 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1297 Opc == AMDGPU::S_XOR_B32) {
1300 MI->removeOperand(Src1Idx);
1311 unsigned Opc =
MI.getOpcode();
1312 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1313 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1319 auto *Src0Imm = getImmOrMaterializedImm(*Src0);
1320 auto *Src1Imm = getImmOrMaterializedImm(*Src1);
1321 if (!Src1Imm->isIdenticalTo(*Src0Imm))
1329 if ((Src1ModIdx != -1 &&
MI.getOperand(Src1ModIdx).getImm() != 0) ||
1330 (Src0ModIdx != -1 &&
MI.getOperand(Src0ModIdx).getImm() != 0))
1338 MI.removeOperand(Src2Idx);
1340 if (Src1ModIdx != -1)
1341 MI.removeOperand(Src1ModIdx);
1342 if (Src0ModIdx != -1)
1343 MI.removeOperand(Src0ModIdx);
1349bool SIFoldOperands::tryFoldZeroHighBits(
MachineInstr &
MI)
const {
1350 if (
MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1351 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1360 if (!
ST->zeroesHigh16BitsOfDest(SrcDef->
getOpcode()))
1364 MRI->replaceRegWith(Dst, Src1);
1365 if (!
MI.getOperand(2).isKill())
1366 MRI->clearKillFlags(Src1);
1367 MI.eraseFromParent();
1379 bool Changed =
false;
1381 if (OpToFold.
isImm()) {
1392 if (tryConstantFoldOp(&
UseMI)) {
1400 for (
auto &
Use :
MRI->use_nodbg_operands(Dst.getReg()))
1402 for (
auto *U : UsesToProcess) {
1408 if (CopiesToReplace.
empty() && FoldList.
empty())
1414 Copy->addImplicitDefUseOperands(*MF);
1416 for (FoldCandidate &Fold : FoldList) {
1417 assert(!Fold.isReg() || Fold.OpToFold);
1418 if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1428 assert(Fold.OpToFold && Fold.OpToFold->isReg());
1432 MRI->clearKillFlags(Fold.OpToFold->getReg());
1435 <<
static_cast<int>(Fold.UseOpNo) <<
" of "
1437 }
else if (Fold.Commuted) {
1439 TII->commuteInstruction(*Fold.UseMI,
false);
1445bool SIFoldOperands::tryFoldFoldableCopy(
1449 if (
MI.getOperand(0).getReg() == AMDGPU::M0) {
1451 if (CurrentKnownM0Val && CurrentKnownM0Val->
isIdenticalTo(NewM0Val)) {
1452 MI.eraseFromParent();
1464 if (
MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
1466 if (
TII->hasAnyModifiersSet(
MI))
1468 OpToFoldPtr = &
MI.getOperand(2);
1470 OpToFoldPtr = &
MI.getOperand(1);
1475 if (!FoldingImm && !OpToFold.
isReg())
1487 if (!
MI.getOperand(0).getReg().isVirtual())
1490 bool Changed = foldInstOperand(
MI, OpToFold);
1497 auto *InstToErase = &
MI;
1498 while (
MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1499 auto &
SrcOp = InstToErase->getOperand(1);
1501 InstToErase->eraseFromParent();
1503 InstToErase =
nullptr;
1506 InstToErase =
MRI->getVRegDef(SrcReg);
1507 if (!InstToErase || !
TII->isFoldableCopy(*InstToErase))
1511 if (InstToErase && InstToErase->isRegSequence() &&
1512 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1513 InstToErase->eraseFromParent();
1523 unsigned Op =
MI.getOpcode();
1525 case AMDGPU::V_MAX_F32_e64:
1526 case AMDGPU::V_MAX_F16_e64:
1527 case AMDGPU::V_MAX_F16_t16_e64:
1528 case AMDGPU::V_MAX_F16_fake16_e64:
1529 case AMDGPU::V_MAX_F64_e64:
1530 case AMDGPU::V_MAX_NUM_F64_e64:
1531 case AMDGPU::V_PK_MAX_F16: {
1532 if (
MI.mayRaiseFPException())
1535 if (!
TII->getNamedOperand(
MI, AMDGPU::OpName::clamp)->getImm())
1544 Src0->
getSubReg() != AMDGPU::NoSubRegister)
1548 if (
TII->hasModifiersSet(
MI, AMDGPU::OpName::omod))
1552 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0_modifiers)->getImm();
1554 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1_modifiers)->getImm();
1560 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1572 if (!ClampSrc || !
MRI->hasOneNonDBGUser(ClampSrc->
getReg()))
1578 if (
TII->getClampMask(*Def) !=
TII->getClampMask(
MI))
1581 if (
Def->mayRaiseFPException())
1588 LLVM_DEBUG(
dbgs() <<
"Folding clamp " << *DefClamp <<
" into " << *Def);
1594 Register MIDstReg =
MI.getOperand(0).getReg();
1595 if (
TRI->isSGPRReg(*
MRI, DefReg)) {
1602 MRI->replaceRegWith(MIDstReg, DefReg);
1604 MI.eraseFromParent();
1609 if (
TII->convertToThreeAddress(*Def,
nullptr,
nullptr))
1610 Def->eraseFromParent();
1617 case AMDGPU::V_MUL_F64_e64:
1618 case AMDGPU::V_MUL_F64_pseudo_e64: {
1620 case 0x3fe0000000000000:
1622 case 0x4000000000000000:
1624 case 0x4010000000000000:
1630 case AMDGPU::V_MUL_F32_e64: {
1631 switch (
static_cast<uint32_t>(Val)) {
1642 case AMDGPU::V_MUL_F16_e64:
1643 case AMDGPU::V_MUL_F16_t16_e64:
1644 case AMDGPU::V_MUL_F16_fake16_e64: {
1645 switch (
static_cast<uint16_t>(Val)) {
1664std::pair<const MachineOperand *, int>
1666 unsigned Op =
MI.getOpcode();
1668 case AMDGPU::V_MUL_F64_e64:
1669 case AMDGPU::V_MUL_F64_pseudo_e64:
1670 case AMDGPU::V_MUL_F32_e64:
1671 case AMDGPU::V_MUL_F16_t16_e64:
1672 case AMDGPU::V_MUL_F16_fake16_e64:
1673 case AMDGPU::V_MUL_F16_e64: {
1675 if ((
Op == AMDGPU::V_MUL_F32_e64 &&
1677 ((
Op == AMDGPU::V_MUL_F64_e64 ||
Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
1678 Op == AMDGPU::V_MUL_F16_e64 ||
Op == AMDGPU::V_MUL_F16_t16_e64 ||
1679 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
1680 MFI->getMode().FP64FP16Denormals.Output !=
1682 MI.mayRaiseFPException())
1689 if (Src0->
isImm()) {
1692 }
else if (Src1->
isImm()) {
1700 TII->hasModifiersSet(
MI, AMDGPU::OpName::src0_modifiers) ||
1701 TII->hasModifiersSet(
MI, AMDGPU::OpName::src1_modifiers) ||
1702 TII->hasModifiersSet(
MI, AMDGPU::OpName::omod) ||
1703 TII->hasModifiersSet(
MI, AMDGPU::OpName::clamp))
1706 return std::pair(RegOp, OMod);
1708 case AMDGPU::V_ADD_F64_e64:
1709 case AMDGPU::V_ADD_F64_pseudo_e64:
1710 case AMDGPU::V_ADD_F32_e64:
1711 case AMDGPU::V_ADD_F16_e64:
1712 case AMDGPU::V_ADD_F16_t16_e64:
1713 case AMDGPU::V_ADD_F16_fake16_e64: {
1715 if ((
Op == AMDGPU::V_ADD_F32_e64 &&
1717 ((
Op == AMDGPU::V_ADD_F64_e64 ||
Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
1718 Op == AMDGPU::V_ADD_F16_e64 ||
Op == AMDGPU::V_ADD_F16_t16_e64 ||
1719 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
1729 !
TII->hasModifiersSet(
MI, AMDGPU::OpName::src0_modifiers) &&
1730 !
TII->hasModifiersSet(
MI, AMDGPU::OpName::src1_modifiers) &&
1731 !
TII->hasModifiersSet(
MI, AMDGPU::OpName::clamp) &&
1732 !
TII->hasModifiersSet(
MI, AMDGPU::OpName::omod))
1746 std::tie(RegOp, OMod) = isOMod(
MI);
1748 RegOp->
getSubReg() != AMDGPU::NoSubRegister ||
1749 !
MRI->hasOneNonDBGUser(RegOp->
getReg()))
1757 if (
Def->mayRaiseFPException())
1762 if (
TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1768 MRI->replaceRegWith(
MI.getOperand(0).getReg(),
Def->getOperand(0).getReg());
1769 MI.eraseFromParent();
1774 if (
TII->convertToThreeAddress(*Def,
nullptr,
nullptr))
1775 Def->eraseFromParent();
1784 auto Reg =
MI.getOperand(0).getReg();
1786 if (!
ST->hasGFX90AInsts() || !
TRI->isVGPR(*
MRI, Reg) ||
1787 !
MRI->hasOneNonDBGUse(Reg))
1794 for (
auto &[
Op, SubIdx] : Defs) {
1797 if (
TRI->isAGPR(*
MRI,
Op->getReg()))
1811 if (!
TRI->isVGPR(*
MRI, Reg) || !
MRI->hasOneNonDBGUse(Reg))
1813 Op = &*
MRI->use_nodbg_begin(Reg);
1817 if (
Op->getSubReg())
1823 TII->getRegClass(InstDesc, OpIdx,
TRI, *
MI.getMF());
1824 if (!OpRC || !
TRI->isVectorSuperClass(OpRC))
1827 const auto *NewDstRC =
TRI->getEquivalentAGPRClass(
MRI->getRegClass(Reg));
1828 auto Dst =
MRI->createVirtualRegister(NewDstRC);
1830 TII->get(AMDGPU::REG_SEQUENCE), Dst);
1832 for (
auto &[Def, SubIdx] : Defs) {
1833 Def->setIsKill(
false);
1845 if (!
TII->isOperandLegal(*
UseMI, OpIdx,
Op)) {
1847 RS->eraseFromParent();
1855 if (
MRI->use_nodbg_empty(
MI.getOperand(0).getReg()))
1856 MI.eraseFromParent();
1864 Register &OutReg,
unsigned &OutSubReg) {
1874 if (
TRI.isAGPR(
MRI, CopySrcReg)) {
1875 OutReg = CopySrcReg;
1884 if (!CopySrcDef || !CopySrcDef->
isCopy())
1891 OtherCopySrc.
getSubReg() != AMDGPU::NoSubRegister ||
1892 !
TRI.isAGPR(
MRI, OtherCopySrcReg))
1895 OutReg = OtherCopySrcReg;
1933 if (!
TRI->isVGPR(*
MRI, PhiOut))
1939 for (
unsigned K = 1;
K <
PHI.getNumExplicitOperands();
K += 2) {
1942 if (!Copy || !
Copy->isCopy())
1946 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
1951 if (
const auto *SubRC =
TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
1962 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
1966 for (
unsigned K = 1;
K <
PHI.getNumExplicitOperands();
K += 2) {
1974 unsigned CopyOpc = AMDGPU::COPY;
1979 if (
Def->isCopy()) {
1981 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
1995 if (IsAGPR32 && !
ST->hasGFX90AInsts() && !
MRI->hasOneNonDBGUse(Reg) &&
1997 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2000 InsertMBB =
Def->getParent();
2007 Register NewReg =
MRI->createVirtualRegister(ARC);
2009 TII->get(CopyOpc), NewReg)
2018 Register NewReg =
MRI->createVirtualRegister(ARC);
2019 PHI.getOperand(0).setReg(NewReg);
2025 TII->get(AMDGPU::COPY), PhiOut)
2035 if (!
ST->hasGFX90AInsts() ||
MI.getNumExplicitDefs() != 1)
2056 while (!
Users.empty()) {
2058 if (!
I->isCopy() && !
I->isRegSequence())
2060 Register DstReg =
I->getOperand(0).getReg();
2064 if (
TRI->isAGPR(*
MRI, DstReg))
2068 Users.push_back(&U);
2072 MRI->setRegClass(DefReg,
TRI->getEquivalentAGPRClass(RC));
2073 if (!
TII->isOperandLegal(
MI, 0, &Def)) {
2074 MRI->setRegClass(DefReg, RC);
2078 while (!MoveRegs.
empty()) {
2080 MRI->setRegClass(Reg,
TRI->getEquivalentAGPRClass(
MRI->getRegClass(Reg)));
2123 if (
ST->hasGFX90AInsts())
2130 for (
auto &
MI :
MBB) {
2134 if (!
TRI->isAGPR(*
MRI,
MI.getOperand(0).getReg()))
2137 for (
unsigned K = 1;
K <
MI.getNumOperands();
K += 2) {
2147 bool Changed =
false;
2148 for (
const auto &[Entry, MOs] : RegToMO) {
2149 if (MOs.size() == 1)
2160 MRI->createVirtualRegister(
TRI->getEquivalentVGPRClass(ARC));
2163 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2167 Register TempAGPR =
MRI->createVirtualRegister(ARC);
2169 TII->get(AMDGPU::COPY), TempAGPR)
2191 TII =
ST->getInstrInfo();
2192 TRI = &
TII->getRegisterInfo();
2200 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2202 bool Changed =
false;
2206 Changed |= tryFoldCndMask(
MI);
2208 if (tryFoldZeroHighBits(
MI)) {
2213 if (
MI.isRegSequence() && tryFoldRegSequence(
MI)) {
2218 if (
MI.isPHI() && tryFoldPhiAGPR(
MI)) {
2223 if (
MI.mayLoad() && tryFoldLoad(
MI)) {
2228 if (
TII->isFoldableCopy(
MI)) {
2229 Changed |= tryFoldFoldableCopy(
MI, CurrentKnownM0Val);
2234 if (CurrentKnownM0Val &&
MI.modifiesRegister(AMDGPU::M0,
TRI))
2235 CurrentKnownM0Val =
nullptr;
2241 Changed |= tryFoldClamp(
MI);
2244 Changed |= tryOptimizeAGPRPhis(*
MBB);
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat)
Updates the operand at Idx in instruction Inst with the result of instruction Mat.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
iv Induction Variable Users
unsigned const TargetRegisterInfo * TRI
static bool isReg(const MCInst &MI, unsigned OpNo)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
static unsigned macToMad(unsigned Opc)
static bool isAGPRCopy(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, const MachineInstr &Copy, Register &OutReg, unsigned &OutSubReg)
Checks whether Copy is a AGPR -> VGPR copy.
static const TargetRegisterClass * getRegOpRC(const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const MachineOperand &MO)
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, uint32_t LHS, uint32_t RHS)
static int getOModValue(unsigned Opc, int64_t Val)
static bool isUseMIInFoldList(ArrayRef< FoldCandidate > FoldList, const MachineInstr *MI)
static unsigned getMovOpc(bool IsScalar)
static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc)
static void appendFoldCandidate(SmallVectorImpl< FoldCandidate > &FoldList, MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp, bool Commuted=false, int ShrinkOp=-1)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
support::ulittle16_t & Lo
support::ulittle16_t & Hi
Class for arbitrary precision integers.
Represent the analysis usage information of a pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
This class represents an Operation in the Expression.
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
FunctionPass class - This class is used to implement most global optimizations.
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
bool isVariadic() const
Return true if this instruction can have a variable number of operands.
This holds information about one operand of a machine instruction, indicating the register class for ...
iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg=Register(), bool SkipPseudoOp=true)
Return the first instruction in MBB after I that is not a PHI, label or debug.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
@ LQR_Dead
Register is known to be fully dead.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
unsigned getNumOperands() const
Retuns the total number of operands.
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
unsigned getOperandNo(const_mop_iterator I) const
Returns the number of the operand iterator I points to.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
bool isRegSequence() const
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
iterator_range< mop_iterator > implicit_operands()
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
void substVirtReg(Register Reg, unsigned SubIdx, const TargetRegisterInfo &)
substVirtReg - Substitute the current register with the virtual subregister Reg:SubReg.
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_GlobalAddress
Address of a global value.
@ MO_FrameIndex
Abstract Stack Frame Index.
@ MO_Register
Register operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
bool insert(const value_type &X)
Insert a new element into the SetVector.
A SetVector that performs no allocations if smaller than a certain size.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
static const unsigned CommuteAnyOperandIndex
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
A Use represents the edge between a Value definition and its users.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo)
Does this operand support only inlinable literals?
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
@ OPERAND_REG_INLINE_C_V2BF16
@ OPERAND_REG_IMM_V2INT16
@ OPERAND_REG_INLINE_C_V2FP16
@ OPERAND_REG_INLINE_AC_V2INT16
@ OPERAND_REG_INLINE_C_INT32
@ OPERAND_REG_INLINE_C_V2INT16
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY int getFlatScratchInstSSfromSV(uint16_t Opcode)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
NodeAddr< DefNode * > Def
This is an optimization pass for GlobalISel generic memory operations.
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
FunctionPass * createSIFoldOperandsPass()
DWARFExpression::Operation Op
void initializeSIFoldOperandsPass(PassRegistry &)
iterator_range< df_iterator< T > > depth_first(const T &G)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Description of the encoding of one expression Op.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
A pair composed of a register and a sub-register index.