19#define DEBUG_TYPE "si-fold-operands"
37 bool Commuted_ =
false,
39 UseMI(
MI), OpToFold(
nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
42 if (FoldOp->
isImm()) {
43 ImmToFold = FoldOp->
getImm();
44 }
else if (FoldOp->
isFI()) {
45 FrameIndexToFold = FoldOp->
getIndex();
66 bool needsShrink()
const {
return ShrinkOpcode != -1; }
83 bool canUseImmWithOpSel(FoldCandidate &Fold)
const;
85 bool tryFoldImmWithOpSel(FoldCandidate &Fold)
const;
93 getRegSeqInit(
SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
115 std::pair<const MachineOperand *, int> isOMod(
const MachineInstr &
MI)
const;
141 "SI Fold Operands",
false,
false)
143char SIFoldOperands::
ID = 0;
152 TRI.getSubRegisterClass(RC, MO.getSubReg()))
160 case AMDGPU::V_MAC_F32_e64:
161 return AMDGPU::V_MAD_F32_e64;
162 case AMDGPU::V_MAC_F16_e64:
163 return AMDGPU::V_MAD_F16_e64;
164 case AMDGPU::V_FMAC_F32_e64:
165 return AMDGPU::V_FMA_F32_e64;
166 case AMDGPU::V_FMAC_F16_e64:
167 return AMDGPU::V_FMA_F16_gfx9_e64;
168 case AMDGPU::V_FMAC_F16_t16_e64:
169 return AMDGPU::V_FMA_F16_gfx9_e64;
170 case AMDGPU::V_FMAC_LEGACY_F32_e64:
171 return AMDGPU::V_FMA_LEGACY_F32_e64;
172 case AMDGPU::V_FMAC_F64_e64:
173 return AMDGPU::V_FMA_F64_e64;
175 return AMDGPU::INSTRUCTION_LIST_END;
182 if (!OpToFold.
isFI())
185 const unsigned Opc =
UseMI.getOpcode();
196 return OpNo == VIdx && SIdx == -1;
200 return new SIFoldOperands();
203bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold)
const {
206 const uint64_t TSFlags =
MI->getDesc().TSFlags;
215 unsigned Opcode =
MI->getOpcode();
216 int OpNo =
MI->getOperandNo(&Old);
217 uint8_t OpType =
TII->get(Opcode).operands()[OpNo].OperandType;
233bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold)
const {
236 unsigned Opcode =
MI->getOpcode();
237 int OpNo =
MI->getOperandNo(&Old);
238 uint8_t OpType =
TII->get(Opcode).operands()[OpNo].OperandType;
251 unsigned SrcIdx = ~0;
253 ModIdx = AMDGPU::OpName::src0_modifiers;
256 ModIdx = AMDGPU::OpName::src1_modifiers;
259 ModIdx = AMDGPU::OpName::src2_modifiers;
265 unsigned ModVal =
Mod.getImm();
289 Mod.setImm(NewModVal);
294 if (
static_cast<int16_t
>(
Lo) < 0) {
295 int32_t SExt =
static_cast<int16_t
>(
Lo);
297 Mod.setImm(NewModVal);
324 if (tryFoldToInline(Imm))
333 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
334 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
335 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
338 bool Clamp =
MI->getOperand(ClampIdx).getImm() != 0;
345 if (tryFoldToInline(NegImm)) {
347 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
348 MI->setDesc(
TII->get(NegOpcode));
357bool SIFoldOperands::updateOperand(FoldCandidate &Fold)
const {
362 if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
363 if (tryFoldImmWithOpSel(Fold))
369 int OpNo =
MI->getOperandNo(&Old);
370 if (!
TII->isOperandLegal(*
MI, OpNo, &New))
376 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
384 int Op32 = Fold.ShrinkOpcode;
389 bool HaveNonDbgCarryUse = !
MRI->use_nodbg_empty(Dst1.
getReg());
392 Register NewReg0 =
MRI->createVirtualRegister(Dst0RC);
396 if (HaveNonDbgCarryUse) {
409 for (
unsigned I =
MI->getNumOperands() - 1;
I > 0; --
I)
410 MI->removeOperand(
I);
411 MI->setDesc(
TII->get(AMDGPU::IMPLICIT_DEF));
414 TII->commuteInstruction(*Inst32,
false);
418 assert(!Fold.needsShrink() &&
"not handled");
423 if (NewMFMAOpc == -1)
425 MI->setDesc(
TII->get(NewMFMAOpc));
426 MI->untieRegOperand(0);
432 if (Fold.isGlobal()) {
433 Old.
ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
434 Fold.OpToFold->getTargetFlags());
451 return any_of(FoldList, [&](
const auto &
C) {
return C.UseMI ==
MI; });
459 for (FoldCandidate &Fold : FoldList)
460 if (Fold.UseMI ==
MI && Fold.UseOpNo == OpNo)
462 LLVM_DEBUG(
dbgs() <<
"Append " << (Commuted ?
"commuted" :
"normal")
463 <<
" operand " << OpNo <<
"\n " << *
MI);
470 const unsigned Opc =
MI->getOpcode();
472 auto tryToFoldAsFMAAKorMK = [&]() {
473 if (!OpToFold->
isImm())
476 const bool TryAK = OpNo == 3;
477 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
478 MI->setDesc(
TII->get(NewOpc));
481 bool FoldAsFMAAKorMK =
482 tryAddToFoldList(FoldList,
MI, TryAK ? 3 : 2, OpToFold);
483 if (FoldAsFMAAKorMK) {
485 MI->untieRegOperand(3);
502 MI->setDesc(
TII->get(Opc));
506 bool IsLegal =
TII->isOperandLegal(*
MI, OpNo, OpToFold);
507 if (!IsLegal && OpToFold->
isImm()) {
508 FoldCandidate Fold(
MI, OpNo, OpToFold);
509 IsLegal = canUseImmWithOpSel(Fold);
515 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
518 MI->setDesc(
TII->get(NewOpc));
523 bool FoldAsMAD = tryAddToFoldList(FoldList,
MI, OpNo, OpToFold);
525 MI->untieRegOperand(OpNo);
529 MI->removeOperand(
MI->getNumExplicitOperands() - 1);
530 MI->setDesc(
TII->get(Opc));
535 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
536 if (tryToFoldAsFMAAKorMK())
541 if (OpToFold->
isImm()) {
543 if (Opc == AMDGPU::S_SETREG_B32)
544 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
545 else if (Opc == AMDGPU::S_SETREG_B32_mode)
546 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
548 MI->setDesc(
TII->get(ImmOpc));
563 bool CanCommute =
TII->findCommutedOpIndices(*
MI, OpNo, CommuteOpNo);
571 if (!
MI->getOperand(OpNo).isReg() || !
MI->getOperand(CommuteOpNo).isReg())
574 if (!
TII->commuteInstruction(*
MI,
false, OpNo, CommuteOpNo))
578 if (!
TII->isOperandLegal(*
MI, CommuteOpNo, OpToFold)) {
579 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
580 Opc != AMDGPU::V_SUBREV_CO_U32_e64) ||
582 TII->commuteInstruction(*
MI,
false, OpNo, CommuteOpNo);
589 if (!OtherOp.
isReg() ||
596 unsigned MaybeCommutedOpc =
MI->getOpcode();
606 if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
607 !OpToFold->
isReg() && !
TII->isInlineConstant(*OpToFold)) {
608 unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
610 if (!OpImm.
isReg() &&
611 TII->isInlineConstant(*
MI,
MI->getOperand(OpNo), OpImm))
612 return tryToFoldAsFMAAKorMK();
620 if (Opc == AMDGPU::S_FMAC_F32 &&
621 (OpNo != 1 || !
MI->getOperand(1).isIdenticalTo(
MI->getOperand(2)))) {
622 if (tryToFoldAsFMAAKorMK())
628 if (
TII->isSALU(
MI->getOpcode())) {
633 if (!OpToFold->
isReg() && !
TII->isInlineConstant(*OpToFold, OpInfo)) {
635 for (
unsigned i = 0, e = InstDesc.
getNumOperands(); i != e; ++i) {
636 auto &
Op =
MI->getOperand(i);
637 if (OpNo != i && !
Op.isReg() &&
651 return !
TII->isSDWA(
MI);
657bool SIFoldOperands::getRegSeqInit(
661 if (!Def || !
Def->isRegSequence())
664 for (
unsigned I = 1, E =
Def->getNumExplicitOperands();
I < E;
I += 2) {
671 SubDef =
MRI->getVRegDef(Sub->
getReg())) {
674 if (
TII->isInlineConstant(*
Op, OpTy))
678 if (!
Op->isReg() ||
Op->getReg().isPhysical())
683 Defs.emplace_back(Sub,
Def->getOperand(
I + 1).getImm());
689bool SIFoldOperands::tryToFoldACImm(
693 if (UseOpIdx >=
Desc.getNumOperands())
699 uint8_t OpTy =
Desc.operands()[UseOpIdx].OperandType;
700 if (OpToFold.
isImm() &&
TII->isInlineConstant(OpToFold, OpTy) &&
701 TII->isOperandLegal(*
UseMI, UseOpIdx, &OpToFold)) {
706 if (!OpToFold.
isReg())
719 if (!UseOp.
getSubReg() && Def &&
TII->isFoldableCopy(*Def)) {
721 if (DefOp.
isImm() &&
TII->isInlineConstant(DefOp, OpTy) &&
722 TII->isOperandLegal(*
UseMI, UseOpIdx, &DefOp)) {
729 if (!getRegSeqInit(Defs,
UseReg, OpTy))
733 for (
unsigned I = 0, E = Defs.
size();
I != E; ++
I) {
738 auto SubImm =
Op->getImm();
741 if (!
TII->isInlineConstant(*
Op, OpTy) ||
755void SIFoldOperands::foldOperand(
763 if (!isUseSafeToFold(*
UseMI, *UseOp))
780 for (
auto &
Use :
MRI->use_nodbg_operands(RegSeqDstReg))
782 for (
auto *RSUse : UsesToProcess) {
789 if (RSUse->getSubReg() != RegSeqDstSubReg)
792 foldOperand(OpToFold, RSUseMI, RSUseMI->
getOperandNo(RSUse), FoldList,
798 if (tryToFoldACImm(OpToFold,
UseMI, UseOpIdx, FoldList))
801 if (frameIndexMayFold(*
UseMI, UseOpIdx, OpToFold)) {
806 if (
TII->getNamedOperand(*
UseMI, AMDGPU::OpName::srsrc)->getReg() !=
807 MFI->getScratchRSrcReg())
813 *
TII->getNamedOperand(*
UseMI, AMDGPU::OpName::soffset);
833 bool FoldingImmLike =
851 if (DestRC == &AMDGPU::AGPR_32RegClass &&
863 unsigned MovOp =
TII->getMovOpcode(DestRC);
864 if (MovOp == AMDGPU::COPY)
869 while (ImpOpI != ImpOpE) {
876 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
921 for (
unsigned I = 0;
I <
Size / 4; ++
I) {
926 int64_t
Imm =
Def->getImm();
928 auto Tmp =
MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
930 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).
addImm(Imm);
932 }
else if (
Def->isReg() &&
TRI->isAGPR(*
MRI,
Def->getReg())) {
934 Def->setIsKill(
false);
935 if (!SeenAGPRs.
insert(Src)) {
946 Def->setIsKill(
false);
952 if (
TRI->isSGPRReg(*
MRI, Src.Reg)) {
955 auto Tmp =
MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
961 if (CopyToVGPR.
Reg) {
963 if (VGPRCopies.
count(CopyToVGPR)) {
964 Vgpr = VGPRCopies[CopyToVGPR];
966 Vgpr =
MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
968 VGPRCopies[CopyToVGPR] = Vgpr;
970 auto Tmp =
MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
972 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).
addReg(Vgpr);
976 B.addImm(Defs[
I].second);
989 else if (
TRI->isVGPR(*
MRI, Reg0) &&
TRI->isAGPR(*
MRI, Reg1))
991 else if (
ST->hasGFX90AInsts() &&
TRI->isAGPR(*
MRI, Reg0) &&
998 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
999 (UseOpc == AMDGPU::V_READLANE_B32 &&
1006 if (FoldingImmLike) {
1015 if (OpToFold.
isImm())
1048 UseDesc.
operands()[UseOpIdx].RegClass == -1)
1052 if (!FoldingImmLike) {
1053 if (OpToFold.
isReg() &&
ST->needsAlignedVGPRs()) {
1058 if (
TRI->hasVectorRegisters(RC) && OpToFold.
getSubReg()) {
1065 if (!RC || !
TRI->isProperlyAlignedRC(*RC))
1069 tryAddToFoldList(FoldList,
UseMI, UseOpIdx, &OpToFold);
1080 TRI->getRegClass(FoldDesc.
operands()[0].RegClass);
1090 if (UseOp->
getSubReg() == AMDGPU::sub0) {
1098 tryAddToFoldList(FoldList,
UseMI, UseOpIdx, &ImmOp);
1102 tryAddToFoldList(FoldList,
UseMI, UseOpIdx, &OpToFold);
1108 case AMDGPU::V_AND_B32_e64:
1109 case AMDGPU::V_AND_B32_e32:
1110 case AMDGPU::S_AND_B32:
1113 case AMDGPU::V_OR_B32_e64:
1114 case AMDGPU::V_OR_B32_e32:
1115 case AMDGPU::S_OR_B32:
1118 case AMDGPU::V_XOR_B32_e64:
1119 case AMDGPU::V_XOR_B32_e32:
1120 case AMDGPU::S_XOR_B32:
1123 case AMDGPU::S_XNOR_B32:
1126 case AMDGPU::S_NAND_B32:
1129 case AMDGPU::S_NOR_B32:
1132 case AMDGPU::S_ANDN2_B32:
1133 Result =
LHS & ~RHS;
1135 case AMDGPU::S_ORN2_B32:
1136 Result =
LHS | ~RHS;
1138 case AMDGPU::V_LSHL_B32_e64:
1139 case AMDGPU::V_LSHL_B32_e32:
1140 case AMDGPU::S_LSHL_B32:
1142 Result =
LHS << (
RHS & 31);
1144 case AMDGPU::V_LSHLREV_B32_e64:
1145 case AMDGPU::V_LSHLREV_B32_e32:
1146 Result =
RHS << (
LHS & 31);
1148 case AMDGPU::V_LSHR_B32_e64:
1149 case AMDGPU::V_LSHR_B32_e32:
1150 case AMDGPU::S_LSHR_B32:
1151 Result =
LHS >> (
RHS & 31);
1153 case AMDGPU::V_LSHRREV_B32_e64:
1154 case AMDGPU::V_LSHRREV_B32_e32:
1155 Result =
RHS >> (
LHS & 31);
1157 case AMDGPU::V_ASHR_I32_e64:
1158 case AMDGPU::V_ASHR_I32_e32:
1159 case AMDGPU::S_ASHR_I32:
1160 Result =
static_cast<int32_t
>(
LHS) >> (
RHS & 31);
1162 case AMDGPU::V_ASHRREV_I32_e64:
1163 case AMDGPU::V_ASHRREV_I32_e32:
1164 Result =
static_cast<int32_t
>(
RHS) >> (
LHS & 31);
1172 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1176 MI.setDesc(NewDesc);
1182 unsigned NumOps =
Desc.getNumOperands() +
Desc.implicit_uses().size() +
1183 Desc.implicit_defs().size();
1185 for (
unsigned I =
MI.getNumOperands() - 1;
I >= NumOps; --
I)
1186 MI.removeOperand(
I);
1192 if (!
Op.isReg() ||
Op.getSubReg() != AMDGPU::NoSubRegister ||
1193 !
Op.getReg().isVirtual())
1197 if (Def &&
Def->isMoveImmediate()) {
1210 if (!
MI->allImplicitDefsAreDead())
1213 unsigned Opc =
MI->getOpcode();
1220 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1221 Opc == AMDGPU::S_NOT_B32) &&
1223 MI->getOperand(1).ChangeToImmediate(~Src0->
getImm());
1244 bool IsSGPR =
TRI->isSGPRReg(*
MRI,
MI->getOperand(0).getReg());
1248 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1249 MI->removeOperand(Src1Idx);
1254 if (!
MI->isCommutable())
1262 int32_t Src1Val =
static_cast<int32_t
>(Src1->
getImm());
1263 if (Opc == AMDGPU::V_OR_B32_e64 ||
1264 Opc == AMDGPU::V_OR_B32_e32 ||
1265 Opc == AMDGPU::S_OR_B32) {
1268 MI->removeOperand(Src1Idx);
1270 }
else if (Src1Val == -1) {
1272 MI->removeOperand(Src1Idx);
1280 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1281 Opc == AMDGPU::S_AND_B32) {
1284 MI->removeOperand(Src0Idx);
1286 }
else if (Src1Val == -1) {
1288 MI->removeOperand(Src1Idx);
1296 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1297 Opc == AMDGPU::S_XOR_B32) {
1300 MI->removeOperand(Src1Idx);
1311 unsigned Opc =
MI.getOpcode();
1312 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1313 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1319 auto *Src0Imm = getImmOrMaterializedImm(*Src0);
1320 auto *Src1Imm = getImmOrMaterializedImm(*Src1);
1321 if (!Src1Imm->isIdenticalTo(*Src0Imm))
1329 if ((Src1ModIdx != -1 &&
MI.getOperand(Src1ModIdx).getImm() != 0) ||
1330 (Src0ModIdx != -1 &&
MI.getOperand(Src0ModIdx).getImm() != 0))
1338 MI.removeOperand(Src2Idx);
1340 if (Src1ModIdx != -1)
1341 MI.removeOperand(Src1ModIdx);
1342 if (Src0ModIdx != -1)
1343 MI.removeOperand(Src0ModIdx);
1349bool SIFoldOperands::tryFoldZeroHighBits(
MachineInstr &
MI)
const {
1350 if (
MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1351 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1360 if (!
ST->zeroesHigh16BitsOfDest(SrcDef->
getOpcode()))
1365 MI.eraseFromParent();
1377 bool Changed =
false;
1379 if (OpToFold.
isImm()) {
1390 if (tryConstantFoldOp(&
UseMI)) {
1398 for (
auto &
Use :
MRI->use_nodbg_operands(Dst.getReg()))
1400 for (
auto *U : UsesToProcess) {
1406 if (CopiesToReplace.
empty() && FoldList.
empty())
1412 Copy->addImplicitDefUseOperands(*MF);
1414 for (FoldCandidate &Fold : FoldList) {
1415 assert(!Fold.isReg() || Fold.OpToFold);
1416 if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1426 assert(Fold.OpToFold && Fold.OpToFold->isReg());
1430 MRI->clearKillFlags(Fold.OpToFold->getReg());
1433 <<
static_cast<int>(Fold.UseOpNo) <<
" of "
1435 }
else if (Fold.Commuted) {
1437 TII->commuteInstruction(*Fold.UseMI,
false);
1443bool SIFoldOperands::tryFoldFoldableCopy(
1447 if (
MI.getOperand(0).getReg() == AMDGPU::M0) {
1449 if (CurrentKnownM0Val && CurrentKnownM0Val->
isIdenticalTo(NewM0Val)) {
1450 MI.eraseFromParent();
1465 if (!FoldingImm && !OpToFold.
isReg())
1477 if (!
MI.getOperand(0).getReg().isVirtual())
1480 bool Changed = foldInstOperand(
MI, OpToFold);
1487 auto *InstToErase = &
MI;
1488 while (
MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1489 auto &
SrcOp = InstToErase->getOperand(1);
1491 InstToErase->eraseFromParent();
1493 InstToErase =
nullptr;
1496 InstToErase =
MRI->getVRegDef(SrcReg);
1497 if (!InstToErase || !
TII->isFoldableCopy(*InstToErase))
1501 if (InstToErase && InstToErase->isRegSequence() &&
1502 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1503 InstToErase->eraseFromParent();
1513 unsigned Op =
MI.getOpcode();
1515 case AMDGPU::V_MAX_F32_e64:
1516 case AMDGPU::V_MAX_F16_e64:
1517 case AMDGPU::V_MAX_F16_t16_e64:
1518 case AMDGPU::V_MAX_F16_fake16_e64:
1519 case AMDGPU::V_MAX_F64_e64:
1520 case AMDGPU::V_MAX_NUM_F64_e64:
1521 case AMDGPU::V_PK_MAX_F16: {
1522 if (!
TII->getNamedOperand(
MI, AMDGPU::OpName::clamp)->getImm())
1531 Src0->
getSubReg() != AMDGPU::NoSubRegister)
1535 if (
TII->hasModifiersSet(
MI, AMDGPU::OpName::omod))
1539 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0_modifiers)->getImm();
1541 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1_modifiers)->getImm();
1547 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1559 if (!ClampSrc || !
MRI->hasOneNonDBGUser(ClampSrc->
getReg()))
1565 if (
TII->getClampMask(*Def) !=
TII->getClampMask(
MI))
1572 LLVM_DEBUG(
dbgs() <<
"Folding clamp " << *DefClamp <<
" into " << *Def);
1576 MRI->replaceRegWith(
MI.getOperand(0).getReg(),
Def->getOperand(0).getReg());
1577 MI.eraseFromParent();
1582 if (
TII->convertToThreeAddress(*Def,
nullptr,
nullptr))
1583 Def->eraseFromParent();
1590 case AMDGPU::V_MUL_F64_e64:
1591 case AMDGPU::V_MUL_F64_pseudo_e64: {
1593 case 0x3fe0000000000000:
1595 case 0x4000000000000000:
1597 case 0x4010000000000000:
1603 case AMDGPU::V_MUL_F32_e64: {
1604 switch (
static_cast<uint32_t>(Val)) {
1615 case AMDGPU::V_MUL_F16_e64:
1616 case AMDGPU::V_MUL_F16_t16_e64:
1617 case AMDGPU::V_MUL_F16_fake16_e64: {
1618 switch (
static_cast<uint16_t>(Val)) {
1637std::pair<const MachineOperand *, int>
1639 unsigned Op =
MI.getOpcode();
1641 case AMDGPU::V_MUL_F64_e64:
1642 case AMDGPU::V_MUL_F64_pseudo_e64:
1643 case AMDGPU::V_MUL_F32_e64:
1644 case AMDGPU::V_MUL_F16_t16_e64:
1645 case AMDGPU::V_MUL_F16_fake16_e64:
1646 case AMDGPU::V_MUL_F16_e64: {
1648 if ((
Op == AMDGPU::V_MUL_F32_e64 &&
1650 ((
Op == AMDGPU::V_MUL_F64_e64 ||
Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
1651 Op == AMDGPU::V_MUL_F16_e64 ||
Op == AMDGPU::V_MUL_F16_t16_e64 ||
1652 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
1660 if (Src0->
isImm()) {
1663 }
else if (Src1->
isImm()) {
1671 TII->hasModifiersSet(
MI, AMDGPU::OpName::src0_modifiers) ||
1672 TII->hasModifiersSet(
MI, AMDGPU::OpName::src1_modifiers) ||
1673 TII->hasModifiersSet(
MI, AMDGPU::OpName::omod) ||
1674 TII->hasModifiersSet(
MI, AMDGPU::OpName::clamp))
1677 return std::pair(RegOp, OMod);
1679 case AMDGPU::V_ADD_F64_e64:
1680 case AMDGPU::V_ADD_F64_pseudo_e64:
1681 case AMDGPU::V_ADD_F32_e64:
1682 case AMDGPU::V_ADD_F16_e64:
1683 case AMDGPU::V_ADD_F16_t16_e64:
1684 case AMDGPU::V_ADD_F16_fake16_e64: {
1686 if ((
Op == AMDGPU::V_ADD_F32_e64 &&
1688 ((
Op == AMDGPU::V_ADD_F64_e64 ||
Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
1689 Op == AMDGPU::V_ADD_F16_e64 ||
Op == AMDGPU::V_ADD_F16_t16_e64 ||
1690 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
1700 !
TII->hasModifiersSet(
MI, AMDGPU::OpName::src0_modifiers) &&
1701 !
TII->hasModifiersSet(
MI, AMDGPU::OpName::src1_modifiers) &&
1702 !
TII->hasModifiersSet(
MI, AMDGPU::OpName::clamp) &&
1703 !
TII->hasModifiersSet(
MI, AMDGPU::OpName::omod))
1717 std::tie(RegOp, OMod) = isOMod(
MI);
1719 RegOp->
getSubReg() != AMDGPU::NoSubRegister ||
1720 !
MRI->hasOneNonDBGUser(RegOp->
getReg()))
1730 if (
TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1736 MRI->replaceRegWith(
MI.getOperand(0).getReg(),
Def->getOperand(0).getReg());
1737 MI.eraseFromParent();
1742 if (
TII->convertToThreeAddress(*Def,
nullptr,
nullptr))
1743 Def->eraseFromParent();
1752 auto Reg =
MI.getOperand(0).getReg();
1754 if (!
ST->hasGFX90AInsts() || !
TRI->isVGPR(*
MRI, Reg) ||
1755 !
MRI->hasOneNonDBGUse(Reg))
1762 for (
auto &Def : Defs) {
1763 const auto *
Op =
Def.first;
1766 if (
TRI->isAGPR(*
MRI,
Op->getReg()))
1780 if (!
TRI->isVGPR(*
MRI, Reg) || !
MRI->hasOneNonDBGUse(Reg))
1782 Op = &*
MRI->use_nodbg_begin(Reg);
1786 if (
Op->getSubReg())
1792 TII->getRegClass(InstDesc, OpIdx,
TRI, *
MI.getMF());
1793 if (!OpRC || !
TRI->isVectorSuperClass(OpRC))
1796 const auto *NewDstRC =
TRI->getEquivalentAGPRClass(
MRI->getRegClass(Reg));
1797 auto Dst =
MRI->createVirtualRegister(NewDstRC);
1799 TII->get(AMDGPU::REG_SEQUENCE), Dst);
1801 for (
unsigned I = 0;
I < Defs.size(); ++
I) {
1803 Def->setIsKill(
false);
1811 RS.addImm(Defs[
I].second);
1815 if (!
TII->isOperandLegal(*
UseMI, OpIdx,
Op)) {
1817 RS->eraseFromParent();
1825 if (
MRI->use_nodbg_empty(
MI.getOperand(0).getReg()))
1826 MI.eraseFromParent();
1834 Register &OutReg,
unsigned &OutSubReg) {
1844 if (
TRI.isAGPR(
MRI, CopySrcReg)) {
1845 OutReg = CopySrcReg;
1854 if (!CopySrcDef || !CopySrcDef->
isCopy())
1861 OtherCopySrc.
getSubReg() != AMDGPU::NoSubRegister ||
1862 !
TRI.isAGPR(
MRI, OtherCopySrcReg))
1865 OutReg = OtherCopySrcReg;
1903 if (!
TRI->isVGPR(*
MRI, PhiOut))
1909 for (
unsigned K = 1;
K <
PHI.getNumExplicitOperands();
K += 2) {
1912 if (!Copy || !
Copy->isCopy())
1916 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
1921 if (
const auto *SubRC =
TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
1932 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
1936 for (
unsigned K = 1;
K <
PHI.getNumExplicitOperands();
K += 2) {
1944 unsigned CopyOpc = AMDGPU::COPY;
1949 if (
Def->isCopy()) {
1951 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
1965 if (IsAGPR32 && !
ST->hasGFX90AInsts() && !
MRI->hasOneNonDBGUse(Reg) &&
1967 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1970 InsertMBB =
Def->getParent();
1977 Register NewReg =
MRI->createVirtualRegister(ARC);
1979 TII->get(CopyOpc), NewReg)
1988 Register NewReg =
MRI->createVirtualRegister(ARC);
1989 PHI.getOperand(0).setReg(NewReg);
1995 TII->get(AMDGPU::COPY), PhiOut)
2005 if (!
ST->hasGFX90AInsts() ||
MI.getNumExplicitDefs() != 1)
2026 while (!
Users.empty()) {
2028 if (!
I->isCopy() && !
I->isRegSequence())
2030 Register DstReg =
I->getOperand(0).getReg();
2034 if (
TRI->isAGPR(*
MRI, DstReg))
2038 Users.push_back(&U);
2042 MRI->setRegClass(DefReg,
TRI->getEquivalentAGPRClass(RC));
2043 if (!
TII->isOperandLegal(
MI, 0, &Def)) {
2044 MRI->setRegClass(DefReg, RC);
2048 while (!MoveRegs.
empty()) {
2050 MRI->setRegClass(Reg,
TRI->getEquivalentAGPRClass(
MRI->getRegClass(Reg)));
2093 if (
ST->hasGFX90AInsts())
2100 for (
auto &
MI :
MBB) {
2104 if (!
TRI->isAGPR(*
MRI,
MI.getOperand(0).getReg()))
2107 for (
unsigned K = 1;
K <
MI.getNumOperands();
K += 2) {
2115 bool Changed =
false;
2116 for (
const auto &[Entry, MOs] : RegToMO) {
2117 if (MOs.size() == 1)
2128 MRI->createVirtualRegister(
TRI->getEquivalentVGPRClass(ARC));
2131 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2135 Register TempAGPR =
MRI->createVirtualRegister(ARC);
2137 TII->get(AMDGPU::COPY), TempAGPR)
2159 TII =
ST->getInstrInfo();
2160 TRI = &
TII->getRegisterInfo();
2168 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2170 bool Changed =
false;
2174 Changed |= tryFoldCndMask(
MI);
2176 if (tryFoldZeroHighBits(
MI)) {
2181 if (
MI.isRegSequence() && tryFoldRegSequence(
MI)) {
2186 if (
MI.isPHI() && tryFoldPhiAGPR(
MI)) {
2191 if (
MI.mayLoad() && tryFoldLoad(
MI)) {
2196 if (
TII->isFoldableCopy(
MI)) {
2197 Changed |= tryFoldFoldableCopy(
MI, CurrentKnownM0Val);
2202 if (CurrentKnownM0Val &&
MI.modifiesRegister(AMDGPU::M0,
TRI))
2203 CurrentKnownM0Val =
nullptr;
2209 Changed |= tryFoldClamp(
MI);
2212 Changed |= tryOptimizeAGPRPhis(*
MBB);
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat)
Updates the operand at Idx in instruction Inst with the result of instruction Mat.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
iv Induction Variable Users
unsigned const TargetRegisterInfo * TRI
static bool isReg(const MCInst &MI, unsigned OpNo)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
static unsigned macToMad(unsigned Opc)
static bool isAGPRCopy(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, const MachineInstr &Copy, Register &OutReg, unsigned &OutSubReg)
Checks whether Copy is an AGPR -> VGPR copy.
static const TargetRegisterClass * getRegOpRC(const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const MachineOperand &MO)
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, uint32_t LHS, uint32_t RHS)
static int getOModValue(unsigned Opc, int64_t Val)
static bool isUseMIInFoldList(ArrayRef< FoldCandidate > FoldList, const MachineInstr *MI)
static unsigned getMovOpc(bool IsScalar)
static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc)
static void appendFoldCandidate(SmallVectorImpl< FoldCandidate > &FoldList, MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp, bool Commuted=false, int ShrinkOp=-1)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
support::ulittle16_t & Lo
support::ulittle16_t & Hi
Class for arbitrary precision integers.
Represent the analysis usage information of a pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
This class represents an Operation in the Expression.
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
FunctionPass class - This class is used to implement most global optimizations.
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
bool isVariadic() const
Return true if this instruction can have a variable number of operands.
This holds information about one operand of a machine instruction, indicating the register class for ...
iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg=Register(), bool SkipPseudoOp=true)
Return the first instruction in MBB after I that is not a PHI, label or debug.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
@ LQR_Dead
Register is known to be fully dead.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
unsigned getNumOperands() const
Returns the total number of operands.
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
unsigned getOperandNo(const_mop_iterator I) const
Returns the number of the operand iterator I points to.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
bool isRegSequence() const
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
iterator_range< mop_iterator > implicit_operands()
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
void substVirtReg(Register Reg, unsigned SubIdx, const TargetRegisterInfo &)
substVirtReg - Substitute the current register with the virtual subregister Reg:SubReg.
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_GlobalAddress
Address of a global value.
@ MO_FrameIndex
Abstract Stack Frame Index.
@ MO_Register
Register operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
bool insert(const value_type &X)
Insert a new element into the SetVector.
A SetVector that performs no allocations if smaller than a certain size.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
static const unsigned CommuteAnyOperandIndex
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
A Use represents the edge between a Value definition and its users.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo)
Does this operand support only inlinable literals?
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
@ OPERAND_REG_INLINE_C_V2BF16
@ OPERAND_REG_IMM_V2INT16
@ OPERAND_REG_INLINE_C_V2FP16
@ OPERAND_REG_INLINE_AC_V2INT16
@ OPERAND_REG_INLINE_C_INT32
@ OPERAND_REG_INLINE_C_V2INT16
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY int getFlatScratchInstSSfromSV(uint16_t Opcode)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
NodeAddr< DefNode * > Def
This is an optimization pass for GlobalISel generic memory operations.
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
FunctionPass * createSIFoldOperandsPass()
DWARFExpression::Operation Op
void initializeSIFoldOperandsPass(PassRegistry &)
iterator_range< df_iterator< T > > depth_first(const T &G)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Description of the encoding of one expression Op.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
A pair composed of a register and a sub-register index.