struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));
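// The option above feeds checkMFMAPadding() further down. A hedged usage
// example (assuming the flag is passed straight to llc, as backend cl::opts
// normally are; the target/CPU choice here is only illustrative):
//   llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-padding-ratio=50 kernel.ll
// requests that 50% of the latency between neighboring MFMAs be filled with
// s_nop wait states.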
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr),
      TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  TSchedModel.init(&ST);
}

void GCNHazardRecognizer::Reset() { EmittedInstrs.clear(); }
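// EmittedInstrs is the scoreboard window consulted by getWaitStatesSince()
// when the recognizer runs inside the scheduler: one entry per emitted cycle,
// with nullptr standing in for an idle wait state (see EmitNoop() and
// AdvanceCycle() below).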
static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) { return Opcode == AMDGPU::S_GETREG_B32; }

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) { return Opcode == AMDGPU::S_RFE_B64; }

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}
// In isXDL(ST, MI):
  unsigned Opcode = MI.getOpcode();
  if (/* not an MFMA, a DGEMM, or */
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;
  if (!ST.hasGFX940Insts())
    return true;

// In isSendMsgTraceDataOrGDS(TII, MI):
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    break;
  }

  if (TII.isDS(MI.getOpcode())) {
    int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::gds);
    if (MI.getOperand(GDS).getImm())
      return true;
  }

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
}

// In getHWReg(TII, RegInstr): the hardware register is decoded from the
// instruction's simm16 operand.
  TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16);
// In GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls), each check
// below reports a hazard for MI when it needs a positive number of wait
// states:

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) &&
      checkVMEMHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if ((/* reads M0: v_interp, s_movrel*, lds_direct, ... */
       MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
       MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32 ||
       MI->readsRegister(AMDGPU::LDS_DIRECT)) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    // A single s_nop can cover at most 8 wait states.
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
void GCNHazardRecognizer::processBundle() {
  // Walk the instructions inside the bundle that starts at CurrCycleInstr.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);
    }

    // Only up to MaxLookAhead - 1 placeholder entries are worth tracking.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
  }
  CurrCycleInstr = nullptr;
}
// Standalone (non-scheduler) processing of a single instruction, cf.
// PreEmitNoops()/PreEmitNoopsCommon():
  assert(IsHazardRecognizerMode);

  if (MI->isInsideBundle()) {
    // Bundled instructions are walked via processBundle() instead.
  }

  IsHazardRecognizerMode = true;
  CurrCycleInstr = nullptr;
// In GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI), each class of
// instruction folds its own check into the running wait-state maximum:

  return std::max(WaitStates, checkSMRDHazards(MI));          // SMRD

  WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
  WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
  WaitStates = std::max(WaitStates, checkVALUHazards(MI));
  WaitStates = std::max(WaitStates, checkDPPHazards(MI));
  WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
  WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
  WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  return std::max(WaitStates, checkGetRegHazards(MI));        // s_getreg
  return std::max(WaitStates, checkSetRegHazards(MI));        // s_setreg
  return std::max(WaitStates, checkRFEHazards(MI));           // s_rfe

  // M0 readers (DS_WRITE_ADDTID_B32 / DS_READ_ADDTID_B32, lds_direct, ...):
  return std::max(WaitStates, checkReadM0Hazards(MI));

  return std::max(WaitStates, checkMAIHazards(MI));           // MAI / MFMA
  return std::max(WaitStates, checkMAILdStHazards(MI));       // MAI loads/stores
void GCNHazardRecognizer::EmitNoop() { EmittedInstrs.push_front(nullptr); }

void GCNHazardRecognizer::AdvanceCycle() {
  // A stall cycle: nothing was emitted, record an empty slot.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Track the emitted instruction, then one placeholder per extra wait state.
  EmittedInstrs.push_front(CurrCycleInstr);
  EmittedInstrs.push_front(nullptr);

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
  for (; I != MBB->instr_rend(); ++I) {
    switch (IsHazard(State, *I)) {
    // ... (hazard found / expired / not found handling) ...
    }

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }

  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
                  Visited))
      return true;
  }

  return false;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineBasicBlock *MBB,
                              MachineBasicBlock::const_reverse_instr_iterator I,
                              int WaitStates, IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited,
                              GetNumWaitStatesFn GetNumWaitStates =
                                  SIInstrInfo::getNumWaitStates) {
  for (; I != MBB->instr_rend(); ++I) {
    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);
    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}
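// Both helpers above walk the machine function backwards from a start point,
// following basic-block predecessors through the Visited set, until either the
// hazard predicate matches or the expiry condition / wait-state limit is hit;
// std::numeric_limits<int>::max() is the "no hazard found" sentinel used by
// the checks below.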
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  // Otherwise scan the EmittedInstrs scoreboard, skipping inline-asm entries
  // (MI->isInlineAsm()) and stopping once WaitStates >= Limit.
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  auto IsHazardFn = [=](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };
  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  bool IsSMRD = TII.isSMRD(*MEM);

  if (ClauseDefs.none())
    return 0;

  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;

  for (const MachineOperand &Use : SMRD->uses()) {
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  // Buffer SMRDs additionally wait on the offset SGPR.
  int WaitStatesNeededForUse =
      SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                 IsBufferHazardDefFn,
                                                 SmrdSgprWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;

  for (const MachineOperand &Use : VMEM->uses()) {
    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [&](const MachineInstr &MI) { return TII->isVALU(MI); };

  for (const MachineOperand &Use : DPP->uses()) {
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                  DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [&](const MachineInstr &MI) { return TII->isVALU(MI); };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
  const int GetRegWaitStates = 2;

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

  VDataRCID = Desc.operands()[VDataIdx].RegClass;

  TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  (!SOffset || !SOffset->isReg()))

  if (TII->isMIMG(MI)) {

  if (TII->isFLAT(MI)) {

GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,

  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;

  int DataIdx = createsVALUHazard(MI);
  return DataIdx >= 0 &&
         TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  const int TransDefWaitstates = 1;

  if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))

  int WaitStatesNeededForDef =
      TransDefWaitstates -
      getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  const int Shift16DefWaitstates = 1;

  if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))

  !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)

  if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))

  int WaitStatesNeededForDef =
      Shift16DefWaitstates -
      getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  const int VALUWriteSGPRVALUReadWaitstates = 2;
  const int VALUWriteEXECRWLane = 4;
  const int VALUWriteVGPRReadlaneRead = 1;

  return MI.modifiesRegister(UseReg, TRI);

  int WaitStatesNeededForDef =
      VALUWriteSGPRVALUReadWaitstates -
      getWaitStatesSince(IsVALUDefSGPRFn,
                         VALUWriteSGPRVALUReadWaitstates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
    int WaitStatesNeededForDef =
        VALUWriteSGPRVALUReadWaitstates -
        getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  switch (VALU->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32: {
    int WaitStatesNeededForDef =
        VALUWriteVGPRReadlaneRead -
        getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }
  case AMDGPU::V_WRITELANE_B32: {
    int WaitStatesNeededForDef =
        VALUWriteEXECRWLane -
        getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }
  }

  return WaitStatesNeeded;

  WaitStatesNeeded =
      std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  int WaitStatesNeeded = 0;

  if (Op.isReg() && Op.isDef()) {
    WaitStatesNeeded =
        std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  const int RWLaneWaitStates = 4;
  int WaitStatesSince =
      getWaitStatesSinceDef(LaneSelectReg, IsHazardFn, RWLaneWaitStates);

  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  const int RFEWaitStates = 1;

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

// In checkReadM0Hazards(MI):
  const int ReadM0WaitStates = 1;
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
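// The check*Hazards() routines above only report how many wait states are
// still outstanding. The fix*Hazards() routines below actively rewrite the
// instruction stream instead, inserting s_waitcnt_depctr, v_nop, or register
// moves when a hazard cannot otherwise be avoided.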
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  // (Several of these calls are guarded by subtarget feature checks that are
  // not shown in this excerpt.)
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  fixLdsDirectVALUHazard(MI);
  fixLdsDirectVMEMHazard(MI);
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
}
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  return (TII->isVOPC(MI) ||
          ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
         MI.modifiesRegister(AMDGPU::EXEC, TRI);

  unsigned Opc = MI.getOpcode();
  Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;

  std::numeric_limits<int>::max())

  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  bool IsUndef = Src0->isUndef();
  TII->get(AMDGPU::V_MOV_B32_e32))

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (MI->getNumDefs() == 0)

  I.findRegisterUseOperand(Def.getReg(), false, TRI);

  return (MI.getOpcode() == AMDGPU::S_WAITCNT &&
          !MI.getOperand(0).getImm()) ||
         (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
          MI.getOperand(0).getImm() == 0xffe3);

  std::numeric_limits<int>::max())

  TII->get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    SDSTName = AMDGPU::OpName::sdst;

  for (const auto &MO : MI->implicit_operands()) {
    if (MO.isDef() &&
        TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {

  if (TII->isSALU(MI)) {
    switch (MI.getOpcode()) {
    case AMDGPU::S_SETVSKIP:
    case AMDGPU::S_VERSION:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
      return (MI.getOperand(1).getImm() == 0) &&
             (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
    case AMDGPU::S_WAITCNT: {
      const int64_t Imm = MI.getOperand(0).getImm();
      return (Decoded.LgkmCnt == 0);

  if (TII->isSOPP(MI))

  std::numeric_limits<int>::max())

  TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))

  return I.readsRegister(AMDGPU::EXEC, TRI);

  if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))

  for (auto MO : MI.implicit_operands())
    if (MO.isDef() &&
        TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))

  if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
      (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)

  std::numeric_limits<int>::max())

  TII->get(AMDGPU::S_WAITCNT_DEPCTR))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

// In isStoreCountWaitZero(I):
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  auto InstType = IsHazardInst(*MI);

  auto InstType2 = IsHazardInst(I);
  return InstType2 && InstType != InstType2;

  auto InstType2 = IsHazardInst(I);
  if (InstType == InstType2)

  std::numeric_limits<int>::max();

  std::numeric_limits<int>::max())

  TII->get(AMDGPU::S_WAITCNT_VSCNT))
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  const int NoHazardWaitStates = 15;

  bool VisitedTrans = false;

  return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);

  if (WaitStates >= NoHazardWaitStates)

  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,

  TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);

  (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
  (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
   I.getOperand(0).getImm() == 0xffe3);

  std::numeric_limits<int>::max())

  TII.get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (SrcVGPRs.size() <= 1)

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  int ExecPos = std::numeric_limits<int>::max();

  if (State.VALUs > NoHazardVALUWaitStates)

  (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
   I.getOperand(0).getImm() == 0x0fff))

  bool Changed = false;

  if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
    State.DefPos[Src] = State.VALUs;

  if (State.ExecPos == std::numeric_limits<int>::max()) {
    if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
      State.ExecPos = State.VALUs;

  if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())

  if (State.ExecPos == std::numeric_limits<int>::max())

  int PreExecPos = std::numeric_limits<int>::max();
  int PostExecPos = std::numeric_limits<int>::max();

  for (auto Entry : State.DefPos) {
    int DefVALUs = Entry.second;
    if (DefVALUs != std::numeric_limits<int>::max()) {
      if (DefVALUs >= State.ExecPos)
        PreExecPos = std::min(PreExecPos, DefVALUs);
      else if (DefVALUs < State.ExecPos)
        PostExecPos = std::min(PostExecPos, DefVALUs);

  if (PostExecPos == std::numeric_limits<int>::max())

  int Intv3VALUs = PostExecPos;
  if (Intv3VALUs > Intv3MaxVALUs)

  int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
  if (Intv2VALUs > Intv1plus2MaxVALUs)

  if (PreExecPos == std::numeric_limits<int>::max())

  int Intv1VALUs = PreExecPos - State.ExecPos;
  if (Intv1VALUs > Intv1plus2MaxVALUs)

  if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))

  TII.get(AMDGPU::S_WAITCNT_DEPCTR))

bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)

  (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
   I.getOperand(0).getImm() == 0x0fff))

  if (I.modifiesRegister(Src, &TRI)) {

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))

  TII.get(AMDGPU::S_WAITCNT_DEPCTR))
// Hazard fix for back-to-back matrix ops whose result feeds the next one's
// sources (the previous instruction's vdst vs. this one's src0/src1/src2):
  TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
  TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

  TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

  if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
      TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {

  TII->getNamedOperand(*MI, AMDGPU::OpName::src2);

  if (CurSrc2Reg != AMDGPU::NoRegister &&
      TRI->regsOverlap(PrevDstReg, CurSrc2Reg)) {

  TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
  const bool NoSrc2Mods =
  return !(NoSrc2Mods && (TII->pseudoToMCOpcode(I.getOpcode()) ==
                          TII->pseudoToMCOpcode(MI->getOpcode())));

  std::numeric_limits<int>::max())

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:

  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))

  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
  bool Overlapped = OverlappedSrc || OverlappedDst;

  assert(!OverlappedDst || !OverlappedSrc ||
         Src1->getReg() == MI->getOperand(0).getReg());

  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {

  NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  MI->getOperand(0).setReg(NewReg);
  if (OverlappedSrc) {
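// What the excerpt above shows of fixShift64HighRegBug(): the workaround only
// triggers for 64-bit shifts whose shift-amount VGPR number is 7 mod 8; it
// then searches VGPR_32 (or VReg_64_Align2 when the amount overlaps the
// shift's own operands) for a register the instruction neither reads nor
// writes, and rewrites the shift's operands to use that scratch register
// instead.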
// In checkNSAtoVMEMHazard(MI):
  int NSAtoVMEMWaitStates = 1;

  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);

  return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
         TII->getInstSizeInBytes(I) >= 16;

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)

  switch (MI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAIT_IDLE:

  return FPAtomicToDenormModeWaitStates -

// In checkMFMAPadding(MI):
  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
                            this](const MachineInstr &MI) {
    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  int NeighborMFMAPaddingNeeded =
      (NeighborMFMALatency * MFMAPaddingRatio / 100) -
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
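// checkMFMAPadding() above implements the amdgpu-mfma-padding-ratio option:
// the requested percentage of getMFMAPipelineWaitStates() for the neighboring
// MFMA is compared against the wait states already elapsed since it, and only
// the shortfall (never a negative value) is returned as extra padding.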
// In checkMAIHazards() (AGPR-based MFMA rules):
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
        getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      const int MaxWaitStates = 2;

      int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)

  if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

  const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
  const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
  const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
  const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
  const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
  const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
  const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
  const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
  const int MaxWaitStates = 18;

  unsigned HazardDefLatency = 0;

  auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                             this](const MachineInstr &MI) {
    HazardDefLatency =
        std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
    return TRI.regsOverlap(DstReg, Reg);
  };

  int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                 MaxWaitStates);
  int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;

  int OpNo = Op.getOperandNo();
  if (OpNo == SrcCIdx) {
    NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
  } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
    case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
    }
  } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
    case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
    }
  }

  int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  if (WaitStatesNeeded == MaxWaitStates)
    return WaitStatesNeeded;

  if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
  return TRI.regsOverlap(Reg, DstReg);

  const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
  const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
  const int AccVGPRWriteAccVgprReadWaitStates = 3;
  NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
  if (OpNo == SrcCIdx)
    NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
  else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
    NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

  WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  if (WaitStatesNeeded == MaxWaitStates)
    return WaitStatesNeeded;

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);

    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
// MAI hazard checks with GFX940-specific wait-state tables:
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                            VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
  const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
  const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3;
  const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5;
  const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4;
  const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9;
  const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8;
  const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17;
  const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16;
  const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
  const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
  const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
  const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
  const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
  const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
  const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
  const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
  const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
  const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
  const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4;
  const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6;
  const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10;
  const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18;
  const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5;
  const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7;
  const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11;
  const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19;
  const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
  const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
  const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
  const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
  const int MaxWaitStates = 19;

  auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                             this](const MachineInstr &MI) {
    FullReg = (DstReg == Reg);
    return TRI.regsOverlap(DstReg, Reg);
  };

  WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
  if (NumWaitStates == std::numeric_limits<int>::max())

  int NeedWaitStates = 0;
  if (OpNo == SrcCIdx) {
  } else if (FullReg) {
    if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
         Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
        (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
         Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
      NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
    else if (TSchedModel.computeInstrLatency(MI1) == 2)
      NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
  }

  switch (Opc1) {
  case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
  case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
  case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
  case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
    NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
  case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
  case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
    NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
  default:
    switch (TSchedModel.computeInstrLatency(MI1)) {
          ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates
          : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates
          ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
          : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
      NeedWaitStates = isXDL(ST, *MI1)
          ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates
          : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates;
          ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates
          : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates
          ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
          : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
    case 16: [[fallthrough]];
          ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates
          : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates
          ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
          : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
    }
  }

  switch (Opc1) {
  case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
  case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
  case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
  case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
    NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
  case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
  case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
    NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
  default:
    switch (TSchedModel.computeInstrLatency(MI1)) {
          ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates
          : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates
          : SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
      NeedWaitStates = isXDL(ST, *MI1)
          ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates
          : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates;
          ? GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates
          : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates
          : SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
    case 16: [[fallthrough]];
          ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates
          : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates
          : SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
    }
  }

  if (WaitStatesNeeded >= NeedWaitStates)

  WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  if (WaitStatesNeeded == MaxWaitStates)

  return WaitStatesNeeded;
// In checkMAILdStHazards(MI):
  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  const int AccVgprReadLdStWaitStates = 2;
  const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
  const int MaxWaitStates = 2;

  int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  if (WaitStatesNeeded == MaxWaitStates)
    return WaitStatesNeeded;

  if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

  return getWaitStatesSinceDef(Reg, IsVALUFn, 2) <
         std::numeric_limits<int>::max();

  WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
// In checkMAIVALUHazards(MI):
  int WaitStatesNeeded = 0;

  !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))

  !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))

  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
    DGEMMAfterVALUWrite = true;

    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)

  AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4;
    const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6;
    const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10;
    const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18;
    const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5;
    const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7;
    const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11;
    const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);

    int NeedWaitStates = 0;
    if (DOT->getOpcode() == MI->getOpcode()) {
      if (&Use - &MI->getOperand(0) != SrcCIdx)
        NeedWaitStates = DotWriteSameDotReadSrcAB;
    } else {
      NeedWaitStates = DotWriteDifferentVALURead;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    DGEMMAfterVALUWrite = false;
    if (TRI.isVectorRegister(MRI, Reg)) {
      int WaitStatesNeededForUse =
          DMFMABetweenVALUWriteVMEMRead -
          getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                DMFMABetweenVALUWriteVMEMRead);

      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }

    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
        ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates
        : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates
        : SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
        ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                        : DMFMA4x4WriteVgprVALUReadWaitStates
        ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates
        : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates;
        ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
        : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
        : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
    case 16: [[fallthrough]];
        ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
                        : DMFMA16x16WriteVgprVALUReadWaitStates
        ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
        : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
        : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)

  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
        getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
  const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
  const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
  const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4;
  const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6;
  const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10;
  const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18;
  const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5;
  const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7;
  const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11;
  const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19;
  const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
  const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
  const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
  const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
  const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
  const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
  const int DotWriteDifferentVALUWrite = 3;
  const int MaxWaitStates = 19;
  const int MaxWarWaitStates = 15;

  int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                 MaxWaitStates);
  if (DOT && DOT->getOpcode() != MI->getOpcode())
    WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                  WaitStatesSinceDef);

  WaitStatesSinceDef =
      getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);

  int NeedWaitStates = MaxWaitStates;
  switch (TSchedModel.computeInstrLatency(MFMA)) {
      ? GFX940_XDL2PassWriteVgprVALUWawWaitStates
      : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates
      : SMFMA4x4WriteVgprVALUWawWaitStates;
    NeedWaitStates = isDGEMM(MFMA->getOpcode())
      ? DMFMA4x4WriteVgprVALUWriteWaitStates
      ? GFX940_XDL4PassWriteVgprVALUWawWaitStates
      : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
      ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
      : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
      : SMFMA16x16WriteVgprVALUWawWaitStates;
  case 16: [[fallthrough]];
    NeedWaitStates = isDGEMM(MFMA->getOpcode())
      ? DMFMA16x16WriteVgprVALUWriteWaitStates
      ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
      : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
      : SMFMA32x32WriteVgprVALUWawWaitStates;
  }

  int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  if (WaitStatesNeeded == MaxWaitStates)

  !MI.readsRegister(Reg, &TRI))

  TII.getNamedOperand(MI, AMDGPU::OpName::src2);

  int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                              MaxWarWaitStates);

  unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
  int NeedWaitStates = MaxWaitStates;
  switch (HazardDefLatency) {
  case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
    NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
  case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
  case 16: [[fallthrough]];
  default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
  }

  int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    // (records the MFMA, if any, in MAI)
    return MAI != nullptr;
  };

  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    return W < (int)TSchedModel.computeInstrLatency(MAI);
  }
  return false;
}
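// ShouldPreferAnother() above steers the scheduler away from starting a new
// MFMA while a previous one is still within its pipeline latency. The last
// fix below, fixVALUMaskWriteHazard(), handles subtargets with
// hasVALUMaskWriteHazard(): a VALU write of an SGPR mask that is then
// consumed as a carry/condition input (v_addc, v_cndmask, v_subb, ...) gets an
// s_waitcnt_depctr inserted after it, and when the writer is s_getpc_b64 the
// literal offsets of the bundled instructions that follow are adjusted as
// well.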
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!SDSTOp || !SDSTOp->isReg())

  if (HazardReg == AMDGPU::EXEC ||
      HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI ||
      HazardReg == AMDGPU::M0)

  switch (I.getOpcode()) {
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_ADDC_U32_dpp:
  case AMDGPU::V_CNDMASK_B16_e32:
  case AMDGPU::V_CNDMASK_B16_dpp:
  case AMDGPU::V_CNDMASK_B32_e32:
  case AMDGPU::V_CNDMASK_B32_dpp:
  case AMDGPU::V_DIV_FMAS_F32_e64:
  case AMDGPU::V_DIV_FMAS_F64_e64:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBB_U32_dpp:
  case AMDGPU::V_SUBBREV_U32_e32:
  case AMDGPU::V_SUBBREV_U32_dpp:
    return HazardReg == AMDGPU::VCC ||
           HazardReg == AMDGPU::VCC_LO ||
           HazardReg == AMDGPU::VCC_HI;
  case AMDGPU::V_ADDC_U32_e64:
  case AMDGPU::V_ADDC_U32_e64_dpp:
  case AMDGPU::V_CNDMASK_B16_e64:
  case AMDGPU::V_CNDMASK_B16_e64_dpp:
  case AMDGPU::V_CNDMASK_B32_e64:
  case AMDGPU::V_CNDMASK_B32_e64_dpp:
  case AMDGPU::V_SUBB_U32_e64:
  case AMDGPU::V_SUBB_U32_e64_dpp:
  case AMDGPU::V_SUBBREV_U32_e64:
  case AMDGPU::V_SUBBREV_U32_e64_dpp: {
    return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
  }

  if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
      !(I.getOperand(0).getImm() & 0x1))

  for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
    if (OpReg == AMDGPU::EXEC ||
        OpReg == AMDGPU::EXEC_LO ||
        OpReg == AMDGPU::EXEC_HI)

    if (Op.isImplicit()) {
      if (OpReg == AMDGPU::VCC ||
          OpReg == AMDGPU::VCC_LO ||
          OpReg == AMDGPU::VCC_HI)

    if (TRI.isSGPRReg(MRI, OpReg))

    if (!TII.isInlineConstant(Op, OpInfo))

  std::numeric_limits<int>::max())

  auto NextMI = std::next(MI->getIterator());

  BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))

  if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
    while (NextMI != MI->getParent()->end() &&
           NextMI->isBundledWithPred()) {
      for (auto &Operand : NextMI->operands()) {
        if (Operand.isGlobal())
          Operand.setOffset(Operand.getOffset() + 4);