  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  TSchedModel.init(&ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}
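// Opcode classification helpers. Each predicate below identifies the class of
// instructions that a particular hazard rule in this recognizer cares about.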
static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isDGEMM(unsigned Opcode) {
  return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
         Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
         Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
         Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
}
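// The four F64 MFMA opcodes above form the "DGEMM" group; several MAI hazard
// tables later in this file key off isDGEMM() because these instructions have
// different latencies from the single-precision MFMAs.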
  unsigned Opcode = MI.getOpcode();

      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes never access GDS.
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64;

                                          AMDGPU::OpName::simm16);
  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

      checkReadM0Hazards(MI) > 0)
    return HazardType;

      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;
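// Required wait states are materialized as S_NOP instructions; a single S_NOP
// immediate covers up to 8 wait states, so larger counts are emitted in chunks.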
  while (Quantity > 0) {
void GCNHazardRecognizer::processBundle() {

  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);

  CurrCycleInstr = nullptr;
}
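// PreEmitNoops runs the hazard checks in "hazard recognizer mode": hazards are
// fixed up in place (fixHazards) and the remaining number of wait states is
// returned so the caller can pad with no-ops.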
  IsHazardRecognizerMode = true;

  CurrCycleInstr = nullptr;
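// Each check* routine returns how many wait states are still needed between
// the hazardous instruction and the current one; the dispatcher below takes
// the maximum over all checks that apply to the instruction's class.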
    return std::max(WaitStates, checkSMRDHazards(MI));

    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

    return std::max(WaitStates, checkGetRegHazards(MI));

    return std::max(WaitStates, checkSetRegHazards(MI));

    return std::max(WaitStates, checkRFEHazards(MI));

    return std::max(WaitStates, checkReadM0Hazards(MI));

    return std::max(WaitStates, checkReadM0Hazards(MI));

    return std::max(WaitStates, checkMAIHazards(MI));

    return std::max(WaitStates, checkMAILdStHazards(MI));
  EmittedInstrs.push_front(nullptr);

  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);

      CurrCycleInstr->isKill()) {
    CurrCycleInstr = nullptr;

  EmittedInstrs.push_front(CurrCycleInstr);

    EmittedInstrs.push_front(nullptr);

  CurrCycleInstr = nullptr;
}

  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
    if (I->isInlineAsm() || I->isMetaInstruction())

    if (IsExpired(&*I, WaitStates))

  int MinWaitStates = WaitStates;

    if (!Visited.insert(Pred).second)

                               WaitStates, IsExpired, Visited);

    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
    if (IsExpired(nullptr, MinWaitStates))
      return MinWaitStates;

  return MinWaitStates;

                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {

      return WaitStates >= Limit;

      if (MI->isInlineAsm())

    if (WaitStates >= Limit)

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,

    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
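// Soft clauses: a run of consecutive SMEM/VMEM instructions must be broken
// (one wait state) when a register defined inside the clause overlaps a
// register the clause also uses; ClauseDefs/ClauseUses track register units
// for exactly this test.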
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  bool IsSMRD = TII.isSMRD(*MEM);

  if (ClauseDefs.none())
    return 0;

  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
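// SMRD hazard: an SMRD that consumes an SGPR written shortly before it
// (for example by a VALU instruction) needs SmrdSgprWaitStates of separation.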
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

    return WaitStatesNeeded;

  int SmrdSgprWaitStates = 4;

    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,

    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),

    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  const int VmemSgprWaitStates = 5;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,

    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;

    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),

    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,

  return WaitStatesNeeded;
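// V_DIV_FMAS reads VCC implicitly, so a recent write of VCC must be separated
// from it by DivFMasWaitStates wait states.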
int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {

  const int DivFMasWaitStates = 4;

  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,

  return DivFMasWaitStates - WaitStatesNeeded;
int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
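// createsVALUHazard returns the index of the operand that a later VALU write
// must not overlap (for example the vdata of a VMEM store), or a negative
// value when the instruction cannot create such a hazard;
// checkVALUHazardsHelper compares that operand against the VALU's def.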
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);

        (!SOffset || !SOffset->isReg()))

  if (TII->isMIMG(MI)) {

  if (TII->isFLAT(MI)) {
  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;

    int DataIdx = createsVALUHazard(*MI);
    return DataIdx >= 0 &&

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
  int WaitStatesNeeded = 0;

    WaitStatesNeeded =
        std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));

  return WaitStatesNeeded;
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {

  int WaitStatesNeeded = 0;

    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded =
          std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }

  return WaitStatesNeeded;
int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {

      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

    return TII->isVALU(*MI);

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,

  return RWLaneWaitStates - WaitStatesSince;
int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {

  const int RFEWaitStates = 1;

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
  const int SMovRelWaitStates = 1;

    return TII->isSALU(*MI);
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
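// v_cmpx/permlane workaround: insert a self-move of the permlane's src0 VGPR
// (V_MOV_B32 v?, v?) between the two instructions; a plain V_NOP is not used
// because it can be discarded before it provides the required separation.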
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {

    return TII->isVOPC(*MI);

    unsigned Opc = MI->getOpcode();
           Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 &&
           Opc != AMDGPU::V_NOP_sdwa;

  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  bool IsUndef = Src0->isUndef();

          TII->get(AMDGPU::V_MOV_B32_e32))
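// VMEM-to-scalar-write hazard: a pending VMEM/FLAT access followed too closely
// by a write to one of its scalar operands. A zero s_waitcnt or an
// s_waitcnt_depctr with immediate 0xffe3 already resolves it; otherwise an
// s_waitcnt_depctr is inserted.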
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {

  if (MI->getNumDefs() == 0)

            (MI->getOpcode() == AMDGPU::S_WAITCNT &&
             !MI->getOperand(0).getImm()) ||
            (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
             MI->getOperand(0).getImm() == 0xffe3));

          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
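// SMEM-to-vector-write hazard: a VALU write (e.g. v_readlane's sdst) to an
// SGPR still in use by an earlier SMEM access. An intervening s_waitcnt that
// clears lgkmcnt counts as a fix; otherwise "s_mov_b32 null, 0" is inserted.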
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {

  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;

    SDSTName = AMDGPU::OpName::sdst;

    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {

    if (TII->isSALU(*MI)) {
      switch (MI->getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:

      case AMDGPU::S_WAITCNT_LGKMCNT:

        return (MI->getOperand(1).getImm() == 0) &&
               (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI->getOperand(0).getImm();

        return (Decoded.LgkmCnt == 0);

    if (TII->isSOPP(*MI))

          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
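// v_cmpx EXEC WAR hazard: the instruction writes EXEC while an earlier VALU
// still reads it; an s_waitcnt_depctr whose immediate already masks in 0xfffe
// covers it, otherwise one is inserted.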
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {

  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))

    return I->readsRegister(AMDGPU::EXEC, TRI);

    if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))

    for (auto MO : MI->implicit_operands())
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))

    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)

          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
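// LDS-branch-VMEM WAR workaround: when LDS and VMEM accesses of different
// kinds sit on opposite sides of a branch with no "s_waitcnt_vscnt null, 0"
// in between, such a waitcnt is inserted to separate them.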
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {

  auto InstType = IsHazardInst(MI);

    return I && (IsHazardInst(I) ||
                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                  !I->getOperand(1).getImm()));

    auto InstType2 = IsHazardInst(I);
    return InstType2 && InstType != InstType2;

    auto InstType2 = IsHazardInst(I);
    if (InstType == InstType2)

    return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
           I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
           !I->getOperand(1).getImm();

          TII->get(AMDGPU::S_WAITCNT_VSCNT))
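// NSA-to-VMEM hazard: a buffer access whose offset triggers the bug, issued
// right after an NSA-encoded MIMG instruction (instruction size >= 16 bytes),
// needs one wait state.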
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);

    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(*I) >= 16;

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
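// Denorm-mode hazard: an s_denorm_mode issued shortly after a floating-point
// atomic memory access needs up to three wait states; any of the s_waitcnt
// variants listed below ends the search because it already provides the
// separation.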
int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)

    switch (MI->getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:

  return FPAtomicToDenormModeWaitStates -
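// The remaining checks deal with MAI (MFMA / v_accvgpr) hazards: MFMA results
// overlapping later MFMA sources, v_accvgpr_read/write traffic, and MFMA
// interactions with ordinary VALU, memory and export instructions.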
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
        const int MaxWaitStates = 2;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
                               this]

      Register DstReg = MI->getOperand(0).getReg();

      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(MI));

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,

    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

      Register DstReg = MI->getOperand(0).getReg();

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;
  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
                         this]

      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(MI));

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);

    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
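// The MAI checks map an MFMA's scheduling latency to its tile size when
// selecting a wait-state table: latency 2 corresponds to the 4x4 MFMAs,
// latency 8 to 16x16, and anything larger to 32x32.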
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

         MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
         MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;

    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
  const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
  const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
  const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
  const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
  const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
  const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
  const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
  const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
  const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
  const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
  const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
  const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
  const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
  const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
  const int MaxWaitStates = 19;
    unsigned Reg = Use.getReg();

    auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
                                     this]

      Register DstReg = MI->getOperand(0).getReg();
      FullReg = (DstReg == Reg);

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn,

    int OpNo = MI->getOperandNo(&Use);
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {

    } else if (FullReg) {
      if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
          (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
        NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:

          NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;

      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:

          NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;

        switch (TSchedModel.computeInstrLatency(MI1)) {

              ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;

              ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;

              ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;

      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;

      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;

        switch (TSchedModel.computeInstrLatency(MI1)) {

          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;

          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;

          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
      if (WaitStatesNeeded >= NeedWaitStates)

      WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)

  return WaitStatesNeeded;
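// checkMAILdStHazards: a load or store consuming a register recently written
// by v_accvgpr_read needs AccVgprReadLdStWaitStates; the second constant
// covers the case where an intervening VALU write depends on accvgpr traffic.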
  int WaitStatesNeeded = 0;

    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;

  const int AccVgprReadLdStWaitStates = 2;
  const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
  const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

      return getWaitStatesSinceDef(Reg, IsVALUFn, 2) <

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
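// checkMAIVALUHazards covers interactions between MFMA/DOT instructions and
// ordinary VALU, memory and export instructions: read-after-write distances
// first, then the write-after-write and write-after-read distances given by
// the *Waw* / *War* constants.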
         MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
         MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;

  int WaitStatesNeeded = 0;

                                           AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int MaxWaitStates = 19;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,

      int NeedWaitStates = 0;
      if (DOT->getOpcode() == MI->getOpcode()) {
        if (&Use - &MI->getOperand(0) != SrcCIdx)
          NeedWaitStates = DotWriteSameDotReadSrcAB;

        NeedWaitStates = DotWriteDifferentVALURead;

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NeedWaitStates = MaxWaitStates;
      switch (HazardDefLatency) {

        NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;

          IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                        : DMFMA4x4WriteVgprVALUReadWaitStates;

        NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;

          ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
                          : DMFMA16x16WriteVgprVALUReadWaitStates
          : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
    unsigned Opc = MI->getOpcode();
    const int DMFMAToFMA64WaitStates = 2;
    if ((Opc == AMDGPU::V_FMA_F64_e64 ||
         Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
         Opc == AMDGPU::V_FMAC_F64_dpp) &&
        WaitStatesNeeded < DMFMAToFMA64WaitStates) {
      int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
        getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }

    if (!IsVALU && !IsMemOrExport)
      return WaitStatesNeeded;
  const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
  const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
  const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
  const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
  const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
  const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
  const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
  const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
  const int DotWriteDifferentVALUWrite = 3;
  const int MaxWaitStates = 19;
  const int MaxWarWaitStates = 15;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,

    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,

      int NeedWaitStates = MaxWaitStates;
      switch (TSchedModel.computeInstrLatency(MFMA)) {

        NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;

        NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;

        NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;

        NeedWaitStates = isDGEMM(MFMA->getOpcode())
                             ? DMFMA16x16WriteVgprVALUWriteWaitStates
                             : SMFMA32x32WriteVgprVALUWawWaitStates;

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
    auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
                             this]

          !MI->readsRegister(Reg, &TRI))

    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
             break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
             break;
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
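// Final scheduler hook: if a previously issued MFMA is still within its own
// latency window (W < computeInstrLatency), prefer scheduling something other
// than another MFMA right away.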
        MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
        MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)

    return MAI != nullptr;

    int W = getWaitStatesSince(IsMFMAFn, 16);

      return W < (int)TSchedModel.computeInstrLatency(MAI);