namespace {
struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};
} // end anonymous namespace

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), ClauseUses(TRI.getNumRegUnits()),
      ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  TSchedModel.init(&ST);
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}
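// EmittedInstrs acts as a fixed-depth scoreboard: AdvanceCycle() pushes the
// current instruction plus one nullptr per additional wait state to the
// front, so an entry's index in the deque equals its distance in wait states
// from the current cycle.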
static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  }
  return false;
}
static bool isDGEMM(unsigned Opcode) {
  return AMDGPU::getMAIIsDGEMM(Opcode);
}

static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

  if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) ||
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  if (!ST.hasGFX940Insts())
    return true;

  return AMDGPU::getMAIIsGFX940XDL(Opcode);
}
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}
static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
}
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp =
      TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16);
  return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
}
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) &&
      checkVMEMHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}
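// A single s_nop can only encode up to 8 wait states (its immediate holds the
// count minus one), so larger quantities are emitted as a chain of s_nops
// inside the bundle.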
unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI =
      std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E =
      CurrCycleInstr->getParent()->instr_end();

  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(getMaxLookAhead());
  }
  CurrCycleInstr = nullptr;
}
void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}
unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (ST.hasGFX90AInsts())
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}
void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead()); i < e;
       ++i)
    EmittedInstrs.push_front(nullptr);

  // getMaxLookAhead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // No need to look at parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    switch (IsHazard(State, *I)) {
    case HazardFound:
      return true;
    case HazardExpired:
      return false;
    default:
      // Continue search.
      break;
    }

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }

  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
                  Visited))
      return true;
  }

  return false;
}

static int
getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                   const MachineBasicBlock *MBB,
                   MachineBasicBlock::const_reverse_instr_iterator I,
                   int WaitStates, IsExpiredFn IsExpired,
                   DenseSet<const MachineBasicBlock *> &Visited,
                   GetNumWaitStatesFn GetNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}
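// How to read the two search helpers above: a hazard hit returns the
// accumulated wait-state count, while expiry returns INT_MAX, which callers
// interpret as "no hazard within range". When the scan runs off the top of a
// block, the minimum over all predecessors is taken, i.e. the result is the
// shortest distance to the hazard over any incoming path, which is the
// conservative choice when deciding how many wait states are still needed.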
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}
int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}
int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if XNACK is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  ClauseDefs.reset();
  ClauseUses.reset();

  // Walk back over the emitted instructions that still belong to the same
  // soft clause, accumulating their defs and uses.
  for (MachineInstr *MI : EmittedInstrs) {
    // A nullptr (wait state) or a change of instruction kind ends the clause.
    if (!MI || IsSMRD != SIInstrInfo::isSMRD(*MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
  // was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the
    // descriptor needs an additional 4 wait states.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU Instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to VCC from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}
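// Example (hypothetical sequence) of the hazard handled above:
//   v_cmp_eq_u32_e32 vcc, 0, v0       ; VALU writes VCC
//   v_div_fmas_f32   v1, v2, v3, v4   ; implicitly reads VCC
// Four wait states are required between the two, so up to "s_nop 3" (which
// covers 4 wait states) is inserted if nothing else separates them.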
int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}
int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1).
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for
  // them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}
int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      const SIInstrInfo *TII = ST.getInstrInfo();
      if (SIInstrInfo::isSDWA(MI)) {
        if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
          if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
            return false;
      } else {
        if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
            !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
                  ->getImm() &
              SISrcMods::DST_OP_SEL))
          return false;
      }
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
        Register Def = Dst->getReg();

        for (const MachineOperand &Use : VALU->explicit_uses()) {
          if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded,
                                checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // See checkVALUHazards().
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op : IA->operands()) {
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded =
          std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}
int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}
int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixWMMAHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
}
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return (TII->isVOPC(MI) ||
            ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
           MI.modifiesRegister(AMDGPU::EXEC, TRI);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v? to insert the wait state needed before the
  // v_permlane*.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addDef(Reg)
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() &&
          TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
        return (Decoded.DsCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because it cannot be a SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() &&
            TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  return true;
}
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and VMEM
  // instructions must appear in the same function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |=
          SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

static bool isStoreCountWaitZero(const MachineInstr &I) {
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1; // LDS
    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2; // VMEM
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}
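// The mitigation mirrors what the nested search looks for: an
// "s_waitcnt_vscnt null, 0" anywhere between the LDS and VMEM operations
// (across the branch, in either order) makes the WAR hazard impossible, so
// inserting one before MI is sufficient to break it.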
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(I))
      return false;
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
    // Cover both WAR and WAW.
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
    // Instructions which cause va_vdst==0 expire hazard.
    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
           SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
  };
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) ? 1 : 0;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,
                                    IsExpiredFn, Visited, GetWaitStatesFn);

  // Transcendentals can execute in parallel to other VALUs.
  // This makes va_vdst count unusable with a mixture of VALU and TRANS.
  if (VisitedTrans)
    Count = 0;

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

  return true;
}
bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
        !SIInstrInfo::isDS(I))
      return false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
           (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
  } else {
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
        .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  }

  return true;
}
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.hasVALUPartialForwardingHazard())
    return false;

  if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
    return false;

  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Only applies with >= 2 unique VGPR sources.
  if (SrcVGPRs.size() <= 1)
    return false;

  // Look for the following pattern:
  //   Va <- VALU [PreExecPos]
  //   intv1
  //   Exec <- SALU [ExecPos]
  //   intv2
  //   Vb <- VALU [PostExecPos]
  //   intv3
  //   MI Va, Vb (WaitState = 0)
  // Where:
  //   intv1 + intv2 <= 2 VALUs
  //   intv3 <= 4 VALUs
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  struct StateType {
    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();
    int VALUs = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection.
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed.
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard.
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track register writes.
    bool Changed = false;
    if (SIInstrInfo::isVALU(I)) {
      for (Register Src : SrcVGPRs) {
        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
          State.DefPos[Src] = State.VALUs;
          Changed = true;
        }
      }
    } else if (SIInstrInfo::isSALU(I)) {
      if (State.ExecPos == std::numeric_limits<int>::max()) {
        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
          State.ExecPos = State.VALUs;
          Changed = true;
        }
      }
    }

    // Early expiration: too many VALUs in intv3.
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

    // Only evaluate state if something changed.
    if (!Changed)
      return NoHazardFound;

    // Determine positions of VALUs pre/post exec change.
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else
          PostExecPos = std::min(PostExecPos, DefVALUs);
      }
    }

    // Need a VALU def post exec change.
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv3?
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv2?
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Need a VALU def pre exec change.
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv1?
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv1 + intv2?
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    return HazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0x0fff);

  return true;
}
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Look for the following pattern:
  //   Va <- TRANS VALU
  //   intv
  //   MI Va (WaitState = 0)
  // Where: intv <= 5 VALUs / 1 TRANS
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  struct StateType {
    int VALUs = 0;
    int TRANS = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection.
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed.
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard.
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         I.getOperand(0).getImm() == 0x0fff))
      return HazardExpired;

    // Track register writes.
    if (SIInstrInfo::isTRANS(I)) {
      for (Register Src : SrcVGPRs) {
        if (I.modifiesRegister(Src, &TRI))
          return HazardFound;
      }
    }

    return NoHazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
    if (SIInstrInfo::isTRANS(MI))
      State.TRANS += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  // Hazard is observed: insert a wait on va_vdst counter to ensure hazard is
  // avoided.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));

  return true;
}
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
  if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
    if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
      return false;

    // Src0(matrix A) or Src1(matrix B) of the current wmma instruction
    // overlaps with the dest(matrix D) of the previous wmma.
    const Register CurSrc0Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    const Register CurSrc1Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    const Register PrevDstReg =
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
      return true;
    }

    // On GFX12+, an SWMMAC index operand that overlaps the previous dest also
    // creates the hazard.
    if (AMDGPU::isGFX12Plus(ST) && SIInstrInfo::isSWMMAC(*MI)) {
      const Register CurIndex =
          TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(PrevDstReg, CurIndex))
        return true;
    }

    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_NOP_e32));

  return true;
}
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;

  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if this is the last VGPR in the allocation block.
  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
    return false;

  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
  bool Overlapped = OverlappedSrc || OverlappedDst;

  assert(!OverlappedDst || !OverlappedSrc ||
         Src1->getReg() == MI->getOperand(0).getReg());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  // ... (swap the shift amount into the scratch register, rewrite the shift,
  //      then swap back; see the full implementation for the emitted moves)

  MI->getOperand(0).setReg(NewReg);
  if (OverlappedSrc) {
    Src1->setReg(NewReg);
    // ...
  }
  // ...

  return true;
}
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII](const MachineInstr &I) {
    if (!SIInstrInfo::isMIMG(I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}
int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (!ST.hasFPAtomicToDenormModeHazard())
    return 0;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
      return false;
    return SIInstrInfo::isFPAtomic(I);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
      return true;

    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}
int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
}

int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
  // Early exit if no padding is requested.
  if (MFMAPaddingRatio == 0)
    return 0;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
    return 0;

  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
                            this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI))
      return false;

    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
    return true;
  };

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  int NeighborMFMAPaddingNeeded =
      (NeighborMFMALatency * MFMAPaddingRatio / 100) -
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
}
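// Worked example of the formula above: with MFMAPaddingRatio == 75 and a
// neighboring 16-pass MFMA (NeighborMFMALatency == 16), the target padding is
// 16 * 75 / 100 = 12 wait states; if 5 wait states have already elapsed since
// that MFMA, 7 more are requested.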
int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }
  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:
        NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
        break;
      case 8:
        NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
        break;
      case 16:
        [[fallthrough]];
      default:
        NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
        break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:
        NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
        break;
      case 8:
        NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
        break;
      case 16:
        [[fallthrough]];
      default:
        NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
        break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:
      NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
      break;
    case 8:
      NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
      break;
    case 16:
      [[fallthrough]];
    default:
      NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
static int
GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // N-pass XDL write overlapping SMFMA SrcC: N + 1 wait states.
  return NumPasses + 1;
}

static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // N-pass SMFMA write overlapping SMFMA SrcC: N wait states.
  return NumPasses;
}

static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // N-pass SMFMA write overlapping SrcA/B: N + 2 wait states.
  return NumPasses + 2;
}

static int
GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // N-pass XDL write overlapping SrcA/B: N + 3 wait states.
  return NumPasses + 3;
}
int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsLegacyVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
  };

  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
           !SIInstrInfo::isDOT(MI);
  };

  if (!SIInstrInfo::isMFMA(*MI))
    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
  const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
  const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
  const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
  const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
  const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
  const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
  const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
  const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
  const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
  const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
  const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
  const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
  const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
  const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
  const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
  const int MaxWaitStates = 19;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (!Use.isReg())
      continue;
    Register Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(DstReg, Reg);
    };
    int WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates =
        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = Use.getOperandNo();
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      if (FullReg) {
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
        else if (ST.hasGFX940Insts() &&
                 TSchedModel.computeInstrLatency(MI1) == 2)
          NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
          NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default: {
          int NumPasses = TSchedModel.computeInstrLatency(MI1);
          if (ST.hasGFX940Insts()) {
            NeedWaitStates =
                isXDL(ST, *MI1)
                    ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
                          NumPasses)
                    : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
                          NumPasses);
            break;
          }

          switch (NumPasses) {
          case 2:
            NeedWaitStates =
                isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                             : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates =
                isDGEMM(Opc)
                    ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16:
            NeedWaitStates =
                isDGEMM(Opc)
                    ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          default:
            llvm_unreachable("unexpected number of passes");
          }
          break;
        }
        }
      }
    } else {
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default: {
        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        if (ST.hasGFX940Insts()) {
          NeedWaitStates =
              isXDL(ST, *MI1)
                  ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses)
                  : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses);
          break;
        }

        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16:
          [[fallthrough]];
        default:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
        }
        break;
      }
      }
    }
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards().
  if (ST.hasGFX90AInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [](const MachineInstr &MI) {
        return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // N-pass SMFMA write before VALU/mem/export read: N + 2 wait states.
  return NumPasses + 2;
}

static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // N-pass XDL write before VALU/mem/export read: N + 3 wait states.
  return NumPasses + 3;
}

static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // N-pass XDL write before VALU write (WAW): N + 3 wait states.
  return NumPasses + 3;
}

static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // N-pass SMFMA write before VALU write (WAW): N + 2 wait states.
  return NumPasses + 2;
}
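// The helpers above encode the GFX940 rule that the required distance scales
// directly with the MFMA pipeline depth: based on the return expressions,
// an N-pass SMFMA producer needs N + 2 wait states before a dependent VALU
// access and an N-pass XDL producer needs N + 3 (e.g. a 2-pass XDL write must
// be followed by 5 wait states before a VALU read of the result). The pairing
// of helper names to constants is inferred from the call sites in
// checkMAIVALUHazards() below.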
int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
  int WaitStatesNeeded = 0;

  bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI) ||
               SIInstrInfo::isDS(*MI);
  bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  auto IsDGEMMFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isMFMA(MI) && isDGEMM(MI.getOpcode());
  };

  const MachineInstr *MFMA = nullptr;
  Register Reg;
  auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    MFMA = &MI;
    return true;
  };

  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    DOT = &MI;
    return true;
  };

  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
    // Found DGEMM on reverse traversal to def.
    if (isDGEMM(MI.getOpcode()))
      DGEMMAfterVALUWrite = true;

    // Only hazard if register is defined by a VALU and a DGEMM is found after
    // the def.
    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
      return false;

    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      }

      // Workaround for HW data hazard bug observed only in GFX90A. When there
      // is a DGEMM instruction in-between a VALU and a VMEM instruction it
      // causes the SQ to incorrectly not insert the two wait states between
      // the two instructions needed to avoid the data hazard.
      if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
        DGEMMAfterVALUWrite = false;
        if (TRI.isVectorRegister(MRI, Reg)) {
          int WaitStatesNeededForUse =
              DMFMABetweenVALUWriteVMEMRead -
              getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                    DMFMABetweenVALUWriteVMEMRead);

          WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
        }
      }

      MFMA = nullptr;
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
      if (!MFMA)
        continue;

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      if (isDGEMM(MFMA->getOpcode())) {
        switch (HazardDefLatency) {
        case 4:
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = IsMemOrExport
                               ? DMFMA16x16WriteVgprMemExpReadWaitStates
                               : DMFMA16x16WriteVgprVALUReadWaitStates;
          break;
        default:
          llvm_unreachable("unexpected dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            isXDL(ST, *MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
                : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses);
      } else {
        switch (HazardDefLatency) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }

  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of cycles for dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            isXDL(ST, *MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
          break;
        default:
          llvm_unreachable("Unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !isXDL(ST, MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:
      NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      break;
    case 4:
      assert(ST.hasGFX940Insts());
      NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
      break;
    case 8:
      NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      break;
    case 16:
      [[fallthrough]];
    default:
      NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  const MachineInstr *MAI = nullptr;

  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMFMA(MI))
      MAI = &MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}
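// ShouldPreferAnother() is the scheduler's soft counterpart to the hazard
// checks: rather than forcing noops, it asks the scheduler to pick a
// different candidate while a previous MFMA is still within its pipeline
// latency, which keeps back-to-back MFMAs from stalling.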
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;

  if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. SALU writes SGPR
  //   3. SALU reads SGPR
  // The hazard can expire if the distance between 2 and 3 is sufficient.
  // In practice this happens <10% of the time, hence this always assumes
  // the hazard exists if 1 and 2 are present to avoid searching.

  const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
  if (!SDSTOp || !SDSTOp->isReg())
    return false;

  const Register HazardReg = SDSTOp->getReg();
  if (HazardReg == AMDGPU::EXEC ||
      HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI ||
      HazardReg == AMDGPU::M0)
    return false;

  auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_e32:
    case AMDGPU::V_CNDMASK_B16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp:
      // These implicitly read VCC as mask source.
      return HazardReg == AMDGPU::VCC ||
             HazardReg == AMDGPU::VCC_LO ||
             HazardReg == AMDGPU::VCC_HI;
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_e64:
    case AMDGPU::V_CNDMASK_B16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only check mask register overlaps.
      const MachineOperand *SSRCOp =
          TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
    }
    default:
      return false;
    }
  };

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
    // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
      return true;

    // VALU access to any SGPR or literal constant other than HazardReg
    // mitigates hazard. No need to check HazardReg here as this will only be
    // called when !IsHazardFn.
    if (!SIInstrInfo::isVALU(I))
      return false;
    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
      const MachineOperand &Op = I.getOperand(OpNo);
      if (Op.isReg()) {
        Register OpReg = Op.getReg();
        // Only consider uses.
        if (!Op.isUse())
          continue;
        // Ignore EXEC.
        if (OpReg == AMDGPU::EXEC ||
            OpReg == AMDGPU::EXEC_LO ||
            OpReg == AMDGPU::EXEC_HI)
          continue;
        // Ignore all implicit uses except VCC.
        if (Op.isImplicit()) {
          if (OpReg == AMDGPU::VCC ||
              OpReg == AMDGPU::VCC_LO ||
              OpReg == AMDGPU::VCC_HI)
            return true;
          continue;
        }
        if (TRI.isSGPRReg(MRI, OpReg))
          return true;
      } else {
        const MCInstrDesc &InstDesc = I.getDesc();
        const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
        if (!TII.isInlineConstant(Op, OpInfo))
          return true;
      }
    }
    return false;
  };

  // Check for hazard.
  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  auto NextMI = std::next(MI->getIterator());

  // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
  BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));

  // SALU write may be s_getpc in a bundle.
  if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
    // Update offsets of any references in the bundle.
    while (NextMI != MI->getParent()->end() &&
           NextMI->isBundledWithPred()) {
      for (auto &Operand : NextMI->operands()) {
        if (Operand.isGlobal())
          Operand.setOffset(Operand.getOffset() + 4);
      }
      ++NextMI;
    }
  }

  return true;
}
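// Note on the offset fixup above: the inserted s_waitcnt_depctr is a 4-byte
// instruction placed between s_getpc_b64 and the bundled instructions that
// consume the PC, so any global-address operands computed relative to the
// s_getpc result must be biased by 4 to remain correct.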