28#define DEBUG_TYPE "gcn-hazard-recognizer"
31 "Number of WMMA hazard V_NOPs hoisted from loops");
33 "Number of WMMA hazards where V_NOP hoisting was not possible");
37struct MFMAPaddingRatioParser :
public cl::parser<unsigned> {
40 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg,
unsigned &
Value) {
42 return O.error(
"'" + Arg +
"' value invalid for uint argument!");
45 return O.error(
"'" + Arg +
"' value must be in the range [0, 100]!");
55 cl::desc(
"Fill a percentage of the latency between "
56 "neighboring MFMA with s_nops."));
61 cl::desc(
"Insert a s_nop x before every instruction"));
65 cl::desc(
"Hoist WMMA hazard V_NOPs from loops to preheaders"));
76 : IsHazardRecognizerMode(
false), CurrCycleInstr(nullptr), MF(MF),
77 ST(MF.getSubtarget<
GCNSubtarget>()), TII(*ST.getInstrInfo()),
78 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
79 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
80 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
85 EmittedInstrs.clear();
86 EmittedVALUInstrs.clear();
87 HasPendingWMMACoexecHazard =
false;
99 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
103 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
108 case AMDGPU::S_SETREG_B32:
109 case AMDGPU::S_SETREG_B32_mode:
110 case AMDGPU::S_SETREG_IMM32_B32:
111 case AMDGPU::S_SETREG_IMM32_B32_mode:
118 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
122 return Opcode == AMDGPU::S_RFE_B64;
127 case AMDGPU::S_MOVRELS_B32:
128 case AMDGPU::S_MOVRELS_B64:
129 case AMDGPU::S_MOVRELD_B32:
130 case AMDGPU::S_MOVRELD_B64:
139 if (
TII.isAlwaysGDS(
MI.getOpcode()))
142 switch (
MI.getOpcode()) {
143 case AMDGPU::S_SENDMSG:
144 case AMDGPU::S_SENDMSGHALT:
145 case AMDGPU::S_TTRACEDATA:
149 case AMDGPU::DS_PERMUTE_B32:
150 case AMDGPU::DS_BPERMUTE_B32:
153 if (
TII.isDS(
MI.getOpcode())) {
154 int GDS = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
155 AMDGPU::OpName::gds);
156 if (
MI.getOperand(GDS).getImm())
164 unsigned Opcode =
MI.getOpcode();
165 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE64_B32 ||
167 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
171 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
173 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
176 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
177 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
178 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
188 AMDGPU::OpName::simm16);
205 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(
MI) > 0)
208 if (checkFPAtomicToDenormModeHazard(
MI) > 0)
212 if (!IsHazardRecognizerMode) {
213 if (checkWMMACoexecutionHazards(
MI) > 0) {
214 HasPendingWMMACoexecHazard =
true;
219 if (ST.hasNoDataDepHazard())
226 checkVALUHazards(
MI) > 0)
232 if (
isDivFMas(
MI->getOpcode()) && checkDivFMasHazards(
MI) > 0)
235 if (
isRWLane(
MI->getOpcode()) && checkRWLaneHazards(
MI) > 0)
241 checkMAIVALUHazards(
MI) > 0)
244 if (
isSGetReg(
MI->getOpcode()) && checkGetRegHazards(
MI) > 0)
247 if (
isSSetReg(
MI->getOpcode()) && checkSetRegHazards(
MI) > 0)
250 if (
isRFE(
MI->getOpcode()) && checkRFEHazards(
MI) > 0)
253 if (((ST.hasReadM0MovRelInterpHazard() &&
255 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
256 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
258 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
259 (ST.hasReadM0LdsDirectHazard() &&
260 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr))) &&
261 checkReadM0Hazards(
MI) > 0)
268 checkMAILdStHazards(
MI) > 0)
271 if (
MI->isInlineAsm() && checkInlineAsmHazards(
MI) > 0)
279 while (Quantity > 0) {
280 unsigned Arg = std::min(Quantity, 8u);
// Returns the ReleaseAtCycle value of the first write-processor-resource
// entry of the scheduling class that TSchedModel resolves for MI.
288GCNHazardRecognizer::getMFMAPipelineWaitStates(
const MachineInstr &
MI)
const {
289 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&
MI);
// The class must own at least one write-proc-resource entry: the begin
// iterator is dereferenced unconditionally in the return statement below.
290 assert(TSchedModel.getWriteProcResBegin(SC) !=
291 TSchedModel.getWriteProcResEnd(SC));
292 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
295void GCNHazardRecognizer::processBundle() {
299 for (;
MI !=
E &&
MI->isInsideBundle(); ++
MI) {
300 CurrCycleInstr = &*
MI;
303 if (IsHazardRecognizerMode) {
304 fixHazards(CurrCycleInstr);
312 for (
unsigned i = 0, e = std::min(WaitStates,
MaxLookAhead - 1); i <
e; ++i)
313 EmittedInstrs.push_front(
nullptr);
315 EmittedInstrs.push_front(CurrCycleInstr);
318 CurrCycleInstr =
nullptr;
322 assert(IsHazardRecognizerMode);
326 if (
MI->isInsideBundle())
336 IsHazardRecognizerMode =
true;
340 CurrCycleInstr =
nullptr;
355 return std::max(WaitStates, checkSMRDHazards(
MI));
357 if (ST.hasNSAtoVMEMBug())
358 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(
MI));
360 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(
MI));
362 if (ST.hasNoDataDepHazard())
366 WaitStates = std::max(WaitStates, checkVMEMHazards(
MI));
369 WaitStates = std::max(WaitStates, checkVALUHazards(
MI));
372 WaitStates = std::max(WaitStates, checkDPPHazards(
MI));
375 WaitStates = std::max(WaitStates, checkDivFMasHazards(
MI));
378 WaitStates = std::max(WaitStates, checkRWLaneHazards(
MI));
383 checkMAIVALUHazards(
MI) > 0)
384 WaitStates = std::max(WaitStates, checkMAIVALUHazards(
MI));
386 if (
MI->isInlineAsm())
387 return std::max(WaitStates, checkInlineAsmHazards(
MI));
390 return std::max(WaitStates, checkGetRegHazards(
MI));
393 return std::max(WaitStates, checkSetRegHazards(
MI));
396 return std::max(WaitStates, checkRFEHazards(
MI));
398 if ((ST.hasReadM0MovRelInterpHazard() &&
400 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
401 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
403 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
404 (ST.hasReadM0LdsDirectHazard() &&
405 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr)))
406 return std::max(WaitStates, checkReadM0Hazards(
MI));
409 return std::max(WaitStates, checkMAIHazards(
MI));
412 return std::max(WaitStates, checkMAILdStHazards(
MI));
415 return std::max(WaitStates, checkPermlaneHazards(
MI));
421 EmittedInstrs.push_front(
nullptr);
427 if (!CurrCycleInstr) {
428 EmittedInstrs.push_front(
nullptr);
430 if (HasPendingWMMACoexecHazard)
431 EmittedVALUInstrs.push_front(
nullptr);
435 HasPendingWMMACoexecHazard =
false;
437 if (CurrCycleInstr->isBundle()) {
442 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
443 if (!NumWaitStates) {
444 CurrCycleInstr =
nullptr;
449 EmittedInstrs.push_front(CurrCycleInstr);
456 EmittedVALUInstrs.push_front(CurrCycleInstr);
462 while (!EmittedVALUInstrs.empty() && EmittedVALUInstrs.front() ==
nullptr)
463 EmittedVALUInstrs.pop_front();
471 EmittedInstrs.push_front(
nullptr);
478 if (EmittedVALUInstrs.size() > MaxVALULookAhead)
479 EmittedVALUInstrs.resize(MaxVALULookAhead);
481 CurrCycleInstr =
nullptr;
485 assert(!IsHazardRecognizerMode &&
486 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
496template <
typename StateT>
506 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
511 static unsigned getHashValue(
const StateMapKey &
Key) {
512 return StateT::getHashValue((*
Key.States)[
Key.Idx]);
514 static unsigned getHashValue(
const StateT &State) {
515 return StateT::getHashValue(State);
517 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
518 return StateT::isEqual((*
LHS.States)[
LHS.Idx], (*
RHS.States)[
RHS.Idx]);
520 static bool isEqual(
const StateT &
LHS,
const StateMapKey &
RHS) {
521 return StateT::isEqual(
LHS, (*
RHS.States)[
RHS.Idx]);
530 StateT State = InitialState;
533 unsigned WorkIdx = 0;
535 bool Expired =
false;
536 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
541 auto Result = IsHazard(State, *
I);
549 if (
I->isInlineAsm() ||
I->isMetaInstruction())
552 UpdateState(State, *
I);
556 unsigned StateIdx = States.
size();
557 StateMapKey
Key = {&States, StateIdx};
558 auto Insertion = StateMap.
insert_as(std::pair(
Key, StateIdx), State);
559 if (Insertion.second) {
562 StateIdx = Insertion.first->second;
565 Worklist.
insert(std::pair(Pred, StateIdx));
568 if (WorkIdx == Worklist.
size())
572 std::tie(
MBB, StateIdx) = Worklist[WorkIdx++];
573 State = States[StateIdx];
574 I =
MBB->instr_rbegin();
591 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
599 if (
I->isInlineAsm())
602 WaitStates += GetNumWaitStates(*
I);
604 if (IsExpired(*
I, WaitStates))
605 return std::numeric_limits<int>::max();
608 int MinWaitStates = std::numeric_limits<int>::max();
610 if (!Visited.
insert(Pred).second)
614 IsExpired, Visited, GetNumWaitStates);
616 MinWaitStates = std::min(MinWaitStates, W);
619 return MinWaitStates;
630 std::next(
MI->getReverseIterator()), 0, IsExpired,
631 Visited, GetNumWaitStates);
634int GCNHazardRecognizer::getWaitStatesSince(
635 IsHazardFn IsHazard,
int Limit, GetNumWaitStatesFn GetNumWaitStates)
const {
636 if (IsHazardRecognizerMode) {
637 auto IsExpiredFn = [Limit](
const MachineInstr &,
int WaitStates) {
638 return WaitStates >= Limit;
640 return ::getWaitStatesSince(IsHazard, CurrCycleInstr,
IsExpiredFn,
645 for (MachineInstr *
MI : EmittedInstrs) {
650 if (
MI->isInlineAsm())
653 WaitStates +=
MI ? GetNumWaitStates(*
MI) : 1;
655 if (WaitStates >= Limit)
658 return std::numeric_limits<int>::max();
661int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
666int GCNHazardRecognizer::getWaitStatesSinceVALU(IsHazardFn IsHazard,
668 if (IsHazardRecognizerMode) {
669 auto GetVALUWaitStates = [](
const MachineInstr &
MI) ->
unsigned {
672 return getWaitStatesSince(IsHazard, Limit, GetVALUWaitStates);
678 assert(Limit <= (
int)MaxVALULookAhead &&
679 "Limit exceeds the EmittedVALUInstrs lookahead window");
681 for (MachineInstr *
MI : EmittedVALUInstrs) {
689 if (WaitStates >= Limit)
692 return std::numeric_limits<int>::max();
695int GCNHazardRecognizer::getWaitStatesSinceDef(
unsigned Reg,
696 IsHazardFn IsHazardDef,
698 const SIRegisterInfo *TRI = ST.getRegisterInfo();
701 return IsHazardDef(
MI) &&
MI.modifiesRegister(
Reg, TRI);
707int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
722 for (MCRegUnit Unit :
TRI.regunits(
Reg))
723 BV.
set(
static_cast<unsigned>(Unit));
735void GCNHazardRecognizer::addClauseInst(
const MachineInstr &
MI)
const {
747int GCNHazardRecognizer::checkSoftClauseHazards(
MachineInstr *MEM)
const {
750 if (!ST.isXNACKEnabled())
753 bool IsSMRD = TII.isSMRD(*MEM);
767 for (MachineInstr *
MI : EmittedInstrs) {
779 if (ClauseDefs.none())
792 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
795int GCNHazardRecognizer::checkSMRDHazards(
MachineInstr *SMRD)
const {
796 int WaitStatesNeeded = 0;
798 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
801 if (!ST.hasSMRDReadVALUDefHazard())
802 return WaitStatesNeeded;
806 int SmrdSgprWaitStates = 4;
807 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
808 return TII.isVALU(
MI,
true);
810 auto IsBufferHazardDefFn = [
this](
const MachineInstr &
MI) {
811 return TII.isSALU(
MI);
814 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
816 for (
const MachineOperand &Use :
SMRD->uses()) {
819 int WaitStatesNeededForUse =
820 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
822 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
832 int WaitStatesNeededForUse =
833 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(),
836 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
840 return WaitStatesNeeded;
843int GCNHazardRecognizer::checkVMEMHazards(
MachineInstr *VMEM)
const {
844 if (!ST.hasVMEMReadSGPRVALUDefHazard())
847 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
851 const int VmemSgprWaitStates = 5;
852 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
853 return TII.isVALU(
MI,
true);
855 for (
const MachineOperand &Use :
VMEM->uses()) {
856 if (!
Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(),
Use.getReg()))
859 int WaitStatesNeededForUse =
860 VmemSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
862 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
864 return WaitStatesNeeded;
868 const SIRegisterInfo *TRI = ST.getRegisterInfo();
869 const SIInstrInfo *TII = ST.getInstrInfo();
872 int DppVgprWaitStates = 2;
873 int DppExecWaitStates = 5;
874 int WaitStatesNeeded = 0;
875 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
876 return TII->isVALU(
MI,
true);
879 for (
const MachineOperand &Use :
DPP->uses()) {
880 if (!
Use.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Use.getReg()))
882 int WaitStatesNeededForUse =
883 DppVgprWaitStates - getWaitStatesSinceDef(
885 [](
const MachineInstr &) { return true; },
887 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
890 WaitStatesNeeded = std::max(
892 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
895 return WaitStatesNeeded;
// Computes the wait states still owed before DivFMas may issue:
// DivFMasWaitStates minus the value getWaitStatesSinceDef reports for a
// VALU instruction that modifies VCC. Callers compare the result against 0
// (or fold it with std::max), so a non-positive value means no hazard.
// NOTE(review): the local named WaitStatesNeeded holds the value returned by
// getWaitStatesSinceDef, which is subtracted from the limit; the name reads
// as "still needed" but the arithmetic treats it as "already elapsed".
898int GCNHazardRecognizer::checkDivFMasHazards(
MachineInstr *DivFMas)
const {
899 const SIInstrInfo *TII = ST.getInstrInfo();
// Limit passed through to the final subtraction.
903 const int DivFMasWaitStates = 4;
// Predicate for the defining instruction: accepts whatever
// TII->isVALU(MI, true) accepts.
904 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
905 return TII->isVALU(
MI,
true);
907 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
910 return DivFMasWaitStates - WaitStatesNeeded;
// Computes the wait states still owed before GetRegInstr may issue:
// GetRegWaitStates minus the value getWaitStatesSinceSetReg reports for
// IsHazardFn. Callers compare the result against 0 (or fold it with
// std::max), so a non-positive value means no hazard.
913int GCNHazardRecognizer::checkGetRegHazards(
MachineInstr *GetRegInstr)
const {
914 const SIInstrInfo *TII = ST.getInstrInfo();
// Hardware-register identifier of this instruction, as returned by getHWReg.
915 unsigned GetRegHWReg =
getHWReg(TII, *GetRegInstr);
// Fixed limit, also handed to getWaitStatesSinceSetReg as its search bound.
917 const int GetRegWaitStates = 2;
// Predicate for getWaitStatesSinceSetReg; it captures TII and GetRegHWReg.
918 auto IsHazardFn = [TII, GetRegHWReg](
const MachineInstr &
MI) {
921 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, GetRegWaitStates);
// NOTE(review): WaitStatesNeeded is the value returned by
// getWaitStatesSinceSetReg and is subtracted from the limit, so despite its
// name it behaves as "wait states already elapsed".
923 return GetRegWaitStates - WaitStatesNeeded;
// Computes the wait states still owed before SetRegInstr may issue:
// SetRegWaitStates minus the value getWaitStatesSinceSetReg reports for
// IsHazardFn. Callers compare the result against 0 (or fold it with
// std::max), so a non-positive value means no hazard.
926int GCNHazardRecognizer::checkSetRegHazards(
MachineInstr *SetRegInstr)
const {
927 const SIInstrInfo *TII = ST.getInstrInfo();
// Hardware-register identifier of this instruction, as returned by getHWReg.
928 unsigned HWReg =
getHWReg(TII, *SetRegInstr);
// The limit is queried from the subtarget rather than hard-coded.
930 const int SetRegWaitStates = ST.getSetRegWaitStates();
// Predicate for getWaitStatesSinceSetReg; it captures TII and HWReg.
931 auto IsHazardFn = [TII, HWReg](
const MachineInstr &
MI) {
934 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, SetRegWaitStates);
// NOTE(review): WaitStatesNeeded is the value returned by
// getWaitStatesSinceSetReg and is subtracted from the limit, so despite its
// name it behaves as "wait states already elapsed".
935 return SetRegWaitStates - WaitStatesNeeded;
938int GCNHazardRecognizer::createsVALUHazard(
const MachineInstr &
MI)
const {
942 const SIInstrInfo *TII = ST.getInstrInfo();
943 unsigned Opcode =
MI.getOpcode();
944 const MCInstrDesc &
Desc =
MI.getDesc();
946 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
949 VDataRCID = TII->getOpRegClassID(
Desc.operands()[VDataIdx]);
951 if (TII->isMUBUF(
MI) || TII->isMTBUF(
MI)) {
961 if (ST.hasVDecCoExecHazard())
963 const MachineOperand *SOffset =
964 TII->getNamedOperand(
MI, AMDGPU::OpName::soffset);
965 if (!SOffset || !SOffset->
isReg())
974 if (TII->isMIMG(
MI)) {
975 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
977 Desc.operands()[SRsrcIdx])) == 256);
981 if (TII->isFLAT(
MI)) {
993int GCNHazardRecognizer::checkUniformWindowVALUHazardsHelper(
998 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1000 auto IsHazard = [&](
const MachineInstr &
MI) {
1001 int DataIdx = createsVALUHazard(
MI);
1002 return DataIdx >= 0 &&
1003 TRI->regsOverlap(
MI.getOperand(DataIdx).getReg(),
Reg);
1006 return std::max(0, 1 - getWaitStatesSince(IsHazard, 1));
1009int GCNHazardRecognizer::checkSOFFSETWindowVALUHazardsHelper(
1016 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1017 const SIInstrInfo *TII = ST.getInstrInfo();
1019 int WaitStatesNeeded = 0;
1023 for (
int Window = 1; Window <= 2; ++Window) {
1024 auto IsHazard = [&](
const MachineInstr &
MI) {
1025 int DataIdx = createsVALUHazard(
MI);
1027 !TRI->regsOverlap(
MI.getOperand(DataIdx).getReg(),
Reg))
1032 if (Window == 1 || !TII->isBUF(
MI))
1035 const MachineOperand *SOffset =
1036 TII->getNamedOperand(
MI, AMDGPU::OpName::soffset);
1037 return !SOffset || !SOffset->
isReg();
1039 WaitStatesNeeded = std::max(WaitStatesNeeded,
1040 Window - getWaitStatesSince(IsHazard, Window));
1043 return WaitStatesNeeded;
1046int GCNHazardRecognizer::checkVALUHazardsHelper(
1051 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1053 if (!TRI->isVectorRegister(MRI,
Def.getReg()))
1056 if (ST.hasVDecCoExecHazard())
1057 return checkSOFFSETWindowVALUHazardsHelper(
Def.getReg());
1059 return checkUniformWindowVALUHazardsHelper(
Def.getReg());
1075 unsigned Opcode =
MI.getOpcode();
1085 if (
auto *DstSel =
TII->getNamedOperand(
MI, AMDGPU::OpName::dst_sel))
1087 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1093 if (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src0_modifiers) &
1095 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1099 (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src2_modifiers) &
1101 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1107 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1128 for (
auto &Operand : VALU->operands()) {
1129 if (Operand.isReg() &&
TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1136int GCNHazardRecognizer::checkVALUHazards(
MachineInstr *VALU)
const {
1137 int WaitStatesNeeded = 0;
1140 const int TransDefWaitstates = 1;
1142 auto IsTransDefFn = [
this,
VALU](
const MachineInstr &
MI) {
1145 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1146 const SIInstrInfo *TII = ST.getInstrInfo();
1147 Register Def = TII->getNamedOperand(
MI, AMDGPU::OpName::vdst)->getReg();
1149 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1150 if (
Use.isReg() && TRI->regsOverlap(Def,
Use.getReg()))
1157 int WaitStatesNeededForDef =
1158 TransDefWaitstates -
1159 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1160 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1163 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1164 const int Shift16DefWaitstates = 1;
1166 auto IsShift16BitDefFn = [
this,
VALU](
const MachineInstr &ProducerMI) {
1167 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1168 const MachineOperand *ForwardedDst =
1174 if (ProducerMI.isInlineAsm()) {
1176 for (
auto &Def : ProducerMI.all_defs()) {
1185 int WaitStatesNeededForDef =
1186 Shift16DefWaitstates -
1187 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1188 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1191 if (ST.hasVDecCoExecHazard()) {
1192 const int VALUWriteSGPRVALUReadWaitstates = 2;
1193 const int VALUWriteEXECRWLane = 4;
1194 const int VALUWriteVGPRReadlaneRead = 1;
1196 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1197 const MachineRegisterInfo &MRI = MF.getRegInfo();
1199 auto IsVALUDefSGPRFn = [&
UseReg, TRI](
const MachineInstr &
MI) {
1202 return MI.modifiesRegister(
UseReg, TRI);
1205 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1210 if (TRI->isSGPRReg(MRI,
UseReg)) {
1211 int WaitStatesNeededForDef =
1212 VALUWriteSGPRVALUReadWaitstates -
1213 getWaitStatesSince(IsVALUDefSGPRFn,
1214 VALUWriteSGPRVALUReadWaitstates);
1215 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1219 if (
VALU->readsRegister(AMDGPU::VCC, TRI)) {
1221 int WaitStatesNeededForDef =
1222 VALUWriteSGPRVALUReadWaitstates -
1223 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1224 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1227 switch (
VALU->getOpcode()) {
1228 case AMDGPU::V_READLANE_B32:
1229 case AMDGPU::V_READFIRSTLANE_B32: {
1230 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1232 int WaitStatesNeededForDef =
1233 VALUWriteVGPRReadlaneRead -
1234 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1235 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1238 case AMDGPU::V_WRITELANE_B32: {
1240 int WaitStatesNeededForDef =
1241 VALUWriteEXECRWLane -
1242 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1243 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1253 if (!ST.has12DWordStoreHazard())
1254 return WaitStatesNeeded;
1256 const MachineRegisterInfo &MRI = MF.getRegInfo();
1258 for (
const MachineOperand &Def :
VALU->defs()) {
1259 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1262 return WaitStatesNeeded;
1265int GCNHazardRecognizer::checkInlineAsmHazards(
MachineInstr *IA)
const {
1274 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1275 !ST.hasCvtScaleForwardingHazard())
1278 const MachineRegisterInfo &MRI = MF.getRegInfo();
1279 int WaitStatesNeeded = 0;
1281 for (
const MachineOperand &
Op :
1283 if (
Op.isReg() &&
Op.isDef()) {
1284 if (!TRI.isVectorRegister(MRI,
Op.getReg()))
1287 if (ST.has12DWordStoreHazard()) {
1289 std::max(WaitStatesNeeded, checkVALUHazardsHelper(
Op, MRI));
1294 if (ST.hasDstSelForwardingHazard()) {
1295 const int Shift16DefWaitstates = 1;
1297 auto IsShift16BitDefFn = [
this, &
IA](
const MachineInstr &ProducerMI) {
1301 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1302 IA->readsRegister(Dst->getReg(), &TRI);
1304 if (ProducerMI.isInlineAsm()) {
1306 for (
auto &Def : ProducerMI.all_defs()) {
1307 if (
IA->modifiesRegister(
Def.getReg(), &TRI) ||
1308 IA->readsRegister(
Def.getReg(), &TRI)) {
1317 int WaitStatesNeededForDef =
1318 Shift16DefWaitstates -
1319 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1320 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1323 return WaitStatesNeeded;
1326int GCNHazardRecognizer::checkRWLaneHazards(
MachineInstr *RWLane)
const {
1327 const SIInstrInfo *TII = ST.getInstrInfo();
1328 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1329 const MachineRegisterInfo &MRI = MF.getRegInfo();
1331 const MachineOperand *LaneSelectOp =
1332 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1334 if (!LaneSelectOp->
isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->
getReg()))
1339 return TII->isVALU(
MI,
true);
1342 const int RWLaneWaitStates = 4;
1343 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg,
IsHazardFn,
1345 return RWLaneWaitStates - WaitStatesSince;
1348int GCNHazardRecognizer::checkRFEHazards(
MachineInstr *RFE)
const {
1349 if (!ST.hasRFEHazards())
1352 const SIInstrInfo *TII = ST.getInstrInfo();
1354 const int RFEWaitStates = 1;
1359 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, RFEWaitStates);
1360 return RFEWaitStates - WaitStatesNeeded;
// Computes the wait states still owed before MI may issue: ReadM0WaitStates
// minus the value getWaitStatesSinceDef reports for an SALU instruction that
// modifies M0. Callers compare the result against 0 (or fold it with
// std::max), so a non-positive value means no hazard.
1363int GCNHazardRecognizer::checkReadM0Hazards(
MachineInstr *
MI)
const {
1364 const SIInstrInfo *TII = ST.getInstrInfo();
// Limit used both as the search bound and as the minuend of the result.
1365 const int ReadM0WaitStates = 1;
// Predicate for the defining instruction: accepts SALU instructions only.
// The lambda parameter MI shadows this function's MI; inside the lambda it
// names the candidate instruction handed in by the search.
1366 auto IsHazardFn = [TII](
const MachineInstr &
MI) {
return TII->isSALU(
MI); };
1367 return ReadM0WaitStates -
1368 getWaitStatesSinceDef(AMDGPU::M0,
IsHazardFn, ReadM0WaitStates);
1373 int WaitStatesNeeded,
bool IsHoisting) {
1375 for (
int I = 0;
I < WaitStatesNeeded; ++
I)
1376 BuildMI(
MBB, InsertPt,
DL, TII.get(AMDGPU::V_NOP_e32));
1380 fixVMEMtoScalarWriteHazards(
MI);
1381 fixVcmpxPermlaneHazards(
MI);
1382 fixSMEMtoVectorWriteHazards(
MI);
1383 fixVcmpxExecWARHazard(
MI);
1384 fixLdsBranchVmemWARHazard(
MI);
1385 if (ST.hasLdsDirect()) {
1386 fixLdsDirectVALUHazard(
MI);
1387 fixLdsDirectVMEMHazard(
MI);
1389 fixVALUPartialForwardingHazard(
MI);
1390 fixVALUTransUseHazard(
MI);
1391 fixVALUTransCoexecutionHazards(
MI);
1393 fixWMMACoexecutionHazards(
MI);
1394 fixShift64HighRegBug(
MI);
1395 fixVALUMaskWriteHazard(
MI);
1396 fixRequiredExportPriority(
MI);
1397 if (ST.requiresWaitIdleBeforeGetReg())
1398 fixGetRegWaitIdle(
MI);
1399 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1400 fixDsAtomicAsyncBarrierArriveB64(
MI);
1401 if (ST.hasScratchBaseForwardingHazard())
1402 fixScratchBaseForwardingHazard(
MI);
1403 if (ST.setRegModeNeedsVNOPs())
1409 return (
TII.isVOPC(
MI) ||
1410 (
MI.isCompare() && (
TII.isVOP3(
MI) ||
TII.isSDWA(
MI)))) &&
1411 MI.modifiesRegister(AMDGPU::EXEC, &
TRI);
1414bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(
MachineInstr *
MI) {
1418 const SIInstrInfo *TII = ST.getInstrInfo();
1419 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1425 unsigned Opc =
MI.getOpcode();
1427 Opc != AMDGPU::V_NOP_e32 &&
Opc != AMDGPU::V_NOP_e64 &&
1428 Opc != AMDGPU::V_NOP_sdwa;
1432 std::numeric_limits<int>::max())
1438 auto *Src0 = TII->getNamedOperand(*
MI, AMDGPU::OpName::src0);
1440 bool IsUndef = Src0->isUndef();
1442 TII->get(AMDGPU::V_MOV_B32_e32))
1449bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(
MachineInstr *
MI) {
1450 if (!ST.hasVMEMtoScalarWriteHazard())
1452 assert(!ST.hasExtendedWaitCounts());
1457 if (
MI->getNumDefs() == 0)
1460 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1466 for (
const MachineOperand &Def :
MI->defs()) {
1467 const MachineOperand *
Op =
1468 I.findRegisterUseOperand(
Def.getReg(), TRI,
false);
1478 (
MI.getOpcode() == AMDGPU::S_WAITCNT &&
1479 !
MI.getOperand(0).getImm()) ||
1480 (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1485 std::numeric_limits<int>::max())
1488 const SIInstrInfo *TII = ST.getInstrInfo();
1490 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1495bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(
MachineInstr *
MI) {
1496 if (!ST.hasSMEMtoVectorWriteHazard())
1498 assert(!ST.hasExtendedWaitCounts());
1503 AMDGPU::OpName SDSTName;
1504 switch (
MI->getOpcode()) {
1505 case AMDGPU::V_READLANE_B32:
1506 case AMDGPU::V_READFIRSTLANE_B32:
1507 SDSTName = AMDGPU::OpName::vdst;
1510 SDSTName = AMDGPU::OpName::sdst;
1514 const SIInstrInfo *TII = ST.getInstrInfo();
1515 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1517 const MachineOperand *SDST = TII->getNamedOperand(*
MI, SDSTName);
1519 for (
const auto &MO :
MI->implicit_operands()) {
1520 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1531 auto IsHazardFn = [SDSTReg, TRI](
const MachineInstr &
I) {
1536 if (TII->isSALU(
MI)) {
1537 switch (
MI.getOpcode()) {
1538 case AMDGPU::S_SETVSKIP:
1539 case AMDGPU::S_VERSION:
1540 case AMDGPU::S_WAITCNT_VSCNT:
1541 case AMDGPU::S_WAITCNT_VMCNT:
1542 case AMDGPU::S_WAITCNT_EXPCNT:
1545 case AMDGPU::S_WAITCNT_LGKMCNT:
1547 return (
MI.getOperand(1).getImm() == 0) &&
1548 (
MI.getOperand(0).
getReg() == AMDGPU::SGPR_NULL);
1549 case AMDGPU::S_WAITCNT: {
1550 const int64_t
Imm =
MI.getOperand(0).getImm();
1557 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1558 "unexpected wait count instruction");
1560 if (TII->isSOPP(
MI))
1576 std::numeric_limits<int>::max())
1580 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1585bool GCNHazardRecognizer::fixVcmpxExecWARHazard(
MachineInstr *
MI) {
1586 if (!ST.hasVcmpxExecWARHazard())
1588 assert(!ST.hasExtendedWaitCounts());
1593 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1594 if (!
MI->modifiesRegister(AMDGPU::EXEC, TRI))
1600 return I.readsRegister(AMDGPU::EXEC, TRI);
1603 const SIInstrInfo *TII = ST.getInstrInfo();
1604 auto IsExpiredFn = [TII, TRI](
const MachineInstr &
MI, int) {
1606 if (TII->getNamedOperand(
MI, AMDGPU::OpName::sdst))
1608 for (
auto MO :
MI.implicit_operands())
1609 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1612 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1619 std::numeric_limits<int>::max())
1623 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1630 if (!ST.hasLdsBranchVmemWARHazard())
1635 bool HasLds =
false;
1636 bool HasVmem =
false;
1637 for (
auto &
MBB : MF) {
1638 for (
auto &
MI :
MBB) {
1641 if (HasLds && HasVmem)
1649 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1650 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1651 !
I.getOperand(1).getImm();
1654bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(
MachineInstr *
MI) {
1655 if (!RunLdsBranchVmemWARHazardFixup)
1658 assert(ST.hasLdsBranchVmemWARHazard());
1659 assert(!ST.hasExtendedWaitCounts());
1661 auto IsHazardInst = [](
const MachineInstr &
MI) {
1669 auto InstType = IsHazardInst(*
MI);
1673 auto IsExpiredFn = [&IsHazardInst](
const MachineInstr &
I, int) {
1677 auto IsHazardFn = [InstType, &IsHazardInst](
const MachineInstr &
I) {
1681 auto IsHazardFn = [InstType, IsHazardInst](
const MachineInstr &
I) {
1682 auto InstType2 = IsHazardInst(
I);
1683 return InstType2 && InstType != InstType2;
1686 auto IsExpiredFn = [InstType, &IsHazardInst](
const MachineInstr &
I, int) {
1687 auto InstType2 = IsHazardInst(
I);
1688 if (InstType == InstType2)
1695 std::numeric_limits<int>::max();
1699 std::numeric_limits<int>::max())
1702 const SIInstrInfo *TII = ST.getInstrInfo();
1704 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1711bool GCNHazardRecognizer::fixLdsDirectVALUHazard(
MachineInstr *
MI) {
1715 const int NoHazardWaitStates = 15;
1716 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1719 bool VisitedTrans =
false;
1720 auto IsHazardFn = [
this, VDSTReg, &VisitedTrans](
const MachineInstr &
I) {
1725 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1727 auto IsExpiredFn = [&](
const MachineInstr &
I,
int WaitStates) {
1728 if (WaitStates >= NoHazardWaitStates)
1734 auto GetWaitStatesFn = [](
const MachineInstr &
MI) {
1738 DenseSet<const MachineBasicBlock *> Visited;
1740 std::next(
MI->getReverseIterator()), 0,
1748 MachineOperand *WaitVdstOp =
1749 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvdst);
1750 WaitVdstOp->
setImm(std::min(
Count, NoHazardWaitStates));
1755bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(
MachineInstr *
MI) {
1759 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1762 auto IsHazardFn = [
this, VDSTReg](
const MachineInstr &
I) {
1765 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1767 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1770 auto IsExpiredFn = [
this, LdsdirCanWait](
const MachineInstr &
I, int) {
1773 (
I.getOpcode() == AMDGPU::S_WAITCNT && !
I.getOperand(0).getImm()) ||
1774 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1777 !TII.getNamedOperand(
I, AMDGPU::OpName::waitvsrc)->getImm());
1781 std::numeric_limits<int>::max())
1784 if (LdsdirCanWait) {
1785 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1788 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1795bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(
MachineInstr *
MI) {
1796 if (!ST.hasVALUPartialForwardingHazard())
1798 assert(!ST.hasExtendedWaitCounts());
1803 SmallSetVector<Register, 4> SrcVGPRs;
1805 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1806 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
1811 if (SrcVGPRs.
size() <= 1)
1829 const int Intv1plus2MaxVALUs = 2;
1830 const int Intv3MaxVALUs = 4;
1831 const int IntvMaxVALUs = 6;
1832 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1835 SmallDenseMap<Register, int, 4> DefPos;
1836 int ExecPos = std::numeric_limits<int>::max();
1839 static unsigned getHashValue(
const StateType &State) {
1843 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1844 return LHS.DefPos ==
RHS.DefPos &&
LHS.ExecPos ==
RHS.ExecPos &&
1852 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1854 if (State.VALUs > NoHazardVALUWaitStates)
1860 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1868 if (!State.DefPos.count(Src) &&
I.modifiesRegister(Src, &TRI)) {
1869 State.DefPos[Src] = State.VALUs;
1874 if (State.ExecPos == std::numeric_limits<int>::max()) {
1875 if (!State.DefPos.empty() &&
I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1876 State.ExecPos = State.VALUs;
1883 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1891 if (State.ExecPos == std::numeric_limits<int>::max())
1894 int PreExecPos = std::numeric_limits<int>::max();
1895 int PostExecPos = std::numeric_limits<int>::max();
1897 for (
auto Entry : State.DefPos) {
1898 int DefVALUs =
Entry.second;
1899 if (DefVALUs != std::numeric_limits<int>::max()) {
1900 if (DefVALUs >= State.ExecPos)
1901 PreExecPos = std::min(PreExecPos, DefVALUs);
1903 PostExecPos = std::min(PostExecPos, DefVALUs);
1908 if (PostExecPos == std::numeric_limits<int>::max())
1912 int Intv3VALUs = PostExecPos;
1913 if (Intv3VALUs > Intv3MaxVALUs)
1917 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1918 if (Intv2VALUs > Intv1plus2MaxVALUs)
1922 if (PreExecPos == std::numeric_limits<int>::max())
1926 int Intv1VALUs = PreExecPos - State.ExecPos;
1927 if (Intv1VALUs > Intv1plus2MaxVALUs)
1931 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1936 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
1942 std::next(
MI->getReverseIterator())))
1946 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1952bool GCNHazardRecognizer::fixVALUTransUseHazard(
MachineInstr *
MI) {
1953 if (!ST.hasVALUTransUseHazard())
1955 assert(!ST.hasExtendedWaitCounts());
1960 SmallSet<Register, 4> SrcVGPRs;
1962 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1963 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
1977 const int IntvMaxVALUs = 5;
1978 const int IntvMaxTRANS = 1;
1984 static unsigned getHashValue(
const StateType &State) {
1987 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1988 return LHS.VALUs ==
RHS.VALUs &&
LHS.TRANS ==
RHS.TRANS;
1995 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1997 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
2003 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2010 if (
I.modifiesRegister(Src, &TRI)) {
2018 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
2026 std::next(
MI->getReverseIterator())))
2032 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2038bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(
MachineInstr *
MI) {
2039 if (!ST.hasTransCoexecutionHazard() ||
2044 const SIInstrInfo *TII = ST.getInstrInfo();
2045 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2047 auto IsTransHazardFn = [
MI, TII, TRI](
const MachineInstr &
I) {
2052 Register TransDef = TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
2053 for (
const MachineOperand &ValuUse :
MI->explicit_uses()) {
2054 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
2058 auto *ValuDst = TII->getNamedOperand(*
MI, AMDGPU::OpName::vdst);
2059 if (!ValuDst || !ValuDst->isReg())
2063 Register ValuDef = ValuDst->getReg();
2064 for (
const MachineOperand &TransUse :
I.explicit_uses()) {
2065 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
2076 const int HasVALU = std::numeric_limits<int>::max();
2077 if (::getWaitStatesSince(IsTransHazardFn,
MI,
IsExpiredFn) == HasVALU)
2080 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2088 const SIInstrInfo *TII = ST.getInstrInfo();
2089 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2091 auto IsHazardFn = [
MI, TII, TRI,
this](
const MachineInstr &
I) {
2098 TII->getNamedOperand(*
MI, AMDGPU::OpName::src0)->getReg();
2100 TII->getNamedOperand(*
MI, AMDGPU::OpName::src1)->getReg();
2103 TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
2105 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2106 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2115 TII->getNamedOperand(*
MI, AMDGPU::OpName::src2)->getReg();
2116 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2130 std::numeric_limits<int>::max())
2133 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2188 bool IsLowestRateWMMA = ST.hasGFX125xLowestRateWMMA();
2189 unsigned Category = 0;
2191 unsigned Latency = SchedModel.computeInstrLatency(&
MI);
2194 Category = IsSWMMAC ? 2 : 0;
2197 Category = IsLowestRateWMMA ? 4 : (IsSWMMAC ? 3 : 1);
2200 assert(IsLowestRateWMMA &&
"latency 32 is not expected");
2210int GCNHazardRecognizer::checkWMMACoexecutionHazards(
MachineInstr *
MI)
const {
2211 if (!ST.hasWMMACoexecutionHazards())
2214 const SIInstrInfo *TII = ST.getInstrInfo();
2223 const int WMMAWaitStates[] = {5, 9, 3, 5, 9, 17};
2224 const int VALUWaitStates[] = {4, 8, 2, 4, 8, 16};
2225 unsigned Category = 0;
2227 auto IsWMMAHazardFn = [
MI, TII, &Category,
this](
const MachineInstr &
I) {
2228 if (!TII->isXDLWMMA(
I))
2232 return hasWMMAToWMMARegOverlap(
I, *
MI);
2235 auto IsVALUHazardFn = [
MI, TII, &Category,
this](
const MachineInstr &
I) {
2236 if (!TII->isXDLWMMA(
I))
2240 return hasWMMAToVALURegOverlap(
I, *
MI);
2243 int WaitStatesNeeded = -1;
2244 int ExistingVALUs = 0;
2245 bool IsLowestRateWMMA = ST.hasGFX125xLowestRateWMMA();
2253 if (TII->isXDLWMMA(*
MI)) {
2255 const int WMMAWaitsLimit = IsLowestRateWMMA ? 17 : 9;
2256 ExistingVALUs = getWaitStatesSinceVALU(IsWMMAHazardFn, WMMAWaitsLimit);
2257 WaitStatesNeeded = WMMAWaitStates[Category] - ExistingVALUs;
2260 const int VALUWaitsLimit = IsLowestRateWMMA ? 16 : 8;
2261 ExistingVALUs = getWaitStatesSinceVALU(IsVALUHazardFn, VALUWaitsLimit);
2262 WaitStatesNeeded = VALUWaitStates[Category] - ExistingVALUs;
2265 return WaitStatesNeeded;
2268bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2270 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2271 Register A1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src0)->getReg();
2272 Register B1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src1)->getReg();
2275 if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
2279 Register Idx1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src2)->getReg();
2280 if (TRI.regsOverlap(D0, Idx1))
2286bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2289 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2290 for (
const MachineOperand &ValuUse :
MI.explicit_uses()) {
2291 if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
2296 Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
2297 Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
2301 Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
2302 WMMARegs.push_back(Idx0);
2305 for (
const MachineOperand &ValuDef :
MI.defs()) {
2306 Register VDstReg = ValuDef.getReg();
2307 for (
Register WMMAReg : WMMARegs) {
2308 if (TRI.regsOverlap(VDstReg, WMMAReg))
2315bool GCNHazardRecognizer::isCoexecutionHazardFor(
const MachineInstr &
I,
2319 if (!TII.isXDLWMMA(
I))
2323 if (TII.isXDLWMMA(
MI))
2324 return hasWMMAToWMMARegOverlap(
I,
MI);
2326 return hasWMMAToVALURegOverlap(
I,
MI);
2332 bool IncludeSubloops) {
2335 for (MachineBasicBlock *
MBB :
L->getBlocks()) {
2336 if (!IncludeSubloops && MLI->getLoopFor(
MBB) != L)
2338 for (MachineInstr &
I : *
MBB) {
2341 if (isCoexecutionHazardFor(
I, *
MI))
2348bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(
MachineInstr *
MI,
2349 int WaitStatesNeeded) {
2353 MachineLoop *
L = MLI->getLoopFor(
MI->getParent());
2355 ++NumWMMAHoistingBailed;
2360 if (hasWMMAHazardInLoop(L,
MI)) {
2361 ++NumWMMAHoistingBailed;
2366 MachineLoop *TargetLoop =
L;
2368 if (hasWMMAHazardInLoop(Parent,
MI,
false))
2370 TargetLoop = Parent;
2376 ++NumWMMAHoistingBailed;
2380 LLVM_DEBUG(
dbgs() <<
"WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2386 NumWMMANopsHoisted += WaitStatesNeeded;
2390bool GCNHazardRecognizer::fixWMMACoexecutionHazards(
MachineInstr *
MI) {
2391 int WaitStatesNeeded = checkWMMACoexecutionHazards(
MI);
2392 if (WaitStatesNeeded <= 0)
2398 emitVNops(*
MI->getParent(),
MI->getIterator(), WaitStatesNeeded);
2402bool GCNHazardRecognizer::fixShift64HighRegBug(
MachineInstr *
MI) {
2403 if (!ST.hasShift64HighRegBug())
2405 assert(!ST.hasExtendedWaitCounts());
2407 switch (
MI->getOpcode()) {
2410 case AMDGPU::V_LSHLREV_B64_e64:
2411 case AMDGPU::V_LSHRREV_B64_e64:
2412 case AMDGPU::V_ASHRREV_I64_e64:
2416 MachineOperand *Amt = TII.getNamedOperand(*
MI, AMDGPU::OpName::src0);
2421 const MachineRegisterInfo &MRI = MF.getRegInfo();
2423 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2426 if (AmtReg != AMDGPU::VGPR255 && MRI.
isPhysRegUsed(AmtReg + 1))
2429 assert(ST.needsAlignedVGPRs());
2430 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2433 MachineBasicBlock *
MBB =
MI->getParent();
2434 MachineOperand *Src1 = TII.getNamedOperand(*
MI, AMDGPU::OpName::src1);
2445 Register DstReg =
MI->getOperand(0).getReg();
2447 Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
2455 bool Overlapped =
MI->modifiesRegister(AmtReg, &TRI);
2457 for (MCRegister
Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2458 : AMDGPU::VGPR_32RegClass) {
2459 if (!
MI->modifiesRegister(
Reg, &TRI) && !
MI->readsRegister(
Reg, &TRI)) {
2465 Register NewAmt = Overlapped ? (
Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2470 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2483 runOnInstruction(
BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2490 BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
2496 BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
2510 MI->getOperand(0).setReg(NewReg);
2519int GCNHazardRecognizer::checkNSAtoVMEMHazard(
MachineInstr *
MI)
const {
2520 int NSAtoVMEMWaitStates = 1;
2522 if (!ST.hasNSAtoVMEMBug())
2528 const SIInstrInfo *TII = ST.getInstrInfo();
2529 const auto *
Offset = TII->getNamedOperand(*
MI, AMDGPU::OpName::offset);
2537 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2538 TII->getInstSizeInBytes(
I) >= 16;
2541 return NSAtoVMEMWaitStates - getWaitStatesSince(
IsHazardFn, 1);
2544int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2546 int FPAtomicToDenormModeWaitStates = 3;
2548 if (!ST.hasFPAtomicToDenormModeHazard())
2550 assert(!ST.hasExtendedWaitCounts());
2552 if (
MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2561 auto IsExpiredFn = [](
const MachineInstr &
MI,
int WaitStates) {
2568 return FPAtomicToDenormModeWaitStates -
2572int GCNHazardRecognizer::checkMAIHazards(
MachineInstr *
MI)
const {
2575 return ST.hasGFX90AInsts() ? checkMAIHazards90A(
MI) : checkMAIHazards908(
MI);
2578int GCNHazardRecognizer::checkMFMAPadding(
MachineInstr *
MI)
const {
2583 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2587 int NeighborMFMALatency = 0;
2588 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2589 this](
const MachineInstr &
MI) {
2593 NeighborMFMALatency = this->getMFMAPipelineWaitStates(
MI);
2597 const int MaxMFMAPipelineWaitStates = 16;
2598 int WaitStatesSinceNeighborMFMA =
2599 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2601 int NeighborMFMAPaddingNeeded =
2603 WaitStatesSinceNeighborMFMA;
2605 return std::max(0, NeighborMFMAPaddingNeeded);
2608int GCNHazardRecognizer::checkMAIHazards908(
MachineInstr *
MI)
const {
2609 int WaitStatesNeeded = 0;
2610 unsigned Opc =
MI->getOpcode();
2612 auto IsVALUFn = [](
const MachineInstr &
MI) {
2616 if (
Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
2617 const int LegacyVALUWritesVGPRWaitStates = 2;
2618 const int VALUWritesExecWaitStates = 4;
2619 const int MaxWaitStates = 4;
2621 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2622 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2623 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2625 if (WaitStatesNeeded < MaxWaitStates) {
2626 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2627 const int MaxWaitStates = 2;
2629 if (!
Use.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
2632 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2633 getWaitStatesSinceDef(
Use.getReg(), IsVALUFn, MaxWaitStates);
2634 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2636 if (WaitStatesNeeded == MaxWaitStates)
2642 for (
const MachineOperand &
Op :
MI->explicit_operands()) {
2643 if (!
Op.isReg() || !TRI.isAGPR(MF.getRegInfo(),
Op.getReg()))
2646 if (
Op.isDef() &&
Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2649 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2650 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2651 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2652 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2653 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2654 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2655 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2656 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2657 const int MaxWaitStates = 18;
2659 unsigned HazardDefLatency = 0;
2661 auto IsOverlappedMFMAFn = [
Reg, &HazardDefLatency,
2662 this](
const MachineInstr &
MI) {
2669 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2670 return TRI.regsOverlap(DstReg,
Reg);
2673 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn,
2675 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2676 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
2677 int OpNo =
Op.getOperandNo();
2678 if (OpNo == SrcCIdx) {
2679 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2680 }
else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2681 switch (HazardDefLatency) {
2682 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2684 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2686 case 16: [[fallthrough]];
2687 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2690 }
else if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2691 switch (HazardDefLatency) {
2692 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2694 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2696 case 16: [[fallthrough]];
2697 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2702 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2703 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2705 if (WaitStatesNeeded == MaxWaitStates)
2706 return WaitStatesNeeded;
2708 auto IsAccVgprWriteFn = [
Reg,
this](
const MachineInstr &
MI) {
2709 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2712 return TRI.regsOverlap(
Reg, DstReg);
2715 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2716 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2717 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2718 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2719 if (OpNo == SrcCIdx)
2720 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2721 else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2722 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2724 WaitStatesNeededForUse = NeedWaitStates -
2725 getWaitStatesSinceDef(
Reg, IsAccVgprWriteFn, MaxWaitStates);
2726 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2728 if (WaitStatesNeeded == MaxWaitStates)
2729 return WaitStatesNeeded;
2732 if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2733 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2734 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2735 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2736 const int MaxWaitStates = 13;
2737 Register DstReg =
MI->getOperand(0).getReg();
2738 unsigned HazardDefLatency = 0;
2740 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2741 this](
const MachineInstr &
MI) {
2744 Register Reg = TII.getNamedOperand(
MI, AMDGPU::OpName::src2)->getReg();
2746 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2747 return TRI.regsOverlap(
Reg, DstReg);
2750 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2752 switch (HazardDefLatency) {
2753 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2755 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2757 case 16: [[fallthrough]];
2758 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2762 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2763 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2767 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2769 return WaitStatesNeeded;
2780 return NumPasses + 1 + IsGFX950;
2791 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2809 return NumPasses + 2;
2819 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2822int GCNHazardRecognizer::checkMAIHazards90A(
MachineInstr *
MI)
const {
2823 int WaitStatesNeeded = 0;
2824 unsigned Opc =
MI->getOpcode();
2826 auto IsLegacyVALUFn = [](
const MachineInstr &
MI) {
2831 auto IsLegacyVALUNotDotFn = [](
const MachineInstr &
MI) {
2837 return WaitStatesNeeded;
2839 const int VALUWritesExecWaitStates = 4;
2840 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2841 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2842 VALUWritesExecWaitStates);
2843 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2845 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
2848 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2849 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2850 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2851 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2852 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2853 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2854 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2855 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2856 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2857 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2858 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2859 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2860 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2861 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2862 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2863 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2864 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2865 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2866 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2867 const int MaxWaitStates = 19;
2873 const MachineInstr *MI1;
2875 auto IsOverlappedMFMAFn = [
Reg, &FullReg, &MI1,
2876 this](
const MachineInstr &
MI) {
2880 FullReg = (DstReg ==
Reg);
2882 return TRI.regsOverlap(DstReg,
Reg);
2885 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2886 getWaitStatesSinceDef(
Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2887 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2890 getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn, MaxWaitStates);
2891 if (NumWaitStates == std::numeric_limits<int>::max())
2894 int OpNo =
Use.getOperandNo();
2896 int NeedWaitStates = 0;
2897 if (OpNo == SrcCIdx) {
2901 }
else if (FullReg) {
2902 if ((
Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2903 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2904 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2905 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2906 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2907 else if (ST.hasGFX940Insts() &&
2908 TSchedModel.computeInstrLatency(MI1) == 2)
2909 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2912 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2913 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2914 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2915 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2916 if (!TII.isXDL(*
MI))
2919 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2920 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2922 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2923 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2924 if (!TII.isXDL(*
MI))
2925 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2928 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2929 if (ST.hasGFX940Insts()) {
2930 if (TII.isXDL(*
MI) && !TII.isXDL(*MI1))
2937 NumPasses, ST.hasGFX950Insts())
2939 NumPasses, ST.hasGFX950Insts()))
2945 switch (NumPasses) {
2949 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2950 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2955 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2956 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2961 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2962 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2971 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2972 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2973 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2974 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2977 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2978 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2980 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2981 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2982 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2985 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2987 if (ST.hasGFX940Insts()) {
2991 NumPasses, ST.hasGFX950Insts())
2997 switch (NumPasses) {
2999 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
3004 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
3008 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
3012 if (WaitStatesNeeded >= NeedWaitStates)
3015 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
3016 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3018 if (WaitStatesNeeded == MaxWaitStates)
3023 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
3025 return WaitStatesNeeded;
3028int GCNHazardRecognizer::checkMAILdStHazards(
MachineInstr *
MI)
const {
3030 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
3033 int WaitStatesNeeded = 0;
3035 auto IsAccVgprReadFn = [](
const MachineInstr &
MI) {
3036 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
3039 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
3040 if (!
Op.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Op.getReg()))
3045 const int AccVgprReadLdStWaitStates = 2;
3046 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
3047 const int MaxWaitStates = 2;
3049 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
3050 getWaitStatesSinceDef(
Reg, IsAccVgprReadFn, MaxWaitStates);
3051 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3053 if (WaitStatesNeeded == MaxWaitStates)
3054 return WaitStatesNeeded;
3056 auto IsVALUAccVgprRdWrCheckFn = [
Reg,
this](
const MachineInstr &
MI) {
3057 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
3058 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
3060 auto IsVALUFn = [](
const MachineInstr &
MI) {
3064 return getWaitStatesSinceDef(
Reg, IsVALUFn, 2 ) <
3065 std::numeric_limits<int>::max();
3068 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
3069 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
3070 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3073 return WaitStatesNeeded;
3076int GCNHazardRecognizer::checkPermlaneHazards(
MachineInstr *
MI)
const {
3077 assert(!ST.hasVcmpxPermlaneHazard() &&
3078 "this is a different vcmpx+permlane hazard");
3079 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3080 const SIInstrInfo *TII = ST.getInstrInfo();
3082 auto IsVCmpXWritesExecFn = [TII, TRI](
const MachineInstr &
MI) {
3086 auto IsVALUFn = [](
const MachineInstr &
MI) {
3090 const int VCmpXWritesExecWaitStates = 4;
3091 const int VALUWritesVDstWaitStates = 2;
3092 int WaitStatesNeeded = 0;
3094 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
3095 if (!
Op.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Op.getReg()))
3099 int WaitStatesSinceDef =
3100 VALUWritesVDstWaitStates -
3101 getWaitStatesSinceDef(
Reg, IsVALUFn,
3102 VALUWritesVDstWaitStates);
3103 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
3104 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
3108 int VCmpXHazardWaits =
3109 VCmpXWritesExecWaitStates -
3110 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
3112 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
3113 return WaitStatesNeeded;
3121 return NumPasses + 2;
3131 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3141 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3149 return NumPasses + 2;
3152int GCNHazardRecognizer::checkMAIVALUHazards(
MachineInstr *
MI)
const {
3153 if (!ST.hasGFX90AInsts())
3156 auto IsDGEMMFn = [](
const MachineInstr &
MI) ->
bool {
3164 const MachineRegisterInfo &MRI = MF.getRegInfo();
3166 int WaitStatesNeeded = 0;
3172 const MachineInstr *
MFMA =
nullptr;
3174 auto IsMFMAWriteFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
3176 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
3182 const MachineInstr *
DOT =
nullptr;
3183 auto IsDotWriteFn = [&
Reg, &
DOT,
this](
const MachineInstr &
MI) {
3185 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
3191 bool DGEMMAfterVALUWrite =
false;
3192 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite,
this](
const MachineInstr &
MI) {
3195 DGEMMAfterVALUWrite =
true;
3199 if (!TII.isVALU(
MI,
true) || !DGEMMAfterVALUWrite)
3205 int SrcCIdx = AMDGPU::getNamedOperandIdx(
MI->getOpcode(),
3206 AMDGPU::OpName::src2);
3208 if (IsMemOrExport || IsVALU) {
3209 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
3210 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3211 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3212 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3213 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3214 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3215 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3216 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3217 const int DotWriteSameDotReadSrcAB = 3;
3218 const int DotWriteDifferentVALURead = 3;
3219 const int DMFMABetweenVALUWriteVMEMRead = 2;
3220 const int MaxWaitStates = 19;
3222 for (
const MachineOperand &Use :
MI->explicit_uses()) {
3228 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
3231 int NeedWaitStates = 0;
3232 if (
DOT->getOpcode() ==
MI->getOpcode()) {
3233 if (&Use - &
MI->getOperand(0) != SrcCIdx)
3234 NeedWaitStates = DotWriteSameDotReadSrcAB;
3236 NeedWaitStates = DotWriteDifferentVALURead;
3239 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3240 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3247 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3248 DGEMMAfterVALUWrite =
false;
3249 if (TRI.isVectorRegister(MRI,
Reg)) {
3250 int WaitStatesNeededForUse =
3251 DMFMABetweenVALUWriteVMEMRead -
3252 getWaitStatesSinceDef(
Reg, IsDGEMMHazard,
3253 DMFMABetweenVALUWriteVMEMRead);
3255 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3260 WaitStatesSinceDef =
3261 getWaitStatesSinceDef(
Reg, IsMFMAWriteFn, MaxWaitStates);
3265 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(
MFMA);
3266 int NumPasses = HazardDefLatency;
3267 int NeedWaitStates = MaxWaitStates;
3270 switch (HazardDefLatency) {
3272 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3273 : DMFMA4x4WriteVgprVALUReadWaitStates;
3279 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3280 : (ST.hasGFX950Insts()
3281 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3282 : DMFMA16x16WriteVgprVALUReadWaitStates);
3287 }
else if (ST.hasGFX940Insts()) {
3291 NumPasses, ST.hasGFX950Insts())
3295 switch (HazardDefLatency) {
3297 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3300 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3303 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3310 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3311 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3313 if (WaitStatesNeeded == MaxWaitStates)
3318 unsigned Opc =
MI->getOpcode();
3319 const int DMFMAToFMA64WaitStates = 2;
3320 if ((
Opc == AMDGPU::V_FMA_F64_e64 ||
3321 Opc == AMDGPU::V_FMAC_F64_e32 ||
Opc == AMDGPU::V_FMAC_F64_e64 ||
3322 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3323 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3324 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3325 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3326 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3329 if (!IsVALU && !IsMemOrExport)
3330 return WaitStatesNeeded;
3332 for (
const MachineOperand &Def :
MI->defs()) {
3333 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3334 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3335 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3336 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3337 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3338 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3339 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3340 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3341 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3342 const int DotWriteDifferentVALUWrite = 3;
3343 const int MaxWaitStates = 19;
3344 const int MaxWarWaitStates = 15;
3349 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
3351 if (DOT &&
DOT->getOpcode() !=
MI->getOpcode())
3352 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3353 WaitStatesSinceDef);
3356 WaitStatesSinceDef =
3357 getWaitStatesSinceDef(
Reg, IsMFMAWriteFn, MaxWaitStates);
3359 int NeedWaitStates = MaxWaitStates;
3360 int NumPasses = TSchedModel.computeInstrLatency(
MFMA);
3363 switch (NumPasses) {
3365 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3369 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3374 }
else if (ST.hasGFX940Insts()) {
3378 NumPasses, ST.hasGFX950Insts())
3381 switch (NumPasses) {
3383 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3386 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3389 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3396 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3397 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3399 if (WaitStatesNeeded == MaxWaitStates)
3403 auto IsSMFMAReadAsCFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
3405 !
MI.readsRegister(
Reg, &TRI))
3408 if (ST.hasGFX940Insts() && !TII.isXDL(
MI))
3411 const MachineOperand *SrcC =
3412 TII.getNamedOperand(
MI, AMDGPU::OpName::src2);
3422 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3427 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(
MFMA);
3428 int NeedWaitStates = MaxWaitStates;
3429 switch (HazardDefLatency) {
3430 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3432 case 4:
assert(ST.hasGFX940Insts());
3433 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3435 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3437 case 16: [[fallthrough]];
3438 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3442 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3443 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3446 return WaitStatesNeeded;
3459 return MAI !=
nullptr;
3463 if (IsMFMAFn(*
MI)) {
3464 int W = getWaitStatesSince(IsMFMAFn, 16);
3466 return W < (int)TSchedModel.computeInstrLatency(MAI);
3480 while (
I->isBundledWithPred())
3486 if (
I->getOpcode() != AMDGPU::S_GETPC_B64)
3490 const unsigned NewBytes = 4;
3492 "Unexpected instruction insertion in bundle");
3495 while (NextMI != End && NextMI->isBundledWithPred()) {
3496 for (
auto &Operand : NextMI->operands()) {
3497 if (Operand.isGlobal())
3498 Operand.setOffset(Operand.getOffset() + NewBytes);
3504bool GCNHazardRecognizer::fixVALUMaskWriteHazard(
MachineInstr *
MI) {
3505 if (!ST.hasVALUMaskWriteHazard())
3507 assert(!ST.hasExtendedWaitCounts());
3514 if (!IsSALU && !IsVALU)
3526 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3527 const MachineRegisterInfo &MRI = MF.getRegInfo();
3532 case AMDGPU::EXEC_LO:
3533 case AMDGPU::EXEC_HI:
3535 case AMDGPU::SGPR_NULL:
3536 case AMDGPU::SGPR_NULL64:
3544 return Reg == AMDGPU::VCC ||
Reg == AMDGPU::VCC_LO ||
Reg == AMDGPU::VCC_HI;
3548 SmallSet<Register, 2> HazardSGPRs;
3550 static unsigned getHashValue(
const StateType &State) {
3553 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
3554 return LHS.HazardSGPRs ==
RHS.HazardSGPRs;
3558 SmallVector<const MachineInstr *> WaitInstrs;
3559 bool HasSGPRRead =
false;
3560 StateType InitialState;
3563 MachineOperand *HazardDef =
nullptr;
3564 for (MachineOperand &
Op :
MI->operands()) {
3567 if (
Op.isDef() && HazardDef)
3571 if (IgnoreableSGPR(
Reg))
3574 if (
Op.isImplicit())
3576 if (!TRI->isSGPRReg(MRI,
Reg))
3594 if (AMDGPU::SReg_32RegClass.
contains(HazardReg)) {
3595 InitialState.HazardSGPRs.insert(HazardReg);
3598 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
3599 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
3602 auto IsHazardFn = [&](StateType &State,
const MachineInstr &
I) {
3603 if (State.HazardSGPRs.empty())
3606 switch (
I.getOpcode()) {
3607 case AMDGPU::V_ADDC_U32_e32:
3608 case AMDGPU::V_ADDC_U32_dpp:
3609 case AMDGPU::V_CNDMASK_B16_t16_e32:
3610 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3611 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3612 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3613 case AMDGPU::V_CNDMASK_B32_e32:
3614 case AMDGPU::V_CNDMASK_B32_dpp:
3615 case AMDGPU::V_DIV_FMAS_F32_e64:
3616 case AMDGPU::V_DIV_FMAS_F64_e64:
3617 case AMDGPU::V_SUBB_U32_e32:
3618 case AMDGPU::V_SUBB_U32_dpp:
3619 case AMDGPU::V_SUBBREV_U32_e32:
3620 case AMDGPU::V_SUBBREV_U32_dpp: {
3624 case AMDGPU::V_ADDC_U32_e64:
3625 case AMDGPU::V_ADDC_U32_e64_dpp:
3626 case AMDGPU::V_CNDMASK_B16_t16_e64:
3627 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3628 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3629 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3630 case AMDGPU::V_CNDMASK_B32_e64:
3631 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3632 case AMDGPU::V_SUBB_U32_e64:
3633 case AMDGPU::V_SUBB_U32_e64_dpp:
3634 case AMDGPU::V_SUBBREV_U32_e64:
3635 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3637 const MachineOperand *SSRCOp = TII.getNamedOperand(
I, AMDGPU::OpName::src2);
3639 bool Result = TRI->regsOverlap(SSRCOp->
getReg(), HazardReg);
3651 auto UpdateStateFn = [&](StateType &State,
const MachineInstr &
I) {
3652 switch (
I.getOpcode()) {
3653 case AMDGPU::S_WAITCNT_DEPCTR:
3655 if (!HasSGPRRead &&
I.getParent() ==
MI->getParent() && !
I.isBundled() &&
3656 (
I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3661 for (
auto &
Op :
I.operands()) {
3666 if (IgnoreableSGPR(
Reg))
3669 if (
Op.isImplicit())
3671 if (!TRI->isSGPRReg(MRI,
Reg))
3682 for (
Register SGPR : State.HazardSGPRs) {
3683 if (
Reg == SGPR || TRI->regsOverlap(
Reg, SGPR))
3687 State.HazardSGPRs.erase(SGPR);
3696 std::next(
MI->getReverseIterator())))
3706 if (!WaitInstrs.
empty()) {
3710 SmallVector<MachineInstr *> ToErase;
3712 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3713 End = MI->getParent()->rend();
3714 Found < WaitInstrs.size() && It != End; ++It) {
3715 MachineInstr *WaitMI = &*It;
3717 if (std::as_const(WaitMI) != WaitInstrs[Found])
3720 unsigned WaitMask = WaitMI->getOperand(0).getImm();
3721 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3722 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3723 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
3724 AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
3725 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3726 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
3727 AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
3728 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3729 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
3730 AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
3731 ToErase.push_back(WaitMI);
3734 for (MachineInstr *WaitMI : ToErase)
3735 WaitMI->eraseFromParent();
3739 auto NextMI = std::next(
MI->getIterator());
3740 auto NewMI =
BuildMI(*
MI->getParent(), NextMI,
MI->getDebugLoc(),
3741 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3753 if (EntryMBB.
begin() != EntryMBB.
end()) {
3754 auto &EntryMI = *EntryMBB.
begin();
3755 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3756 EntryMI.getOperand(0).getImm() >= Priority)
3765bool GCNHazardRecognizer::fixRequiredExportPriority(
MachineInstr *
MI) {
3766 if (!ST.hasRequiredExportPriority())
3771 MachineBasicBlock *
MBB =
MI->getParent();
3784 const int MaxPriority = 3;
3785 const int NormalPriority = 2;
3786 const int PostExportPriority = 0;
3788 auto It =
MI->getIterator();
3789 switch (
MI->getOpcode()) {
3790 case AMDGPU::S_ENDPGM:
3791 case AMDGPU::S_ENDPGM_SAVED:
3792 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3793 case AMDGPU::SI_RETURN_TO_EPILOG:
3796 if (MF->getFrameInfo().hasCalls())
3799 case AMDGPU::S_SETPRIO: {
3801 auto &PrioOp =
MI->getOperand(0);
3802 int Prio = PrioOp.getImm();
3803 bool InWA = (Prio == PostExportPriority) &&
3804 (It !=
MBB->
begin() && TII.isEXP(*std::prev(It)));
3805 if (InWA || Prio >= NormalPriority)
3807 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3811 if (!TII.isEXP(*
MI))
3822 auto NextMI = std::next(It);
3823 bool EndOfShader =
false;
3824 if (NextMI !=
MBB->
end()) {
3826 if (TII.isEXP(*NextMI))
3829 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3830 NextMI->getOperand(0).getImm() == PostExportPriority)
3832 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3839 .
addImm(PostExportPriority);
3843 BuildMI(*
MBB, NextMI,
DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3844 .
addReg(AMDGPU::SGPR_NULL)
3864 const SIInstrInfo *TII = ST.getInstrInfo();
3876 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3881bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(
MachineInstr *
MI) {
3882 if (
MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3885 const SIInstrInfo *TII = ST.getInstrInfo();
3887 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3889 BuildMI(*
MI->getParent(), std::next(
MI->getIterator()),
MI->getDebugLoc(),
3890 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3896bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(
MachineInstr *
MI) {
3899 if (!IsHazardRecognizerMode)
3902 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3903 const SIInstrInfo *TII = ST.getInstrInfo();
3905 const int FlatScrBaseWaitStates = 10;
3907 bool ReadsFlatScrLo =
3908 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3909 bool ReadsFlatScrHi =
3910 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3916 ReadsFlatScrLo =
true;
3919 ReadsFlatScrHi =
true;
3924 const MachineRegisterInfo &MRI = MF.getRegInfo();
3927 DenseSet<const MachineBasicBlock *> Visited;
3929 return MI.modifiesRegister(
Reg, TRI);
3934 auto IsSGPRDef = [TII, TRI, &MRI](
const MachineInstr &
MI) ->
unsigned {
3935 if (!TII->isSALU(
MI) && !TII->isVALU(
MI,
true))
3937 for (
const MachineOperand &MO :
MI.all_defs()) {
3938 if (TRI->isSGPRReg(MRI, MO.getReg()))
3944 auto IsExpiredFn = [=](
const MachineInstr &
MI,
int SgprWrites) {
3945 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3946 unsigned Wait =
MI.getOperand(0).getImm();
3951 return SgprWrites >= FlatScrBaseWaitStates;
3954 return ::getWaitStatesSince(
3955 IsHazardFn,
MI->getParent(), std::next(
MI->getReverseIterator()),
3956 0,
IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3960 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3962 !IsRegDefHazard(AMDGPU::SGPR103)))
3966 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3977 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3978 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static cl::opt< bool > EnableWMMAVnopHoisting("amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden, cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"))
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand Dst that has a dest sel forwarding issue.
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static unsigned getWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, const TargetSchedModel &SchedModel, const GCNSubtarget &ST)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static llvm::Error parse(GsymDataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const uint32_t IV[8]
unsigned get(InstCounterType T) const
BitVector & set()
Set all bits in the bitvector.
std::pair< iterator, bool > insert_as(std::pair< KeyT, ValueT > &&KV, const LookupKeyT &Val)
Alternate version of insert() which allows a different, and possibly less expensive,...
Implements a dense probed hash-table based set.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function...
unsigned getHazardWaitStates(MachineInstr *MI) const
Returns the number of wait states until all hazards for MI are resolved.
unsigned PreEmitNoopsCommon(MachineInstr *) const
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be scheduled.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
bool ShouldPreferAnother(SUnit *SU) const override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
GCNHazardRecognizer(const MachineFunction &MF, MachineLoopInfo *MLI=nullptr)
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class representing physical registers. Should be passed by value.
Instructions::const_reverse_iterator const_reverse_instr_iterator
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction is part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Wrapper class representing virtual and physical registers.
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI, bool AllowLDSDMA)
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isMAI(const MCInstrDesc &Desc)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
unsigned getOccupancy() const
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned getMaxLookAhead() const
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A SetVector that performs no allocations if smaller than a certain size.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Provide an instruction scheduling machine model to CodeGen passes.
std::pair< iterator, bool > insert(const ValueT &V)
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
This namespace contains all of the command line option processing machinery.
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
NodeAddr< UseNode * > Use
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
FunctionAddr VTableAddr Value
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
constexpr RegState getDeadRegState(bool B)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
FunctionAddr VTableAddr Count
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
DWARFExpression::Operation Op
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
An information struct used to provide DenseMap with the various necessary components for a given valu...