struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

// ...
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

// ...
               cl::desc("Insert a s_nop x before every instruction"));
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  // ...
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}
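// Note: MaxLookAhead sizes the EmittedInstrs history kept by this recognizer.
// The 19-slot value when AGPRs are in use presumably matches the longest MFMA
// hazards tracked below (MaxWaitStates of 18-19 in the MAI checks); 5 slots
// cover the ordinary SMRD/VMEM/VALU hazards.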
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
// ...
  return Opcode == AMDGPU::S_GETREG_B32 ||
         Opcode == AMDGPU::S_GETREG_B32_const;
// ...
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
// ...
  return Opcode == AMDGPU::V_READLANE_B32 ||
         Opcode == AMDGPU::V_WRITELANE_B32;
// ...
  return Opcode == AMDGPU::S_RFE_B64;
// ...
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // ...
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
// ...
                                                  AMDGPU::OpName::simm16);
  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (!IsHazardRecognizerMode) {
    if (checkWMMACoexecutionHazards(MI) > 0)
      return HazardType;
  }

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if (/* ... */
      checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (/* ... */
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (/* ... */
      checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    // ...
  }

// ...
unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}
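// Note: the MFMA "pipeline wait states" above are taken from the scheduling
// model: the ReleaseAtCycle of the first write-resource entry of MI's sched
// class, i.e. the number of passes the MFMA occupies. checkMFMAPadding()
// later uses this value to size the optional inter-MFMA padding.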
void GCNHazardRecognizer::processBundle() {
  // ...
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    // ...
    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);
      // ...
    }
    // ...
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);
    // ...
    EmittedInstrs.push_front(CurrCycleInstr);
    // ...
  }
  CurrCycleInstr = nullptr;
}

// ...
  assert(IsHazardRecognizerMode);
  // ...
  if (MI->isInsideBundle())
    // ...
  IsHazardRecognizerMode = true;
  // ...
  CurrCycleInstr = nullptr;
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (/* ... */
      checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (/* ... */
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  // ...
    return std::max(WaitStates, checkMAIHazards(MI));

  // ...
    return std::max(WaitStates, checkMAILdStHazards(MI));

  // ...
    return std::max(WaitStates, checkPermlaneHazards(MI));
    EmittedInstrs.push_front(nullptr);
// ...
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }
  // ...
  if (CurrCycleInstr->isBundle()) {
    // ...
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  EmittedInstrs.push_front(CurrCycleInstr);
  // ...
    EmittedInstrs.push_front(nullptr);
  // ...
  CurrCycleInstr = nullptr;
// ...
  assert(!IsHazardRecognizerMode &&
         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
template <typename StateT>
// ...
    static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
      // ...
    }
// ...
  static inline StateMapKey getEmptyKey() {
    // ...
  }

  static inline StateMapKey getTombstoneKey() {
    // ...
  }

  static unsigned getHashValue(const StateMapKey &Key) {
    return StateT::getHashValue((*Key.States)[Key.Idx]);
  }

  static unsigned getHashValue(const StateT &State) {
    return StateT::getHashValue(State);
  }

  static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
    const auto EKey = getEmptyKey();
    const auto TKey = getTombstoneKey();
    if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
        StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
      return StateMapKey::isEqual(LHS, RHS);
    return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
  }

  static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
    if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
        StateMapKey::isEqual(RHS, getTombstoneKey()))
      return false;
    return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
  }
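// Note: the StateMapKey/DenseMapInfo machinery above lets the state-based
// hazard search below deduplicate visited states: a key indexes into an
// external States vector, and the empty/tombstone sentinels are filtered out
// before the underlying StateT values are hashed or compared.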
  StateT State = InitialState;
  // ...
  unsigned WorkIdx = 0;
  // ...
  bool Expired = false;
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // ...
    auto Result = IsHazard(State, *I);
    // ...
    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }
  // ...
      unsigned StateIdx = States.size();
      StateMapKey Key = {&States, StateIdx};
      auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
      if (Insertion.second) {
        // ...
      } else {
        StateIdx = Insertion.first->second;
      }
      // ...
      Worklist.insert(std::pair(Pred, StateIdx));
  // ...
    if (WorkIdx == Worklist.size())
      break;

    std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
    State = States[StateIdx];
    I = MBB->instr_rbegin();
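// Note: the search above is an iterative worklist walk over the CFG in
// reverse. When a block is exhausted without a decision, each predecessor is
// queued together with the index of the current StateT snapshot, and scanning
// resumes from that predecessor's instr_rbegin() with the saved state.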
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // ...
    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);
    // ...
    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  // ...
    if (!Visited.insert(Pred).second)
      continue;
    // ...
                                 IsExpired, Visited, GetNumWaitStates);
    // ...
    MinWaitStates = std::min(MinWaitStates, W);
  // ...
  return MinWaitStates;
}

// ...
                              std::next(MI->getReverseIterator()), 0, IsExpired,
                              Visited, GetNumWaitStates);
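// Note: when the backwards scan crosses a block boundary, the recursion above
// takes the minimum wait-state count over all predecessors, i.e. the closest
// possible occurrence of the hazard; an expired path contributes INT_MAX and
// therefore never wins the std::min.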
int GCNHazardRecognizer::getWaitStatesSince(
    IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
                                // ...
  }
  // ...
  for (MachineInstr *MI : EmittedInstrs) {
    // ...
      if (MI->isInlineAsm())
        continue;
    // ...
    WaitStates += MI ? GetNumWaitStates(*MI) : 1;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}
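// Note: two lookup modes are visible above. In IsHazardRecognizerMode the
// search walks the MachineFunction via ::getWaitStatesSince with an
// expiration lambda bounded by Limit; otherwise it scans EmittedInstrs, where
// a nullptr entry stands for a single idle wait state (the `MI ? ... : 1`).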
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  // ...
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               // ...
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // ...
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  // ...
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  // ...
// ...
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(static_cast<unsigned>(Unit));
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // ...
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);
  // ...
  for (MachineInstr *MI : EmittedInstrs) {
    // ...
  }

  if (ClauseDefs.none())
    return 0;
  // ...
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
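// Note: ClauseUses/ClauseDefs are register-unit bitvectors sized from
// TRI.getNumRegUnits() in the constructor; a soft-clause break (one wait
// state) is requested only when a definition inside the clause overlaps a
// use, hence the final ClauseDefs.anyCommon(ClauseUses) test.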
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);
  // ...
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;
  // ...
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    // ...
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   // ...
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // ...
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     // ...
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
  // ...
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   // ...
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}
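// A minimal sketch of the wait-state convention used by the checks in this
// file (illustrative only, not part of the original source): each check
// computes
//   WaitStatesNeededForUse = <required distance> - getWaitStatesSinceDef(...)
// so a non-positive result means the producer is already far enough away.
// For example, with VmemSgprWaitStates = 5 and the defining VALU 3 wait
// states back, 5 - 3 = 2 additional wait states must still be inserted.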
int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  // ...
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                // ...
                                [](const MachineInstr &) { return true; },
                                // ...
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      // ...
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                // ...
  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  // ...
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               // ...
  return DivFMasWaitStates - WaitStatesNeeded;
}
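// Note: V_DIV_FMAS implicitly reads VCC, so the check above measures the
// distance to the last VALU that wrote VCC and asks for whatever remains of
// the 4 wait-state requirement.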
int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    // ...
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    // ...
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  // ...
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  // ...
    VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // ...
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // ...
        (!SOffset || !SOffset->isReg()))
      // ...
  }

  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    // ...
                   Desc.operands()[SRsrcIdx])) == 256);
    // ...
  }

  if (TII->isFLAT(MI)) {
    // ...
  }
  // ...
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            // ...
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // ...
  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  // ...
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  // ...
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}
  unsigned Opcode = MI.getOpcode();
  // ...
  if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
    // ...
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  // ...
    if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
        // ...
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  // ...
        (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
         // ...
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  // ...
    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
// ...
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
      // ...
    }
  }
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;
  // ...
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      // ...
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
      // ...
      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }
      // ...
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  // ...

  if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =
          // ...
      if (ProducerMI.isInlineAsm()) {
        // ...
        for (auto &Def : ProducerMI.all_defs()) {
          // ...
        }
      }
      // ...
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;
    // ...
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    // ...
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      // ...
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      // ...
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn,
                             VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
    // ...
    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      // ...
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      // ...
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      // ...
    }
    case AMDGPU::V_WRITELANE_B32: {
      // ...
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      // ...
    }
    // ...
    }
  }
  // ...
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // ...
  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded =
        std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // ...
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       // ...
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
      }
    }
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      // ...
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);
      // ...
      if (ProducerMI.isInlineAsm()) {
        // ...
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            // ...
          }
        }
      }
      // ...
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;
  // ...
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              // ...
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;
  // ...
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}
bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) {
  if (WaitStatesNeeded <= 0)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  for (int I = 0; I < WaitStatesNeeded; ++I)
    // ...
            TII->get(AMDGPU::V_NOP_e32));
  // ...
}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixVALUTransCoexecutionHazards(MI);
  // ...
  emitVNops(MI, checkWMMACoexecutionHazards(MI));
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
  if (ST.requiresWaitIdleBeforeGetReg())
    fixGetRegWaitIdle(MI);
  if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
    fixDsAtomicAsyncBarrierArriveB64(MI);
  if (ST.hasScratchBaseForwardingHazard())
    fixScratchBaseForwardingHazard(MI);
  if (ST.setRegModeNeedsVNOPs())
    // ...
}
  return (TII.isVOPC(MI) ||
          (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
         MI.modifiesRegister(AMDGPU::EXEC, &TRI);
// ...
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  // ...
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // ...
    unsigned Opc = MI.getOpcode();
    // ...
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  // ...
      std::numeric_limits<int>::max())
    // ...
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  // ...
  bool IsUndef = Src0->isUndef();
  // ...
          TII->get(AMDGPU::V_MOV_B32_e32))
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());
  // ...
  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // ...
    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      // ...
    }
  // ...
        (MI.getOpcode() == AMDGPU::S_WAITCNT &&
         !MI.getOperand(0).getImm()) ||
        (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         // ...
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  // ...
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  // ...
}
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());
  // ...
  AMDGPU::OpName SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // ...
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  // ...
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() &&
          TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        // ...
      }
    }
  // ...
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    // ...
  };
  // ...
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // ...
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // ...
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        // ...
        return (Decoded.DsCnt == 0);
      }
      // ...
                MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
               "unexpected wait count instruction");
      }
    }
    // ...
    if (TII->isSOPP(MI))
      // ...
  // ...
      std::numeric_limits<int>::max())
    return false;
  // ...
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
  // ...
}
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());
  // ...
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;
  // ...
    return I.readsRegister(AMDGPU::EXEC, TRI);
  // ...
  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    // ...
    if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
      return true;
    for (auto MO : MI.implicit_operands())
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
        return true;
    // ...
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        // ...
  };
  // ...
      std::numeric_limits<int>::max())
    return false;
  // ...
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  // ...
}
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;
  // ...
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      // ...
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
// ...
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());
  // ...
  auto IsHazardInst = [](const MachineInstr &MI) {
    // ...
  };
  // ...
  auto InstType = IsHazardInst(*MI);
  // ...
  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    // ...
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    // ...
    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;
      // ...
    };
    // ...
           std::numeric_limits<int>::max();
  };
  // ...
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  // ...
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
  // ...
}
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  // ...
  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  // ...
  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    // ...
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      // ...
  };
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    // ...
  };

  DenseSet<const MachineBasicBlock *> Visited;
  // ...
                                   std::next(MI->getReverseIterator()), 0,
                                   // ...
  // ...
  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
  // ...
}
bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  // ...
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  // ...
  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    // ...
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
  // ...
  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
    // ...
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            // ...
            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
  };
  // ...
      std::numeric_limits<int>::max())
    return false;

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
  } else {
    // ...
            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
    // ...
  }
  // ...
}
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.hasVALUPartialForwardingHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());
  // ...
  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }
  // ...
  if (SrcVGPRs.size() <= 1)
    return false;
  // ...
  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  struct StateType {
    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();
    // ...
    static unsigned getHashValue(const StateType &State) {
      // ...
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
             // ...
    }
  };
  // ...
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // ...
    if (State.VALUs > NoHazardVALUWaitStates)
      // ...
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         // ...
      if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
        State.DefPos[Src] = State.VALUs;
        // ...
      }
    // ...
    if (State.ExecPos == std::numeric_limits<int>::max()) {
      if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
        State.ExecPos = State.VALUs;
        // ...
      }
    }
    // ...
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      // ...
    if (State.ExecPos == std::numeric_limits<int>::max())
      // ...
    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else
          PostExecPos = std::min(PostExecPos, DefVALUs);
      }
    }
    // ...
    if (PostExecPos == std::numeric_limits<int>::max())
      // ...
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      // ...
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      // ...
    if (PreExecPos == std::numeric_limits<int>::max())
      // ...
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      // ...
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      // ...
  };

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    // ...
  };
  // ...
                      std::next(MI->getReverseIterator())))
    // ...
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
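// Note: unlike the simple distance checks, the partial-forwarding fix above
// is a state-machine search (DefPos per source VGPR plus the position of the
// last EXEC write); when the Intv1/Intv2/Intv3 window pattern matches, an
// S_WAITCNT_DEPCTR is inserted before MI to break the forwarding.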
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());
  // ...
  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }
  // ...
  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  struct StateType {
    // ...
    static unsigned getHashValue(const StateType &State) {
      // ...
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
    }
  };
  // ...
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // ...
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      // ...
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         // ...
      if (I.modifiesRegister(Src, &TRI)) {
        // ...
      }
    // ...
  };

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    // ...
  };
  // ...
                      std::next(MI->getReverseIterator())))
    // ...
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
  if (!ST.hasGFX1250Insts() || /* ... */)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
    // ...
    Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
      if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
        return true;
    }

    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())
      return false;
    // ...
    Register ValuDef = ValuDst->getReg();
    for (const MachineOperand &TransUse : I.explicit_uses()) {
      if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
        return true;
    }
    // ...
  };
  // ...
  const int HasVALU = std::numeric_limits<int>::max();
  if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  return true;
}
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
    // ...
    Register CurSrc0Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    Register CurSrc1Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
    Register PrevDstReg =
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    // ...
    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
      return true;
    }
    // ...
    Register CurIndex =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
    if (TRI->regsOverlap(PrevDstReg, CurIndex))
      return true;
    // ...
  };
  // ...
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  // ...
// ...
                                unsigned Category) {
  // ...
         "Handle me if the xdl wmma instruction latency changes");
// ...
int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
  if (!ST.hasGFX1250Insts())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  // ...
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // ...
  const int WMMAWaitStates[] = {5, 9, 3, 5};
  const int VALUWaitStates[] = {4, 8, 2, 4};
  unsigned Category = 0;

  auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))
      return false;

    unsigned Latency = TSchedModel.computeInstrLatency(&I);
    // ...
    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
    // ...
    if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
      return true;
    // ...
    Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
    if (TRI->regsOverlap(D0, Idx1))
      return true;
    // ...
  };

  auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))
      return false;

    unsigned Latency = TSchedModel.computeInstrLatency(&I);
    // ...
    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
      if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
        return true;
    }

    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())
      return false;
    // ...
    if (TRI->regsOverlap(D0, D1))
      return true;
    // ...
    Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
    Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
    if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
      return true;
    // ...
    Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
    if (TRI->regsOverlap(D1, Idx0))
      return true;
    // ...
  };
  // ...
  auto GetWaitStatesFn = [](const MachineInstr &I) {
    // ...
  };
  // ...
  int WaitStatesNeeded = -1;
  if (TII->isXDLWMMA(*MI)) {
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = WMMAWaitStates[Category];
      WaitStatesNeeded =
          Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
    }
  } else {
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = VALUWaitStates[Category];
      WaitStatesNeeded =
          Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
    }
  }

  return WaitStatesNeeded;
}
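// Note: Category indexes the WMMAWaitStates/VALUWaitStates tables above. Each
// loop retries the distance check with that category's limit until one
// category reports a positive requirement or all four are exhausted; a
// non-positive result means no extra wait states are needed.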
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
  // ...
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // ...
  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
    return false;
  // ...
  assert(ST.needsAlignedVGPRs());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
  // ...
  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
  // ...
  Register DstReg = MI->getOperand(0).getReg();
  // ...
  Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
  // ...
  bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);
  // ...
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
      // ...
    }
  }
  // ...
  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
                               /* ... */;
  // ...
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
  // ...
  runOnInstruction(
      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
      // ...
  // ...
  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
          // ...
  // ...
  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
          // ...
  // ...
  MI->getOperand(0).setReg(NewReg);
  // ...
}
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;
  // ...
  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  // ...
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  // ...
  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}
int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (!ST.hasFPAtomicToDenormModeHazard())
    return 0;
  assert(!ST.hasExtendedWaitCounts());

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;
  // ...
  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
    // ...
  };

  return FPAtomicToDenormModeWaitStates -
         // ...
}
int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
}

int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
  // ...
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // ...
  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
                            this](const MachineInstr &MI) {
    // ...
    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
    return true;
  };

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  int NeighborMFMAPaddingNeeded =
      // ...
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
}
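// Note: this is the feature described by the cl::desc string at the top of
// the file ("Fill a percentage of the latency between neighboring MFMA with
// s_nops"): the target distance is a fraction of the neighboring MFMA's
// pipeline latency (via getMFMAPipelineWaitStates), minus the wait states
// that have already elapsed, clamped at zero.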
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [](const MachineInstr &MI) {
    // ...
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      // ...
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   // ...
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      // ...
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      // ...
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
// ...
  return NumPasses + 1 + IsGFX950;
// ...
  return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
// ...
  return NumPasses + 2;
// ...
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
// ...
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsLegacyVALUFn = [](const MachineInstr &MI) {
    // ...
  };

  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
    // ...
  };
  // ...
    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  // ...
  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    const int MaxWaitStates = 19;
    // ...
    const MachineInstr *MI1;

    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {
      // ...
      FullReg = (DstReg == Reg);
      // ...
      return TRI.regsOverlap(DstReg, Reg);
    };

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates =
        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = Use.getOperandNo();
    // ...
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      // ...
    } else if (FullReg) {
      if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
          (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
        NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
      else if (ST.hasGFX940Insts() &&
               TSchedModel.computeInstrLatency(MI1) == 2)
        NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
    }
    // ...
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        if (!TII.isXDL(*MI))
          NeedWaitStates =
              ST.hasGFX950Insts()
                  ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
                  : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        if (!TII.isXDL(*MI))
          NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
        break;
      default: {
        int NumPasses = TSchedModel.computeInstrLatency(MI1);
        if (ST.hasGFX940Insts()) {
          if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
            break;
          // ...
                            NumPasses, ST.hasGFX950Insts())
          // ...
                            NumPasses, ST.hasGFX950Insts()))
          // ...
        }

        switch (NumPasses) {
        case 2:
          NeedWaitStates =
              /* ... */ ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                        : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
          break;
        case 8:
          NeedWaitStates =
              /* ... */ ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                        : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
          break;
        case 16:
          NeedWaitStates =
              /* ... */ ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                        : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
          break;
        // ...
        }
      }
    // ...
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates =
            ST.hasGFX950Insts()
                ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
                : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default: {
        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        if (ST.hasGFX940Insts()) {
          // ...
                            NumPasses, ST.hasGFX950Insts())
          // ...
        }

        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
          break;
        // ...
        }
      }
    // ...
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }
  // ...
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  // ...
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [](const MachineInstr &MI) {
        // ...
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
  assert(!ST.hasVcmpxPermlaneHazard() &&
         "this is a different vcmpx+permlane hazard");
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
    // ...
  };

  auto IsVALUFn = [](const MachineInstr &MI) {
    // ...
  };

  const int VCmpXWritesExecWaitStates = 4;
  const int VALUWritesVDstWaitStates = 2;
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();
    int WaitStatesSinceDef =
        VALUWritesVDstWaitStates -
        getWaitStatesSinceDef(Reg, IsVALUFn,
                              VALUWritesVDstWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
    if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
      break;
  }

  int VCmpXHazardWaits =
      VCmpXWritesExecWaitStates -
      getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);

  WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
  return WaitStatesNeeded;
}
// ...
  return NumPasses + 2;
// ...
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
// ...
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
// ...
  return NumPasses + 2;
// ...
  if (!ST.hasGFX90AInsts())
    return 0;

  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
    // ...
  };
  // ...
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  int WaitStatesNeeded = 0;
  // ...
  const MachineInstr *MFMA = nullptr;
  // ...
  auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
    // ...
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    // ...
  };

  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    // ...
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    // ...
  };
  // ...
  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
    // ...
      DGEMMAfterVALUWrite = true;
      // ...
    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
      return false;
    // ...
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);
  // ...
  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      // ...
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     // ...
      // ...
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      // ...
      if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
        DGEMMAfterVALUWrite = false;
        // ...
        if (TRI.isVectorRegister(MRI, Reg)) {
          int WaitStatesNeededForUse =
              DMFMABetweenVALUWriteVMEMRead -
              getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                    DMFMABetweenVALUWriteVMEMRead);

          WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
        }
      }
      // ...
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
      // ...
      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;
      // ...
      if (/* ... */) {
        switch (HazardDefLatency) {
        // ...
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
          break;
        // ...
          NeedWaitStates =
              IsMemOrExport
                  ? DMFMA16x16WriteVgprMemExpReadWaitStates
                  : (ST.hasGFX950Insts()
                         ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
                         : DMFMA16x16WriteVgprVALUReadWaitStates);
          break;
        // ...
        }
      } else if (ST.hasGFX940Insts()) {
        // ...
                          NumPasses, ST.hasGFX950Insts())
        // ...
      } else {
        switch (HazardDefLatency) {
        // ...
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          break;
        // ...
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          break;
        // ...
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
          break;
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }

  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(*MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
                      NumPasses, ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
          [[fallthrough]];
        default:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !TII.isXDL(MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
             break;
    case 4:  assert(ST.hasGFX940Insts());
             NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
             break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
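
// Prefer scheduling another candidate if this one is an MFMA that would issue
// while a previously emitted MFMA is still within its latency window.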
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;
  const MachineInstr *MAI = nullptr;
  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMFMA(MI))
      MAI = &MI;
    return MAI != nullptr;
  };
  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }
  return false;
}
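
// A new instruction inserted into an S_GETPC_B64 bundle shifts the program
// counter captured by s_getpc_b64, so global-address operands later in the
// bundle must have their offsets advanced by the size of the insertion.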
static void updateGetPCBundle(MachineInstr *NewMI) {
  if (!NewMI->isBundled())
    return;
  // Find the start of the bundle.
  auto I = NewMI->getIterator();
  while (I->isBundledWithPred())
    I--;
  if (I->isBundle())
    I++;
  // Bail if this is not an S_GETPC bundle.
  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
    return;
  // Update offsets of any global references in the rest of the bundle.
  const unsigned NewBytes = 4;
  assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         "Unexpected instruction insertion in bundle");
  auto NextMI = std::next(NewMI->getIterator());
  auto End = NewMI->getParent()->end();
  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
    }
    ++NextMI;
  }
}
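
// An SGPR or VCC mask written by this SALU/VALU instruction and then consumed
// as a carry/mask source (v_cndmask, v_addc, v_subb, v_div_fmas, ...) requires
// an s_waitcnt_depctr; existing compatible waits are merged into the one
// inserted after the write.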
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  bool IsSALU = SIInstrInfo::isSALU(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);
  if (!IsSALU && !IsVALU)
    return false;

  // ...

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  auto IgnoreableSGPR = [](const Register Reg) {
    switch (Reg) {
    // ...
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    // ...
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
      return true;
    default:
      return false;
    }
  };
  auto IsVCC = [](const Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO ||
           Reg == AMDGPU::VCC_HI;
  };

  struct StateType {
    SmallSet<Register, 2> HazardSGPRs;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine_range(State.HazardSGPRs.begin(),
                                State.HazardSGPRs.end());
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.HazardSGPRs == RHS.HazardSGPRs;
    }
  };

  SmallVector<const MachineInstr *> WaitInstrs;
  bool HasSGPRRead = false;
  StateType InitialState;

  // Look for the SGPR/VCC write that creates the hazard.
  MachineOperand *HazardDef = nullptr;
  for (MachineOperand &Op : MI->operands()) {
    if (!Op.isReg())
      continue;
    Register Reg = Op.getReg();
    if (Op.isDef() && HazardDef)
      return false;
    if (IgnoreableSGPR(Reg))
      continue;
    if (Op.isImplicit())
      continue;
    if (!TRI->isSGPRReg(MRI, Reg))
      continue;
    // ...
    if (Op.isDef())
      HazardDef = &Op;
  }

  if (!HazardDef)
    return false;

  // Track writes to the individual hazard SGPRs.
  Register HazardReg = HazardDef->getReg();
  if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
    InitialState.HazardSGPRs.insert(HazardReg);
  } else {
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
  }

  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
    if (State.HazardSGPRs.empty())
      return HazardExpired;

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp: {
      // These read the mask implicitly through VCC.
      return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
    }
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only the explicit carry/mask source (src2) matters here.
      const MachineOperand *SSRCOp =
          TII.getNamedOperand(I, AMDGPU::OpName::src2);
      bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
      return Result ? HazardFound : NoHazardFound;
    }
    default:
      return NoHazardFound;
    }
  };

  // Depctr mask bits that a mergeable wait must already have set, and the
  // immediate for the new wait, are computed from the hazard register.
  // ...

  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::S_WAITCNT_DEPCTR:
      // Record waits within a region of instructions free of SGPR reads.
      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
        WaitInstrs.push_back(&I);
      break;
    default:
      // Update tracking of SGPR reads and writes.
      for (auto &Op : I.operands()) {
        if (!Op.isReg())
          continue;
        Register Reg = Op.getReg();
        if (IgnoreableSGPR(Reg))
          continue;
        if (Op.isImplicit())
          continue;
        if (!TRI->isSGPRReg(MRI, Reg))
          continue;
        if (Op.isUse()) {
          HasSGPRRead = true;
          continue;
        }
        // A write to a tracked SGPR resolves that part of the hazard.
        for (Register SGPR : State.HazardSGPRs) {
          if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR)) {
            // ...
            State.HazardSGPRs.erase(SGPR);
            break;
          }
        }
      }
      break;
    }
  };

  if (!hasHazard(InitialState, IsHazardFn, UpdateStateFn, MI->getParent(),
                 std::next(MI->getReverseIterator())))
    return false;

  // Merge any existing compatible waits into the new wait's mask.
  if (!WaitInstrs.empty()) {
    SmallVector<MachineInstr *> ToErase;
    unsigned Found = 0;
    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
                                             End = MI->getParent()->rend();
         Found < WaitInstrs.size() && It != End; ++It) {
      MachineInstr *WaitMI = &*It;
      if (std::as_const(WaitMI) != WaitInstrs[Found])
        continue;
      ++Found;
      unsigned WaitMask = WaitMI->getOperand(0).getImm();
      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
      ToErase.push_back(WaitMI);
    }
    for (MachineInstr *WaitMI : ToErase)
      WaitMI->eraseFromParent();
  }

  // Emit the required wait immediately after MI.
  auto NextMI = std::next(MI->getIterator());
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(DepCtr);
  updateGetPCBundle(NewMI);

  return true;
}
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
                               const SIInstrInfo &TII) {
  MachineBasicBlock &EntryMBB = MF->front();
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
  }

  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
      .addImm(Priority);
  return true;
}
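
// On subtargets that require a specific export priority, keep normal code at
// raised priority (s_setprio >= 2) and drop to priority 0 right after the
// last export of a sequence, separated by an export-count wait.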
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  switch (MF->getFunction().getCallingConv()) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_KERNEL:
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure a shader with calls raises priority at entry so that callees
    // containing exports run at the expected priority.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise any low priority outside the post-export workaround window.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Check entry priority at each export (there will only be a few).
  bool Changed = false;
  if (MBB->isEntryBlock())
    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only need the workaround at the end of a sequence of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // An S_SETPRIO 0 right after the export means it was already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower priority after the export.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for the exports to complete, then return to normal priority.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
    // ...
  }

  return true;
}
  // ...
  const SIInstrInfo *TII = ST.getInstrInfo();
  // ...
  BuildMI(/* ... */, TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(/* ... */);
  // ...
bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
    return false;

  // Bracket the barrier-arrive with dependency counter waits.
  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(/* ... */);
  BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(/* ... */);
  return true;
}
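
// Reads of SRC_FLAT_SCRATCH_BASE_LO/HI too soon after an SALU/VALU write of
// the backing SGPRs (SGPR102/SGPR103) hit a forwarding hazard; insert an
// s_waitcnt_depctr unless enough SGPR-writing instructions are in between.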
bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  // ...
  if (!IsHazardRecognizerMode)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  const int FlatScrBaseWaitStates = 10;

  bool ReadsFlatScrLo =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
  bool ReadsFlatScrHi =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
  // ...
  if (/* ... */)
    ReadsFlatScrLo = true;
  if (/* ... */)
    ReadsFlatScrHi = true;
  // ...

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  DenseSet<const MachineBasicBlock *> Visited;

  auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
    if (!TII->isSALU(MI) && !TII->isVALU(MI))
      return 0;
    for (const MachineOperand &MO : MI.all_defs()) {
      if (TRI->isSGPRReg(MRI, MO.getReg()))
        return 1;
    }
    return 0;
  };

  auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
      unsigned Wait = MI.getOperand(0).getImm();
      // ...
    }
    return SgprWrites >= FlatScrBaseWaitStates;
  };

  auto IsRegDefHazard = [&](MCRegister Reg) {
    auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
      return MI.modifiesRegister(Reg, TRI);
    };
    return ::getWaitStatesSince(
               IsHazardFn, MI->getParent(),
               std::next(MI->getReverseIterator()), 0, IsExpiredFn, Visited,
               IsSGPRDef) < FlatScrBaseWaitStates;
  };

  // Only a recent SALU/VALU write of SGPR102/SGPR103 creates the hazard.
  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
       !IsRegDefHazard(AMDGPU::SGPR102)) &&
      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
       !IsRegDefHazard(AMDGPU::SGPR103)))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(/* ... */);
  return true;
}
  // ...
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
  // ...