26struct MFMAPaddingRatioParser :
public cl::parser<unsigned> {
27 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(
O) {}
29 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg,
unsigned &
Value) {
31 return O.error(
"'" + Arg +
"' value invalid for uint argument!");
34 return O.error(
"'" + Arg +
"' value must be in the range [0, 100]!");
44 cl::desc(
"Fill a percentage of the latency between "
45 "neighboring MFMA with s_nops."));
50 cl::desc(
"Insert a s_nop x before every instruction"));
60 : IsHazardRecognizerMode(
false), CurrCycleInstr(nullptr), MF(MF),
61 ST(MF.getSubtarget<
GCNSubtarget>()), TII(*ST.getInstrInfo()),
62 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
64 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
69 EmittedInstrs.clear();
81 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
85 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
90 case AMDGPU::S_SETREG_B32:
91 case AMDGPU::S_SETREG_B32_mode:
92 case AMDGPU::S_SETREG_IMM32_B32:
93 case AMDGPU::S_SETREG_IMM32_B32_mode:
100 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
104 return Opcode == AMDGPU::S_RFE_B64;
109 case AMDGPU::S_MOVRELS_B32:
110 case AMDGPU::S_MOVRELS_B64:
111 case AMDGPU::S_MOVRELD_B32:
112 case AMDGPU::S_MOVRELD_B64:
121 if (
TII.isAlwaysGDS(
MI.getOpcode()))
124 switch (
MI.getOpcode()) {
125 case AMDGPU::S_SENDMSG:
126 case AMDGPU::S_SENDMSGHALT:
127 case AMDGPU::S_TTRACEDATA:
131 case AMDGPU::DS_PERMUTE_B32:
132 case AMDGPU::DS_BPERMUTE_B32:
135 if (
TII.isDS(
MI.getOpcode())) {
136 int GDS = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
137 AMDGPU::OpName::gds);
138 if (
MI.getOperand(GDS).getImm())
146 unsigned Opcode =
MI.getOpcode();
147 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
148 Opcode == AMDGPU::V_PERMLANE64_B32 ||
149 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
150 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
151 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
152 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
153 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
154 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
155 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
156 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
157 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
158 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
159 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
160 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
170 AMDGPU::OpName::simm16);
187 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(
MI) > 0)
190 if (checkFPAtomicToDenormModeHazard(
MI) > 0)
193 if (ST.hasNoDataDepHazard())
205 if (
isDivFMas(
MI->getOpcode()) && checkDivFMasHazards(
MI) > 0)
208 if (
isRWLane(
MI->getOpcode()) && checkRWLaneHazards(
MI) > 0)
213 checkMAIVALUHazards(
MI) > 0)
216 if (
isSGetReg(
MI->getOpcode()) && checkGetRegHazards(
MI) > 0)
219 if (
isSSetReg(
MI->getOpcode()) && checkSetRegHazards(
MI) > 0)
222 if (
isRFE(
MI->getOpcode()) && checkRFEHazards(
MI) > 0)
225 if (((ST.hasReadM0MovRelInterpHazard() &&
227 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
228 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
230 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
231 (ST.hasReadM0LdsDirectHazard() &&
232 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr))) &&
233 checkReadM0Hazards(
MI) > 0)
240 checkMAILdStHazards(
MI) > 0)
243 if (
MI->isInlineAsm() && checkInlineAsmHazards(
MI) > 0)
251 while (Quantity > 0) {
252 unsigned Arg = std::min(Quantity, 8u);
260GCNHazardRecognizer::getMFMAPipelineWaitStates(
const MachineInstr &
MI)
const {
261 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&
MI);
262 assert(TSchedModel.getWriteProcResBegin(SC) !=
263 TSchedModel.getWriteProcResEnd(SC));
264 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
267void GCNHazardRecognizer::processBundle() {
271 for (;
MI !=
E &&
MI->isInsideBundle(); ++
MI) {
272 CurrCycleInstr = &*
MI;
275 if (IsHazardRecognizerMode) {
276 fixHazards(CurrCycleInstr);
284 for (
unsigned i = 0, e = std::min(WaitStates,
MaxLookAhead - 1); i <
e; ++i)
285 EmittedInstrs.push_front(
nullptr);
287 EmittedInstrs.push_front(CurrCycleInstr);
290 CurrCycleInstr =
nullptr;
294 assert(IsHazardRecognizerMode);
298 if (
MI->isInsideBundle())
308 IsHazardRecognizerMode =
true;
312 CurrCycleInstr =
nullptr;
323 return std::max(WaitStates, checkSMRDHazards(
MI));
325 if (ST.hasNSAtoVMEMBug())
326 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(
MI));
328 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(
MI));
330 if (ST.hasNoDataDepHazard())
334 WaitStates = std::max(WaitStates, checkVMEMHazards(
MI));
337 WaitStates = std::max(WaitStates, checkVALUHazards(
MI));
340 WaitStates = std::max(WaitStates, checkDPPHazards(
MI));
343 WaitStates = std::max(WaitStates, checkDivFMasHazards(
MI));
346 WaitStates = std::max(WaitStates, checkRWLaneHazards(
MI));
350 checkMAIVALUHazards(
MI) > 0)
351 WaitStates = std::max(WaitStates, checkMAIVALUHazards(
MI));
353 if (
MI->isInlineAsm())
354 return std::max(WaitStates, checkInlineAsmHazards(
MI));
357 return std::max(WaitStates, checkGetRegHazards(
MI));
360 return std::max(WaitStates, checkSetRegHazards(
MI));
363 return std::max(WaitStates, checkRFEHazards(
MI));
365 if ((ST.hasReadM0MovRelInterpHazard() &&
367 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
368 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
370 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
371 (ST.hasReadM0LdsDirectHazard() &&
372 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr)))
373 return std::max(WaitStates, checkReadM0Hazards(
MI));
376 return std::max(WaitStates, checkMAIHazards(
MI));
379 return std::max(WaitStates, checkMAILdStHazards(
MI));
382 return std::max(WaitStates, checkPermlaneHazards(
MI));
388 EmittedInstrs.push_front(
nullptr);
394 if (!CurrCycleInstr) {
395 EmittedInstrs.push_front(
nullptr);
399 if (CurrCycleInstr->isBundle()) {
404 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
405 if (!NumWaitStates) {
406 CurrCycleInstr =
nullptr;
411 EmittedInstrs.push_front(CurrCycleInstr);
418 EmittedInstrs.push_front(
nullptr);
426 CurrCycleInstr =
nullptr;
430 assert(!IsHazardRecognizerMode &&
431 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
444template <
typename StateT>
454 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
459 static inline StateMapKey getEmptyKey() {
464 static inline StateMapKey getTombstoneKey() {
469 static unsigned getHashValue(
const StateMapKey &
Key) {
470 return StateT::getHashValue((*
Key.States)[
Key.Idx]);
472 static unsigned getHashValue(
const StateT &State) {
473 return StateT::getHashValue(State);
475 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
476 const auto EKey = getEmptyKey();
477 const auto TKey = getTombstoneKey();
478 if (StateMapKey::isEqual(
LHS, EKey) || StateMapKey::isEqual(
RHS, EKey) ||
479 StateMapKey::isEqual(
LHS, TKey) || StateMapKey::isEqual(
RHS, TKey))
480 return StateMapKey::isEqual(
LHS,
RHS);
481 return StateT::isEqual((*
LHS.States)[
LHS.Idx], (*
RHS.States)[
RHS.Idx]);
483 static bool isEqual(
const StateT &
LHS,
const StateMapKey &
RHS) {
484 if (StateMapKey::isEqual(
RHS, getEmptyKey()) ||
485 StateMapKey::isEqual(
RHS, getTombstoneKey()))
487 return StateT::isEqual(
LHS, (*
RHS.States)[
RHS.Idx]);
496 StateT State = InitialState;
499 unsigned WorkIdx = 0;
501 bool Expired =
false;
502 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
507 auto Result = IsHazard(State, *
I);
515 if (
I->isInlineAsm() ||
I->isMetaInstruction())
518 UpdateState(State, *
I);
522 unsigned StateIdx = States.
size();
523 StateMapKey
Key = {&States, StateIdx};
524 auto Insertion = StateMap.
insert_as(std::pair(
Key, StateIdx), State);
525 if (Insertion.second) {
528 StateIdx = Insertion.first->second;
531 Worklist.
insert(std::pair(Pred, StateIdx));
534 if (WorkIdx == Worklist.
size())
538 std::tie(
MBB, StateIdx) = Worklist[WorkIdx++];
539 State = States[StateIdx];
540 I =
MBB->instr_rbegin();
554 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
562 if (
I->isInlineAsm())
565 WaitStates += GetNumWaitStates(*
I);
567 if (IsExpired(*
I, WaitStates))
568 return std::numeric_limits<int>::max();
571 int MinWaitStates = std::numeric_limits<int>::max();
573 if (!Visited.
insert(Pred).second)
577 IsExpired, Visited, GetNumWaitStates);
579 MinWaitStates = std::min(MinWaitStates, W);
582 return MinWaitStates;
589 std::next(
MI->getReverseIterator()), 0, IsExpired,
593int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
int Limit) {
594 if (IsHazardRecognizerMode) {
595 auto IsExpiredFn = [Limit](
const MachineInstr &,
int WaitStates) {
596 return WaitStates >= Limit;
598 return ::getWaitStatesSince(IsHazard, CurrCycleInstr,
IsExpiredFn);
602 for (MachineInstr *
MI : EmittedInstrs) {
607 if (
MI->isInlineAsm())
612 if (WaitStates >= Limit)
615 return std::numeric_limits<int>::max();
618int GCNHazardRecognizer::getWaitStatesSinceDef(
unsigned Reg,
619 IsHazardFn IsHazardDef,
621 const SIRegisterInfo *TRI = ST.getRegisterInfo();
624 return IsHazardDef(
MI) &&
MI.modifiesRegister(
Reg, TRI);
630int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
645 for (MCRegUnit Unit :
TRI.regunits(
Reg))
646 BV.
set(
static_cast<unsigned>(Unit));
670int GCNHazardRecognizer::checkSoftClauseHazards(
MachineInstr *MEM) {
673 if (!ST.isXNACKEnabled())
676 bool IsSMRD = TII.isSMRD(*MEM);
690 for (MachineInstr *
MI : EmittedInstrs) {
702 if (ClauseDefs.none())
715 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
718int GCNHazardRecognizer::checkSMRDHazards(
MachineInstr *SMRD) {
719 int WaitStatesNeeded = 0;
721 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
724 if (!ST.hasSMRDReadVALUDefHazard())
725 return WaitStatesNeeded;
729 int SmrdSgprWaitStates = 4;
730 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
731 return TII.isVALU(
MI);
733 auto IsBufferHazardDefFn = [
this](
const MachineInstr &
MI) {
734 return TII.isSALU(
MI);
737 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
739 for (
const MachineOperand &Use :
SMRD->uses()) {
742 int WaitStatesNeededForUse =
743 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
745 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
755 int WaitStatesNeededForUse =
756 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(),
759 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
763 return WaitStatesNeeded;
766int GCNHazardRecognizer::checkVMEMHazards(
MachineInstr* VMEM) {
767 if (!ST.hasVMEMReadSGPRVALUDefHazard())
770 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
774 const int VmemSgprWaitStates = 5;
775 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
776 return TII.isVALU(
MI);
778 for (
const MachineOperand &Use : VMEM->uses()) {
779 if (!
Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(),
Use.getReg()))
782 int WaitStatesNeededForUse =
783 VmemSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
785 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
787 return WaitStatesNeeded;
791 const SIRegisterInfo *TRI = ST.getRegisterInfo();
792 const SIInstrInfo *TII = ST.getInstrInfo();
795 int DppVgprWaitStates = 2;
796 int DppExecWaitStates = 5;
797 int WaitStatesNeeded = 0;
798 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
799 return TII->isVALU(
MI);
802 for (
const MachineOperand &Use :
DPP->uses()) {
803 if (!
Use.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Use.getReg()))
805 int WaitStatesNeededForUse =
806 DppVgprWaitStates - getWaitStatesSinceDef(
808 [](
const MachineInstr &) { return true; },
810 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
813 WaitStatesNeeded = std::max(
815 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
818 return WaitStatesNeeded;
821int GCNHazardRecognizer::checkDivFMasHazards(
MachineInstr *DivFMas) {
822 const SIInstrInfo *TII = ST.getInstrInfo();
826 const int DivFMasWaitStates = 4;
827 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
828 return TII->isVALU(
MI);
830 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
833 return DivFMasWaitStates - WaitStatesNeeded;
836int GCNHazardRecognizer::checkGetRegHazards(
MachineInstr *GetRegInstr) {
837 const SIInstrInfo *TII = ST.getInstrInfo();
838 unsigned GetRegHWReg =
getHWReg(TII, *GetRegInstr);
840 const int GetRegWaitStates = 2;
841 auto IsHazardFn = [TII, GetRegHWReg](
const MachineInstr &
MI) {
844 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, GetRegWaitStates);
846 return GetRegWaitStates - WaitStatesNeeded;
849int GCNHazardRecognizer::checkSetRegHazards(
MachineInstr *SetRegInstr) {
850 const SIInstrInfo *TII = ST.getInstrInfo();
851 unsigned HWReg =
getHWReg(TII, *SetRegInstr);
853 const int SetRegWaitStates = ST.getSetRegWaitStates();
854 auto IsHazardFn = [TII, HWReg](
const MachineInstr &
MI) {
857 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, SetRegWaitStates);
858 return SetRegWaitStates - WaitStatesNeeded;
861int GCNHazardRecognizer::createsVALUHazard(
const MachineInstr &
MI) {
865 const SIInstrInfo *TII = ST.getInstrInfo();
866 unsigned Opcode =
MI.getOpcode();
867 const MCInstrDesc &
Desc =
MI.getDesc();
869 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
872 VDataRCID = TII->getOpRegClassID(
Desc.operands()[VDataIdx]);
874 if (TII->isMUBUF(
MI) || TII->isMTBUF(
MI)) {
881 const MachineOperand *SOffset =
882 TII->getNamedOperand(
MI, AMDGPU::OpName::soffset);
886 (!SOffset || !SOffset->
isReg()))
894 if (TII->isMIMG(
MI)) {
895 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
897 Desc.operands()[SRsrcIdx])) == 256);
901 if (TII->isFLAT(
MI)) {
914GCNHazardRecognizer::checkVALUHazardsHelper(
const MachineOperand &Def,
918 const SIRegisterInfo *TRI = ST.getRegisterInfo();
920 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
921 int WaitStatesNeeded = 0;
923 if (!TRI->isVectorRegister(
MRI,
Def.getReg()))
924 return WaitStatesNeeded;
927 int DataIdx = createsVALUHazard(
MI);
928 return DataIdx >= 0 &&
929 TRI->regsOverlap(
MI.getOperand(DataIdx).getReg(),
Reg);
932 int WaitStatesNeededForDef =
933 VALUWaitStates - getWaitStatesSince(
IsHazardFn, VALUWaitStates);
934 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
936 return WaitStatesNeeded;
952 unsigned Opcode =
MI.getOpcode();
962 if (
auto *DstSel =
TII->getNamedOperand(
MI, AMDGPU::OpName::dst_sel))
964 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
970 if (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src0_modifiers) &
972 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
976 (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src2_modifiers) &
978 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
984 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1005 for (
auto &Operand : VALU->operands()) {
1006 if (Operand.isReg() &&
TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1013int GCNHazardRecognizer::checkVALUHazards(
MachineInstr *VALU) {
1014 int WaitStatesNeeded = 0;
1017 const int TransDefWaitstates = 1;
1019 auto IsTransDefFn = [
this,
VALU](
const MachineInstr &
MI) {
1022 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1023 const SIInstrInfo *TII = ST.getInstrInfo();
1024 Register Def = TII->getNamedOperand(
MI, AMDGPU::OpName::vdst)->getReg();
1026 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1027 if (
Use.isReg() && TRI->regsOverlap(Def,
Use.getReg()))
1034 int WaitStatesNeededForDef =
1035 TransDefWaitstates -
1036 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1037 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1040 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1041 const int Shift16DefWaitstates = 1;
1043 auto IsShift16BitDefFn = [
this,
VALU](
const MachineInstr &ProducerMI) {
1044 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1045 const MachineOperand *ForwardedDst =
1051 if (ProducerMI.isInlineAsm()) {
1053 for (
auto &Def : ProducerMI.all_defs()) {
1062 int WaitStatesNeededForDef =
1063 Shift16DefWaitstates -
1064 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1065 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1068 if (ST.hasVDecCoExecHazard()) {
1069 const int VALUWriteSGPRVALUReadWaitstates = 2;
1070 const int VALUWriteEXECRWLane = 4;
1071 const int VALUWriteVGPRReadlaneRead = 1;
1073 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1074 const MachineRegisterInfo &
MRI = MF.getRegInfo();
1076 auto IsVALUDefSGPRFn = [&
UseReg, TRI](
const MachineInstr &
MI) {
1079 return MI.modifiesRegister(
UseReg, TRI);
1082 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1088 int WaitStatesNeededForDef =
1089 VALUWriteSGPRVALUReadWaitstates -
1090 getWaitStatesSince(IsVALUDefSGPRFn,
1091 VALUWriteSGPRVALUReadWaitstates);
1092 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1096 if (
VALU->readsRegister(AMDGPU::VCC, TRI)) {
1098 int WaitStatesNeededForDef =
1099 VALUWriteSGPRVALUReadWaitstates -
1100 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1101 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1104 switch (
VALU->getOpcode()) {
1105 case AMDGPU::V_READLANE_B32:
1106 case AMDGPU::V_READFIRSTLANE_B32: {
1107 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1109 int WaitStatesNeededForDef =
1110 VALUWriteVGPRReadlaneRead -
1111 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1112 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1115 case AMDGPU::V_WRITELANE_B32: {
1117 int WaitStatesNeededForDef =
1118 VALUWriteEXECRWLane -
1119 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1120 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1130 if (!ST.has12DWordStoreHazard())
1131 return WaitStatesNeeded;
1133 const MachineRegisterInfo &
MRI = MF.getRegInfo();
1135 for (
const MachineOperand &Def :
VALU->defs()) {
1136 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def,
MRI));
1139 return WaitStatesNeeded;
1142int GCNHazardRecognizer::checkInlineAsmHazards(
MachineInstr *IA) {
1151 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1152 !ST.hasCvtScaleForwardingHazard())
1155 const MachineRegisterInfo &
MRI = MF.getRegInfo();
1156 int WaitStatesNeeded = 0;
1158 for (
const MachineOperand &
Op :
1160 if (
Op.isReg() &&
Op.isDef()) {
1161 if (!TRI.isVectorRegister(
MRI,
Op.getReg()))
1164 if (ST.has12DWordStoreHazard()) {
1166 std::max(WaitStatesNeeded, checkVALUHazardsHelper(
Op,
MRI));
1171 if (ST.hasDstSelForwardingHazard()) {
1172 const int Shift16DefWaitstates = 1;
1174 auto IsShift16BitDefFn = [
this, &
IA](
const MachineInstr &ProducerMI) {
1178 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1179 IA->readsRegister(Dst->getReg(), &TRI);
1181 if (ProducerMI.isInlineAsm()) {
1183 for (
auto &Def : ProducerMI.all_defs()) {
1184 if (
IA->modifiesRegister(
Def.getReg(), &TRI) ||
1185 IA->readsRegister(
Def.getReg(), &TRI)) {
1194 int WaitStatesNeededForDef =
1195 Shift16DefWaitstates -
1196 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1197 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1200 return WaitStatesNeeded;
1203int GCNHazardRecognizer::checkRWLaneHazards(
MachineInstr *RWLane) {
1204 const SIInstrInfo *TII = ST.getInstrInfo();
1205 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1206 const MachineRegisterInfo &
MRI = MF.getRegInfo();
1208 const MachineOperand *LaneSelectOp =
1209 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1211 if (!LaneSelectOp->
isReg() || !TRI->isSGPRReg(
MRI, LaneSelectOp->
getReg()))
1215 auto IsHazardFn = [TII](
const MachineInstr &
MI) {
return TII->isVALU(
MI); };
1217 const int RWLaneWaitStates = 4;
1218 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg,
IsHazardFn,
1220 return RWLaneWaitStates - WaitStatesSince;
1223int GCNHazardRecognizer::checkRFEHazards(
MachineInstr *RFE) {
1224 if (!ST.hasRFEHazards())
1227 const SIInstrInfo *TII = ST.getInstrInfo();
1229 const int RFEWaitStates = 1;
1234 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, RFEWaitStates);
1235 return RFEWaitStates - WaitStatesNeeded;
1239 const SIInstrInfo *TII = ST.getInstrInfo();
1240 const int ReadM0WaitStates = 1;
1241 auto IsHazardFn = [TII](
const MachineInstr &
MI) {
return TII->isSALU(
MI); };
1242 return ReadM0WaitStates -
1243 getWaitStatesSinceDef(AMDGPU::M0,
IsHazardFn, ReadM0WaitStates);
1247 fixVMEMtoScalarWriteHazards(
MI);
1248 fixVcmpxPermlaneHazards(
MI);
1249 fixSMEMtoVectorWriteHazards(
MI);
1250 fixVcmpxExecWARHazard(
MI);
1251 fixLdsBranchVmemWARHazard(
MI);
1252 if (ST.hasLdsDirect()) {
1253 fixLdsDirectVALUHazard(
MI);
1254 fixLdsDirectVMEMHazard(
MI);
1256 fixVALUPartialForwardingHazard(
MI);
1257 fixVALUTransUseHazard(
MI);
1258 fixVALUTransCoexecutionHazards(
MI);
1260 fixWMMACoexecutionHazards(
MI);
1261 fixShift64HighRegBug(
MI);
1262 fixVALUMaskWriteHazard(
MI);
1263 fixRequiredExportPriority(
MI);
1264 if (ST.requiresWaitIdleBeforeGetReg())
1265 fixGetRegWaitIdle(
MI);
1266 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1267 fixDsAtomicAsyncBarrierArriveB64(
MI);
1268 if (ST.hasScratchBaseForwardingHazard())
1269 fixScratchBaseForwardingHazard(
MI);
1270 if (ST.setRegModeNeedsVNOPs())
1276 return (
TII.isVOPC(
MI) ||
1277 (
MI.isCompare() && (
TII.isVOP3(
MI) ||
TII.isSDWA(
MI)))) &&
1278 MI.modifiesRegister(AMDGPU::EXEC, &
TRI);
1281bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(
MachineInstr *
MI) {
1285 const SIInstrInfo *TII = ST.getInstrInfo();
1286 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1292 unsigned Opc =
MI.getOpcode();
1294 Opc != AMDGPU::V_NOP_e64 &&
Opc != AMDGPU::V_NOP_sdwa;
1298 std::numeric_limits<int>::max())
1304 auto *Src0 = TII->getNamedOperand(*
MI, AMDGPU::OpName::src0);
1306 bool IsUndef = Src0->isUndef();
1308 TII->get(AMDGPU::V_MOV_B32_e32))
1315bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(
MachineInstr *
MI) {
1316 if (!ST.hasVMEMtoScalarWriteHazard())
1318 assert(!ST.hasExtendedWaitCounts());
1323 if (
MI->getNumDefs() == 0)
1326 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1332 for (
const MachineOperand &Def :
MI->defs()) {
1333 const MachineOperand *
Op =
1334 I.findRegisterUseOperand(
Def.getReg(), TRI,
false);
1344 (
MI.getOpcode() == AMDGPU::S_WAITCNT &&
1345 !
MI.getOperand(0).getImm()) ||
1346 (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1351 std::numeric_limits<int>::max())
1354 const SIInstrInfo *TII = ST.getInstrInfo();
1356 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1361bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(
MachineInstr *
MI) {
1362 if (!ST.hasSMEMtoVectorWriteHazard())
1364 assert(!ST.hasExtendedWaitCounts());
1369 AMDGPU::OpName SDSTName;
1370 switch (
MI->getOpcode()) {
1371 case AMDGPU::V_READLANE_B32:
1372 case AMDGPU::V_READFIRSTLANE_B32:
1373 SDSTName = AMDGPU::OpName::vdst;
1376 SDSTName = AMDGPU::OpName::sdst;
1380 const SIInstrInfo *TII = ST.getInstrInfo();
1381 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1383 const MachineOperand *SDST = TII->getNamedOperand(*
MI, SDSTName);
1385 for (
const auto &MO :
MI->implicit_operands()) {
1386 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1397 auto IsHazardFn = [SDSTReg, TRI](
const MachineInstr &
I) {
1402 if (TII->isSALU(
MI)) {
1403 switch (
MI.getOpcode()) {
1404 case AMDGPU::S_SETVSKIP:
1405 case AMDGPU::S_VERSION:
1406 case AMDGPU::S_WAITCNT_VSCNT:
1407 case AMDGPU::S_WAITCNT_VMCNT:
1408 case AMDGPU::S_WAITCNT_EXPCNT:
1411 case AMDGPU::S_WAITCNT_LGKMCNT:
1413 return (
MI.getOperand(1).getImm() == 0) &&
1414 (
MI.getOperand(0).
getReg() == AMDGPU::SGPR_NULL);
1415 case AMDGPU::S_WAITCNT: {
1416 const int64_t
Imm =
MI.getOperand(0).getImm();
1419 return (Decoded.
DsCnt == 0);
1423 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1424 "unexpected wait count instruction");
1426 if (TII->isSOPP(
MI))
1442 std::numeric_limits<int>::max())
1446 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1451bool GCNHazardRecognizer::fixVcmpxExecWARHazard(
MachineInstr *
MI) {
1452 if (!ST.hasVcmpxExecWARHazard())
1454 assert(!ST.hasExtendedWaitCounts());
1459 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1460 if (!
MI->modifiesRegister(AMDGPU::EXEC, TRI))
1466 return I.readsRegister(AMDGPU::EXEC, TRI);
1469 const SIInstrInfo *TII = ST.getInstrInfo();
1470 auto IsExpiredFn = [TII, TRI](
const MachineInstr &
MI, int) {
1472 if (TII->getNamedOperand(
MI, AMDGPU::OpName::sdst))
1474 for (
auto MO :
MI.implicit_operands())
1475 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1478 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1485 std::numeric_limits<int>::max())
1489 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1496 if (!ST.hasLdsBranchVmemWARHazard())
1501 bool HasLds =
false;
1502 bool HasVmem =
false;
1503 for (
auto &
MBB : MF) {
1504 for (
auto &
MI :
MBB) {
1507 if (HasLds && HasVmem)
1515 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1516 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1517 !
I.getOperand(1).getImm();
1520bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(
MachineInstr *
MI) {
1521 if (!RunLdsBranchVmemWARHazardFixup)
1524 assert(ST.hasLdsBranchVmemWARHazard());
1525 assert(!ST.hasExtendedWaitCounts());
1527 auto IsHazardInst = [](
const MachineInstr &
MI) {
1535 auto InstType = IsHazardInst(*
MI);
1539 auto IsExpiredFn = [&IsHazardInst](
const MachineInstr &
I, int) {
1543 auto IsHazardFn = [InstType, &IsHazardInst](
const MachineInstr &
I) {
1547 auto IsHazardFn = [InstType, IsHazardInst](
const MachineInstr &
I) {
1548 auto InstType2 = IsHazardInst(
I);
1549 return InstType2 && InstType != InstType2;
1552 auto IsExpiredFn = [InstType, &IsHazardInst](
const MachineInstr &
I, int) {
1553 auto InstType2 = IsHazardInst(
I);
1554 if (InstType == InstType2)
1561 std::numeric_limits<int>::max();
1565 std::numeric_limits<int>::max())
1568 const SIInstrInfo *TII = ST.getInstrInfo();
1570 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1577bool GCNHazardRecognizer::fixLdsDirectVALUHazard(
MachineInstr *
MI) {
1581 const int NoHazardWaitStates = 15;
1582 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1585 bool VisitedTrans =
false;
1586 auto IsHazardFn = [
this, VDSTReg, &VisitedTrans](
const MachineInstr &
I) {
1591 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1593 auto IsExpiredFn = [&](
const MachineInstr &
I,
int WaitStates) {
1594 if (WaitStates >= NoHazardWaitStates)
1600 auto GetWaitStatesFn = [](
const MachineInstr &
MI) {
1604 DenseSet<const MachineBasicBlock *> Visited;
1606 std::next(
MI->getReverseIterator()), 0,
1614 MachineOperand *WaitVdstOp =
1615 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvdst);
1616 WaitVdstOp->
setImm(std::min(
Count, NoHazardWaitStates));
1621bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(
MachineInstr *
MI) {
1625 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1628 auto IsHazardFn = [
this, VDSTReg](
const MachineInstr &
I) {
1631 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1633 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1636 auto IsExpiredFn = [
this, LdsdirCanWait](
const MachineInstr &
I, int) {
1638 (
I.getOpcode() == AMDGPU::S_WAITCNT && !
I.getOperand(0).getImm()) ||
1639 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1642 !TII.getNamedOperand(
I, AMDGPU::OpName::waitvsrc)->getImm());
1646 std::numeric_limits<int>::max())
1649 if (LdsdirCanWait) {
1650 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1653 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1660bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(
MachineInstr *
MI) {
1661 if (!ST.hasVALUPartialForwardingHazard())
1663 assert(!ST.hasExtendedWaitCounts());
1668 SmallSetVector<Register, 4> SrcVGPRs;
1670 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1671 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
1676 if (SrcVGPRs.
size() <= 1)
1694 const int Intv1plus2MaxVALUs = 2;
1695 const int Intv3MaxVALUs = 4;
1696 const int IntvMaxVALUs = 6;
1697 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1700 SmallDenseMap<Register, int, 4> DefPos;
1701 int ExecPos = std::numeric_limits<int>::max();
1704 static unsigned getHashValue(
const StateType &State) {
1708 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1709 return LHS.DefPos ==
RHS.DefPos &&
LHS.ExecPos ==
RHS.ExecPos &&
1717 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1719 if (State.VALUs > NoHazardVALUWaitStates)
1725 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1733 if (!State.DefPos.count(Src) &&
I.modifiesRegister(Src, &TRI)) {
1734 State.DefPos[Src] = State.VALUs;
1739 if (State.ExecPos == std::numeric_limits<int>::max()) {
1740 if (!State.DefPos.empty() &&
I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1741 State.ExecPos = State.VALUs;
1748 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1756 if (State.ExecPos == std::numeric_limits<int>::max())
1759 int PreExecPos = std::numeric_limits<int>::max();
1760 int PostExecPos = std::numeric_limits<int>::max();
1762 for (
auto Entry : State.DefPos) {
1763 int DefVALUs =
Entry.second;
1764 if (DefVALUs != std::numeric_limits<int>::max()) {
1765 if (DefVALUs >= State.ExecPos)
1766 PreExecPos = std::min(PreExecPos, DefVALUs);
1768 PostExecPos = std::min(PostExecPos, DefVALUs);
1773 if (PostExecPos == std::numeric_limits<int>::max())
1777 int Intv3VALUs = PostExecPos;
1778 if (Intv3VALUs > Intv3MaxVALUs)
1782 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1783 if (Intv2VALUs > Intv1plus2MaxVALUs)
1787 if (PreExecPos == std::numeric_limits<int>::max())
1791 int Intv1VALUs = PreExecPos - State.ExecPos;
1792 if (Intv1VALUs > Intv1plus2MaxVALUs)
1796 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1801 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
1807 std::next(
MI->getReverseIterator())))
1811 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1817bool GCNHazardRecognizer::fixVALUTransUseHazard(
MachineInstr *
MI) {
1818 if (!ST.hasVALUTransUseHazard())
1820 assert(!ST.hasExtendedWaitCounts());
1825 SmallSet<Register, 4> SrcVGPRs;
1827 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1828 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
1842 const int IntvMaxVALUs = 5;
1843 const int IntvMaxTRANS = 1;
1849 static unsigned getHashValue(
const StateType &State) {
1852 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1853 return LHS.VALUs ==
RHS.VALUs &&
LHS.TRANS ==
RHS.TRANS;
1860 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1862 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1868 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1875 if (
I.modifiesRegister(Src, &TRI)) {
1883 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
1891 std::next(
MI->getReverseIterator())))
1897 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
// NOTE(review): lossy extraction — original-file line numbers are fused into
// the text and interior lines are missing. Confirm details against full source.
//
// Handles a VALU/TRANS co-execution hazard between MI (a VALU instruction) and
// a nearby TRANS instruction: the hazard exists if the TRANS result (vdst)
// overlaps any of MI's explicit uses, or if MI's vdst overlaps any of the
// TRANS instruction's explicit uses (RAW/WAR in either direction). When found,
// a V_NOP is inserted before MI to separate the pair.
1903bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(
MachineInstr *
MI) {
1908  const SIInstrInfo *TII = ST.getInstrInfo();
1909  const SIRegisterInfo *TRI = ST.getRegisterInfo();
1911  auto IsTransHazardFn = [
MI, TII, TRI](
const MachineInstr &
I) {
// RAW check: TRANS destination feeding one of MI's uses.
1916    Register TransDef = TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
1917    for (
const MachineOperand &ValuUse :
MI->explicit_uses()) {
1918      if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
// WAR check: MI's destination overlapping one of the TRANS uses.
// MI may have no register vdst (e.g. compares); bail out in that case.
1922    auto *ValuDst = TII->getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1923    if (!ValuDst || !ValuDst->isReg())
1927    Register ValuDef = ValuDst->getReg();
1928    for (
const MachineOperand &TransUse :
I.explicit_uses()) {
1929      if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
// Sentinel: search returning max() means no hazard within the window.
1940  const int HasVALU = std::numeric_limits<int>::max();
1941  if (::getWaitStatesSince(IsTransHazardFn,
MI,
IsExpiredFn) == HasVALU)
// Break co-execution by inserting a V_NOP in front of MI.
1944  BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1952 const SIInstrInfo *TII = ST.getInstrInfo();
1953 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1955 auto IsHazardFn = [
MI, TII, TRI,
this](
const MachineInstr &
I) {
1962 TII->getNamedOperand(*
MI, AMDGPU::OpName::src0)->getReg();
1964 TII->getNamedOperand(*
MI, AMDGPU::OpName::src1)->getReg();
1967 TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
1969 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1970 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1979 TII->getNamedOperand(*
MI, AMDGPU::OpName::src2)->getReg();
1980 if (TRI->regsOverlap(PrevDstReg, CurIndex))
1994 std::numeric_limits<int>::max())
1997 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2009 unsigned Category) {
2011 "Handle me if the xdl wmma instruction latency changes");
// NOTE(review): lossy extraction — original-file line numbers are fused into
// the text and interior lines are missing (e.g. the Category-classification
// logic between the latency computation and the register checks). Comments
// below describe only what the visible fragments show.
//
// Handles WMMA (XDL) co-execution hazards. Two cases, each searched over four
// latency/shape categories with per-category wait-state limits:
//   * MI is itself an XDL WMMA: hazard if a prior XDL WMMA's vdst overlaps
//     MI's src0/src1 (and, for the indexed form, src2).
//   * MI is a plain VALU: hazard if a prior XDL WMMA's vdst overlaps any of
//     MI's uses or its vdst, or MI's vdst overlaps the WMMA's sources.
// The shortfall is padded with V_NOPs before MI.
2048bool GCNHazardRecognizer::fixWMMACoexecutionHazards(
MachineInstr *
MI) {
2052  const SIInstrInfo *TII = ST.getInstrInfo();
2056  const SIRegisterInfo *TRI = ST.getRegisterInfo();
// Per-category wait-state limits; category selection logic is in the
// elided lines — presumably keyed off instruction latency. TODO confirm.
2063  const int WMMAWaitStates[] = {5, 9, 3, 5};
2064  const int VALUWaitStates[] = {4, 8, 2, 4};
2065  unsigned Category = 0;
// Hazard predicate for WMMA-after-WMMA: prior XDL WMMA result (D0)
// overlapping current WMMA's A/B (src0/src1) or index (src2) operands.
2067  auto IsWMMAHazardFn = [
MI, TII, TRI, &Category,
this](
const MachineInstr &
I) {
2068    if (!TII->isXDLWMMA(
I))
2071    unsigned Latency = TSchedModel.computeInstrLatency(&
I);
2075      Register D0 = TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
2076      Register A1 = TII->getNamedOperand(*
MI, AMDGPU::OpName::src0)->getReg();
2077      Register B1 = TII->getNamedOperand(*
MI, AMDGPU::OpName::src1)->getReg();
2080      if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
2084        Register Idx1 = TII->getNamedOperand(*
MI, AMDGPU::OpName::src2)->getReg();
2085        if (TRI->regsOverlap(D0, Idx1))
// Hazard predicate for VALU-after-WMMA: checks RAW (D0 vs MI uses),
// WAW (D0 vs MI vdst D1) and WAR (WMMA srcs vs D1) overlaps.
2092  auto IsVALUHazardFn = [
MI, TII, TRI, &Category,
this](
const MachineInstr &
I) {
2093    if (!TII->isXDLWMMA(
I))
2096    unsigned Latency = TSchedModel.computeInstrLatency(&
I);
2101      Register D0 = TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
2102      for (
const MachineOperand &ValuUse :
MI->explicit_uses()) {
2103        if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
2107      auto *ValuDst = TII->getNamedOperand(*
MI, AMDGPU::OpName::vdst);
2108      if (!ValuDst || !ValuDst->isReg())
2113      if (TRI->regsOverlap(D0, D1))
2117      Register A0 = TII->getNamedOperand(
I, AMDGPU::OpName::src0)->getReg();
2118      Register B0 = TII->getNamedOperand(
I, AMDGPU::OpName::src1)->getReg();
2119      if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
2123        Register Idx0 = TII->getNamedOperand(
I, AMDGPU::OpName::src2)->getReg();
2124        if (TRI->regsOverlap(D1, Idx0))
// Search expires once the active category's limit is reached.
2132  auto IsExpiredFn = [&Limit](
const MachineInstr &,
int WaitStates) {
2133    return WaitStates >= Limit;
2136  auto GetWaitStatesFn = [](
const MachineInstr &
I) {
// Try each category in turn until one reports a positive wait requirement.
2140  int WaitStatesNeeded = -1;
2141  if (TII->isXDLWMMA(*
MI)) {
2142    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2143      Limit = WMMAWaitStates[Category];
2144      DenseSet<const MachineBasicBlock *> Visited;
2150          Limit - ::getWaitStatesSince(IsWMMAHazardFn,
MI->getParent(),
2151                                       std::next(
MI->getReverseIterator()), 0,
2155    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2156      Limit = VALUWaitStates[Category];
2157      DenseSet<const MachineBasicBlock *> Visited;
2163          Limit - ::getWaitStatesSince(IsVALUHazardFn,
MI->getParent(),
2164                                       std::next(
MI->getReverseIterator()), 0,
// Pad the remaining wait states with V_NOPs.
2171  for (
int i = 0; i < WaitStatesNeeded; i++)
2173                  TII->get(AMDGPU::V_NOP_e32));
// NOTE(review): lossy extraction — original-file line numbers are fused into
// the text and interior lines are missing. Comments describe only what the
// visible fragments show; confirm against full source.
//
// Works around the 64-bit shift high-register hardware bug: when the shift
// amount (src0) of a 64-bit V_LSHLREV/V_LSHRREV/V_ASHRREV lives in a VGPR
// whose number is 7 mod 8 (lane ending a group of 8), the amount is swapped
// (via V_SWAP_B32) into a scratch VGPR that MI neither reads nor writes, the
// instruction is rewritten to use it, and the value is swapped back afterward.
2178bool GCNHazardRecognizer::fixShift64HighRegBug(
MachineInstr *
MI) {
2179  if (!ST.hasShift64HighRegBug())
2181  assert(!ST.hasExtendedWaitCounts());
// Only the 64-bit variable-shift opcodes are affected.
2183  switch (
MI->getOpcode()) {
2186  case AMDGPU::V_LSHLREV_B64_e64:
2187  case AMDGPU::V_LSHRREV_B64_e64:
2188  case AMDGPU::V_ASHRREV_I64_e64:
2192  MachineOperand *Amt = TII.getNamedOperand(*
MI, AMDGPU::OpName::src0);
2197  const MachineRegisterInfo &
MRI = MF.getRegInfo();
// Trigger condition: amount in a VGPR numbered 7 mod 8.
2199  if (!TRI.isVGPR(
MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
// presumably a cheap escape: if the next VGPR is unused the amount can be
// handled without swapping — TODO confirm (interior lines elided).
2202  if (AmtReg != AMDGPU::VGPR255 &&
MRI.isPhysRegUsed(AmtReg + 1))
// Classify how the amount register aliases MI's other operands; the
// overlapped case needs an aligned 64-bit scratch pair instead of one VGPR.
2205  MachineOperand *Src1 = TII.getNamedOperand(*
MI, AMDGPU::OpName::src1);
2206  bool OverlappedSrc = Src1->
isReg() && TRI.regsOverlap(Src1->
getReg(), AmtReg);
2207  bool OverlappedDst =
MI->modifiesRegister(AmtReg, &TRI);
2208  bool Overlapped = OverlappedSrc || OverlappedDst;
2210  assert(!OverlappedDst || !OverlappedSrc ||
2211         Src1->
getReg() ==
MI->getOperand(0).getReg());
2212  assert(ST.needsAlignedVGPRs());
2213  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
// Find a scratch register (64-bit aligned pair when Overlapped) that MI
// neither defines nor reads, so the swap cannot clobber live operands of MI.
2216  for (MCRegister
Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2217                                    : AMDGPU::VGPR_32RegClass) {
2218    if (!
MI->modifiesRegister(
Reg, &TRI) && !
MI->readsRegister(
Reg, &TRI)) {
2224  Register NewAmt = Overlapped ? (
Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2229    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2232  MachineBasicBlock *
MBB =
MI->getParent();
// Swap the amount into the scratch register before MI. runOnInstruction
// re-runs hazard handling on the newly inserted V_SWAP_B32.
2244  runOnInstruction(
      BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
// Swap back after MI to restore the original register contents.
2251    BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
2257  BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
// Rewrite MI's operands to reference the scratch register.
2271    MI->getOperand(0).setReg(NewReg);
2272  if (OverlappedSrc) {
2282 int NSAtoVMEMWaitStates = 1;
2284 if (!ST.hasNSAtoVMEMBug())
2290 const SIInstrInfo *TII = ST.getInstrInfo();
2291 const auto *
Offset = TII->getNamedOperand(*
MI, AMDGPU::OpName::offset);
2299 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2300 TII->getInstSizeInBytes(
I) >= 16;
2303 return NSAtoVMEMWaitStates - getWaitStatesSince(
IsHazardFn, 1);
// NOTE(review): lossy extraction — original-file line numbers are fused into
// the text and interior lines are missing. Confirm against full source.
//
// Returns the number of wait states needed between an FP atomic operation and
// a following S_DENORM_MODE (3 on affected subtargets). Applies only when MI
// is S_DENORM_MODE and the subtarget reports the hazard; the returned value is
// the requirement minus the wait states already elapsed since the hazard.
2306int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
MachineInstr *
MI) {
2307  int FPAtomicToDenormModeWaitStates = 3;
2309  if (!ST.hasFPAtomicToDenormModeHazard())
2311  assert(!ST.hasExtendedWaitCounts());
2313  if (
MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2322  auto IsExpiredFn = [](
const MachineInstr &
MI,
int WaitStates) {
2329  return FPAtomicToDenormModeWaitStates -
2336 return ST.hasGFX90AInsts() ? checkMAIHazards90A(
MI) : checkMAIHazards908(
MI);
2344 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2348 int NeighborMFMALatency = 0;
2349 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2350 this](
const MachineInstr &
MI) {
2354 NeighborMFMALatency = this->getMFMAPipelineWaitStates(
MI);
2358 const int MaxMFMAPipelineWaitStates = 16;
2359 int WaitStatesSinceNeighborMFMA =
2360 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2362 int NeighborMFMAPaddingNeeded =
2364 WaitStatesSinceNeighborMFMA;
2366 return std::max(0, NeighborMFMAPaddingNeeded);
2370 int WaitStatesNeeded = 0;
2371 unsigned Opc =
MI->getOpcode();
2373 auto IsVALUFn = [](
const MachineInstr &
MI) {
2377 if (
Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
2378 const int LegacyVALUWritesVGPRWaitStates = 2;
2379 const int VALUWritesExecWaitStates = 4;
2380 const int MaxWaitStates = 4;
2382 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2383 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2384 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2386 if (WaitStatesNeeded < MaxWaitStates) {
2387 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2388 const int MaxWaitStates = 2;
2390 if (!
Use.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
2393 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2394 getWaitStatesSinceDef(
Use.getReg(), IsVALUFn, MaxWaitStates);
2395 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2397 if (WaitStatesNeeded == MaxWaitStates)
2403 for (
const MachineOperand &
Op :
MI->explicit_operands()) {
2404 if (!
Op.isReg() || !TRI.isAGPR(MF.getRegInfo(),
Op.getReg()))
2407 if (
Op.isDef() &&
Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2410 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2411 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2412 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2413 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2414 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2415 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2416 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2417 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2418 const int MaxWaitStates = 18;
2420 unsigned HazardDefLatency = 0;
2422 auto IsOverlappedMFMAFn = [
Reg, &HazardDefLatency,
2423 this](
const MachineInstr &
MI) {
2430 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2431 return TRI.regsOverlap(DstReg,
Reg);
2434 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn,
2436 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2437 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
2438 int OpNo =
Op.getOperandNo();
2439 if (OpNo == SrcCIdx) {
2440 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2441 }
else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2442 switch (HazardDefLatency) {
2443 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2445 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2447 case 16: [[fallthrough]];
2448 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2451 }
else if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2452 switch (HazardDefLatency) {
2453 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2455 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2457 case 16: [[fallthrough]];
2458 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2463 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2464 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2466 if (WaitStatesNeeded == MaxWaitStates)
2467 return WaitStatesNeeded;
2469 auto IsAccVgprWriteFn = [
Reg,
this](
const MachineInstr &
MI) {
2470 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2473 return TRI.regsOverlap(
Reg, DstReg);
2476 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2477 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2478 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2479 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2480 if (OpNo == SrcCIdx)
2481 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2482 else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2483 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2485 WaitStatesNeededForUse = NeedWaitStates -
2486 getWaitStatesSinceDef(
Reg, IsAccVgprWriteFn, MaxWaitStates);
2487 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2489 if (WaitStatesNeeded == MaxWaitStates)
2490 return WaitStatesNeeded;
2493 if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2494 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2495 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2496 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2497 const int MaxWaitStates = 13;
2498 Register DstReg =
MI->getOperand(0).getReg();
2499 unsigned HazardDefLatency = 0;
2501 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2502 this](
const MachineInstr &
MI) {
2505 Register Reg = TII.getNamedOperand(
MI, AMDGPU::OpName::src2)->getReg();
2507 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2508 return TRI.regsOverlap(
Reg, DstReg);
2511 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2513 switch (HazardDefLatency) {
2514 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2516 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2518 case 16: [[fallthrough]];
2519 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2523 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2524 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2528 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2530 return WaitStatesNeeded;
2541 return NumPasses + 1 + IsGFX950;
2552 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2570 return NumPasses + 2;
2580 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2584 int WaitStatesNeeded = 0;
2585 unsigned Opc =
MI->getOpcode();
2587 auto IsLegacyVALUFn = [](
const MachineInstr &
MI) {
2591 auto IsLegacyVALUNotDotFn = [](
const MachineInstr &
MI) {
2597 return WaitStatesNeeded;
2599 const int VALUWritesExecWaitStates = 4;
2600 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2601 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2602 VALUWritesExecWaitStates);
2603 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2605 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
2608 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2609 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2610 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2611 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2612 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2613 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2614 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2615 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2616 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2617 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2618 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2619 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2620 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2621 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2622 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2623 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2624 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2625 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2626 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2627 const int MaxWaitStates = 19;
2633 const MachineInstr *MI1;
2635 auto IsOverlappedMFMAFn = [
Reg, &FullReg, &MI1,
2636 this](
const MachineInstr &
MI) {
2640 FullReg = (DstReg ==
Reg);
2642 return TRI.regsOverlap(DstReg,
Reg);
2645 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2646 getWaitStatesSinceDef(
Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2647 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2650 getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn, MaxWaitStates);
2651 if (NumWaitStates == std::numeric_limits<int>::max())
2654 int OpNo =
Use.getOperandNo();
2656 int NeedWaitStates = 0;
2657 if (OpNo == SrcCIdx) {
2661 }
else if (FullReg) {
2662 if ((
Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2663 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2664 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2665 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2666 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2667 else if (ST.hasGFX940Insts() &&
2668 TSchedModel.computeInstrLatency(MI1) == 2)
2669 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2672 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2673 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2674 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2675 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2676 if (!TII.isXDL(*
MI))
2679 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2680 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2682 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2683 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2684 if (!TII.isXDL(*
MI))
2685 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2688 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2689 if (ST.hasGFX940Insts()) {
2690 if (TII.isXDL(*
MI) && !TII.isXDL(*MI1))
2697 NumPasses, ST.hasGFX950Insts())
2699 NumPasses, ST.hasGFX950Insts()))
2705 switch (NumPasses) {
2709 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2710 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2715 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2716 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2721 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2722 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2731 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2732 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2733 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2734 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2737 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2738 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2740 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2741 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2742 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2745 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2747 if (ST.hasGFX940Insts()) {
2751 NumPasses, ST.hasGFX950Insts())
2757 switch (NumPasses) {
2759 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2764 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2768 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2772 if (WaitStatesNeeded >= NeedWaitStates)
2775 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2776 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2778 if (WaitStatesNeeded == MaxWaitStates)
2783 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2785 return WaitStatesNeeded;
2790 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2793 int WaitStatesNeeded = 0;
2795 auto IsAccVgprReadFn = [](
const MachineInstr &
MI) {
2796 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2799 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
2800 if (!
Op.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Op.getReg()))
2805 const int AccVgprReadLdStWaitStates = 2;
2806 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2807 const int MaxWaitStates = 2;
2809 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2810 getWaitStatesSinceDef(
Reg, IsAccVgprReadFn, MaxWaitStates);
2811 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2813 if (WaitStatesNeeded == MaxWaitStates)
2814 return WaitStatesNeeded;
2816 auto IsVALUAccVgprRdWrCheckFn = [
Reg,
this](
const MachineInstr &
MI) {
2817 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2818 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2820 auto IsVALUFn = [](
const MachineInstr &
MI) {
2823 return getWaitStatesSinceDef(
Reg, IsVALUFn, 2 ) <
2824 std::numeric_limits<int>::max();
2827 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2828 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2829 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2832 return WaitStatesNeeded;
2836 assert(!ST.hasVcmpxPermlaneHazard() &&
2837 "this is a different vcmpx+permlane hazard");
2838 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2839 const SIInstrInfo *TII = ST.getInstrInfo();
2841 auto IsVCmpXWritesExecFn = [TII, TRI](
const MachineInstr &
MI) {
2845 auto IsVALUFn = [](
const MachineInstr &
MI) {
2849 const int VCmpXWritesExecWaitStates = 4;
2850 const int VALUWritesVDstWaitStates = 2;
2851 int WaitStatesNeeded = 0;
2853 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
2854 if (!
Op.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Op.getReg()))
2858 int WaitStatesSinceDef =
2859 VALUWritesVDstWaitStates -
2860 getWaitStatesSinceDef(
Reg, IsVALUFn,
2861 VALUWritesVDstWaitStates);
2862 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2863 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2867 int VCmpXHazardWaits =
2868 VCmpXWritesExecWaitStates -
2869 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2871 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2872 return WaitStatesNeeded;
2880 return NumPasses + 2;
2890 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2900 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2908 return NumPasses + 2;
2912 if (!ST.hasGFX90AInsts())
2915 auto IsDGEMMFn = [](
const MachineInstr &
MI) ->
bool {
2923 const MachineRegisterInfo &
MRI = MF.getRegInfo();
2925 int WaitStatesNeeded = 0;
2931 const MachineInstr *
MFMA =
nullptr;
2933 auto IsMFMAWriteFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
2935 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
2941 const MachineInstr *
DOT =
nullptr;
2942 auto IsDotWriteFn = [&
Reg, &
DOT,
this](
const MachineInstr &
MI) {
2944 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
2950 bool DGEMMAfterVALUWrite =
false;
2951 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite,
this](
const MachineInstr &
MI) {
2954 DGEMMAfterVALUWrite =
true;
2958 if (!TII.isVALU(
MI) || !DGEMMAfterVALUWrite)
2964 int SrcCIdx = AMDGPU::getNamedOperandIdx(
MI->getOpcode(),
2965 AMDGPU::OpName::src2);
2967 if (IsMemOrExport || IsVALU) {
2968 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2969 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2970 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2971 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2972 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2973 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2974 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2975 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2976 const int DotWriteSameDotReadSrcAB = 3;
2977 const int DotWriteDifferentVALURead = 3;
2978 const int DMFMABetweenVALUWriteVMEMRead = 2;
2979 const int MaxWaitStates = 19;
2981 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2987 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
2990 int NeedWaitStates = 0;
2991 if (
DOT->getOpcode() ==
MI->getOpcode()) {
2992 if (&Use - &
MI->getOperand(0) != SrcCIdx)
2993 NeedWaitStates = DotWriteSameDotReadSrcAB;
2995 NeedWaitStates = DotWriteDifferentVALURead;
2998 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2999 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3006 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3007 DGEMMAfterVALUWrite =
false;
3008 if (TRI.isVectorRegister(
MRI,
Reg)) {
3009 int WaitStatesNeededForUse =
3010 DMFMABetweenVALUWriteVMEMRead -
3011 getWaitStatesSinceDef(
Reg, IsDGEMMHazard,
3012 DMFMABetweenVALUWriteVMEMRead);
3014 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3019 WaitStatesSinceDef =
3020 getWaitStatesSinceDef(
Reg, IsMFMAWriteFn, MaxWaitStates);
3024 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(
MFMA);
3025 int NumPasses = HazardDefLatency;
3026 int NeedWaitStates = MaxWaitStates;
3029 switch (HazardDefLatency) {
3031 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3032 : DMFMA4x4WriteVgprVALUReadWaitStates;
3038 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3039 : (ST.hasGFX950Insts()
3040 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3041 : DMFMA16x16WriteVgprVALUReadWaitStates);
3046 }
else if (ST.hasGFX940Insts()) {
3050 NumPasses, ST.hasGFX950Insts())
3054 switch (HazardDefLatency) {
3056 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3059 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3062 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3069 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3070 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3072 if (WaitStatesNeeded == MaxWaitStates)
3077 unsigned Opc =
MI->getOpcode();
3078 const int DMFMAToFMA64WaitStates = 2;
3079 if ((
Opc == AMDGPU::V_FMA_F64_e64 ||
3080 Opc == AMDGPU::V_FMAC_F64_e32 ||
Opc == AMDGPU::V_FMAC_F64_e64 ||
3081 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3082 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3083 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3084 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3085 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3088 if (!IsVALU && !IsMemOrExport)
3089 return WaitStatesNeeded;
3091 for (
const MachineOperand &Def :
MI->defs()) {
3092 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3093 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3094 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3095 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3096 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3097 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3098 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3099 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3100 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3101 const int DotWriteDifferentVALUWrite = 3;
3102 const int MaxWaitStates = 19;
3103 const int MaxWarWaitStates = 15;
3108 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
3110 if (DOT &&
DOT->getOpcode() !=
MI->getOpcode())
3111 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3112 WaitStatesSinceDef);
3115 WaitStatesSinceDef =
3116 getWaitStatesSinceDef(
Reg, IsMFMAWriteFn, MaxWaitStates);
3118 int NeedWaitStates = MaxWaitStates;
3119 int NumPasses = TSchedModel.computeInstrLatency(
MFMA);
3122 switch (NumPasses) {
3124 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3128 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3133 }
else if (ST.hasGFX940Insts()) {
3137 NumPasses, ST.hasGFX950Insts())
3140 switch (NumPasses) {
3142 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3145 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3148 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3155 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3156 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3158 if (WaitStatesNeeded == MaxWaitStates)
3162 auto IsSMFMAReadAsCFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
3164 !
MI.readsRegister(
Reg, &TRI))
3167 if (ST.hasGFX940Insts() && !TII.isXDL(
MI))
3170 const MachineOperand *SrcC =
3171 TII.getNamedOperand(
MI, AMDGPU::OpName::src2);
3181 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3186 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(
MFMA);
3187 int NeedWaitStates = MaxWaitStates;
3188 switch (HazardDefLatency) {
3189 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3191 case 4:
assert(ST.hasGFX940Insts());
3192 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3194 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3196 case 16: [[fallthrough]];
3197 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3201 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3202 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3205 return WaitStatesNeeded;
3218 return MAI !=
nullptr;
3222 if (IsMFMAFn(*
MI)) {
3223 int W = getWaitStatesSince(IsMFMAFn, 16);
3225 return W < (int)TSchedModel.computeInstrLatency(MAI);
3239 while (
I->isBundledWithPred())
3245 if (
I->getOpcode() != AMDGPU::S_GETPC_B64)
3249 const unsigned NewBytes = 4;
3251 "Unexpected instruction insertion in bundle");
3254 while (NextMI != End && NextMI->isBundledWithPred()) {
3255 for (
auto &Operand : NextMI->operands()) {
3256 if (Operand.isGlobal())
3257 Operand.setOffset(Operand.getOffset() + NewBytes);
// NOTE(review): lossy extraction — original-file line numbers are fused into
// the text and many interior lines are missing (e.g. the IgnoreableSGPR helper
// header, the HazardReg selection, and the final mask computation). Comments
// describe only what the visible fragments show; confirm against full source.
//
// Works around the VALU mask-write hazard: after MI writes an SGPR mask, VALU
// instructions that consume a carry/condition SGPR (V_ADDC/V_SUBB/V_CNDMASK/
// V_DIV_FMAS families) may read stale data. The fix walks backwards tracking
// which hazard SGPRs are still live, merges any already-present
// S_WAITCNT_DEPCTR waits (taking the min of the SaSdst/VaSdst/VaVcc fields),
// and emits a combined S_WAITCNT_DEPCTR immediately after MI.
3263bool GCNHazardRecognizer::fixVALUMaskWriteHazard(
MachineInstr *
MI) {
3264  if (!ST.hasVALUMaskWriteHazard())
3266  assert(!ST.hasExtendedWaitCounts());
// Only SALU/VALU writers participate in this hazard.
3273  if (!IsSALU && !IsVALU)
3285  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3286  const MachineRegisterInfo &
MRI = MF.getRegInfo();
// Registers exempt from tracking (fragment of IgnoreableSGPR helper):
// EXEC halves and the null register cannot carry the hazard.
3291    case AMDGPU::EXEC_LO:
3292    case AMDGPU::EXEC_HI:
3294    case AMDGPU::SGPR_NULL:
3295    case AMDGPU::SGPR_NULL64:
// VCC in any of its forms is the other register of interest.
3303    return Reg == AMDGPU::VCC ||
Reg == AMDGPU::VCC_LO ||
Reg == AMDGPU::VCC_HI;
// Backwards-walk state: the set of SGPRs still exposed to the hazard.
3307    SmallSet<Register, 2> HazardSGPRs;
3309    static unsigned getHashValue(
const StateType &State) {
3312    static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
3313      return LHS.HazardSGPRs ==
RHS.HazardSGPRs;
3317  SmallVector<const MachineInstr *> WaitInstrs;
3318  bool HasSGPRRead =
false;
3319  StateType InitialState;
// Identify the single SGPR def of MI that can create the hazard; bail out
// (fragments visible) on multiple defs, ignorable SGPRs, or implicit/non-SGPR
// operands.
3322  MachineOperand *HazardDef =
nullptr;
3323  for (MachineOperand &
Op :
MI->operands()) {
3326    if (
Op.isDef() && HazardDef)
3330    if (IgnoreableSGPR(
Reg))
3333    if (
Op.isImplicit())
3335    if (!TRI->isSGPRReg(
MRI,
Reg))
// Seed the state: a 32-bit hazard reg is tracked directly; a 64-bit reg is
// tracked as its two 32-bit halves.
3353  if (AMDGPU::SReg_32RegClass.
contains(HazardReg)) {
3354    InitialState.HazardSGPRs.insert(HazardReg);
3357    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
3358    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
// Hazard predicate: a carry/condition-consuming VALU reading a tracked SGPR.
3361  auto IsHazardFn = [&](StateType &State,
const MachineInstr &
I) {
3362    if (State.HazardSGPRs.empty())
3365    switch (
I.getOpcode()) {
// e32/dpp forms read the carry implicitly (VCC).
3366    case AMDGPU::V_ADDC_U32_e32:
3367    case AMDGPU::V_ADDC_U32_dpp:
3368    case AMDGPU::V_CNDMASK_B16_t16_e32:
3369    case AMDGPU::V_CNDMASK_B16_fake16_e32:
3370    case AMDGPU::V_CNDMASK_B16_t16_dpp:
3371    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3372    case AMDGPU::V_CNDMASK_B32_e32:
3373    case AMDGPU::V_CNDMASK_B32_dpp:
3374    case AMDGPU::V_DIV_FMAS_F32_e64:
3375    case AMDGPU::V_DIV_FMAS_F64_e64:
3376    case AMDGPU::V_SUBB_U32_e32:
3377    case AMDGPU::V_SUBB_U32_dpp:
3378    case AMDGPU::V_SUBBREV_U32_e32:
3379    case AMDGPU::V_SUBBREV_U32_dpp: {
// e64 forms name the mask explicitly in src2; check it for overlap.
3383    case AMDGPU::V_ADDC_U32_e64:
3384    case AMDGPU::V_ADDC_U32_e64_dpp:
3385    case AMDGPU::V_CNDMASK_B16_t16_e64:
3386    case AMDGPU::V_CNDMASK_B16_fake16_e64:
3387    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3388    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3389    case AMDGPU::V_CNDMASK_B32_e64:
3390    case AMDGPU::V_CNDMASK_B32_e64_dpp:
3391    case AMDGPU::V_SUBB_U32_e64:
3392    case AMDGPU::V_SUBB_U32_e64_dpp:
3393    case AMDGPU::V_SUBBREV_U32_e64:
3394    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3396      const MachineOperand *SSRCOp = TII.getNamedOperand(
I, AMDGPU::OpName::src2);
3398      bool Result = TRI->regsOverlap(SSRCOp->
getReg(), HazardReg);
// State update: record reusable S_WAITCNT_DEPCTR instructions (same block,
// not bundled, mask covering the required fields) and retire tracked SGPRs
// once an intervening instruction touches them.
3410  auto UpdateStateFn = [&](StateType &State,
const MachineInstr &
I) {
3411    switch (
I.getOpcode()) {
3412    case AMDGPU::S_WAITCNT_DEPCTR:
3414      if (!HasSGPRRead &&
I.getParent() ==
MI->getParent() && !
I.isBundled() &&
3415          (
I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3420      for (
auto &
Op :
I.operands()) {
3425        if (IgnoreableSGPR(
Reg))
3428        if (
Op.isImplicit())
3430        if (!TRI->isSGPRReg(
MRI,
Reg))
3441        for (
Register SGPR : State.HazardSGPRs) {
3442          if (
Reg == SGPR || TRI->regsOverlap(
Reg, SGPR))
3446          State.HazardSGPRs.erase(SGPR);
3455                             std::next(
MI->getReverseIterator())))
// Fold any recorded waits into a single combined mask, taking the minimum of
// each counter field, and delete the now-redundant wait instructions.
3465  if (!WaitInstrs.
empty()) {
3469    SmallVector<MachineInstr *> ToErase;
3471    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3472                                             End = MI->getParent()->rend();
3473         Found < WaitInstrs.size() && It != End; ++It) {
3474      MachineInstr *WaitMI = &*It;
3476      if (std::as_const(WaitMI) != WaitInstrs[Found])
3479      unsigned WaitMask = WaitMI->getOperand(0).getImm();
3480      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3481      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3482          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
3483                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
3484      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3485          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
3486                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
3487      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3488          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
3489                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
3490      ToErase.push_back(WaitMI);
3493    for (MachineInstr *WaitMI : ToErase)
3494      WaitMI->eraseFromParent();
// Emit the combined S_WAITCNT_DEPCTR immediately after MI.
3498  auto NextMI = std::next(
MI->getIterator());
3499  auto NewMI =
BuildMI(*
MI->getParent(), NextMI,
MI->getDebugLoc(),
3500                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3512 if (EntryMBB.
begin() != EntryMBB.
end()) {
3513 auto &EntryMI = *EntryMBB.
begin();
3514 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3515 EntryMI.getOperand(0).getImm() >= Priority)
3524bool GCNHazardRecognizer::fixRequiredExportPriority(
MachineInstr *
MI) {
3525 if (!ST.hasRequiredExportPriority())
3530 MachineBasicBlock *
MBB =
MI->getParent();
3543 const int MaxPriority = 3;
3544 const int NormalPriority = 2;
3545 const int PostExportPriority = 0;
3547 auto It =
MI->getIterator();
3548 switch (
MI->getOpcode()) {
3549 case AMDGPU::S_ENDPGM:
3550 case AMDGPU::S_ENDPGM_SAVED:
3551 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3552 case AMDGPU::SI_RETURN_TO_EPILOG:
3555 if (MF->getFrameInfo().hasCalls())
3558 case AMDGPU::S_SETPRIO: {
3560 auto &PrioOp =
MI->getOperand(0);
3561 int Prio = PrioOp.getImm();
3562 bool InWA = (Prio == PostExportPriority) &&
3563 (It !=
MBB->
begin() && TII.isEXP(*std::prev(It)));
3564 if (InWA || Prio >= NormalPriority)
3566 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3570 if (!TII.isEXP(*
MI))
3581 auto NextMI = std::next(It);
3582 bool EndOfShader =
false;
3583 if (NextMI !=
MBB->
end()) {
3585 if (TII.isEXP(*NextMI))
3588 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3589 NextMI->getOperand(0).getImm() == PostExportPriority)
3591 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3598 .
addImm(PostExportPriority);
3602 BuildMI(*
MBB, NextMI,
DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3603 .
addReg(AMDGPU::SGPR_NULL)
3623 const SIInstrInfo *TII = ST.getInstrInfo();
3635 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3640bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(
MachineInstr *
MI) {
3641 if (
MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3644 const SIInstrInfo *TII = ST.getInstrInfo();
3646 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3648 BuildMI(*
MI->getParent(), std::next(
MI->getIterator()),
MI->getDebugLoc(),
3649 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3655bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(
MachineInstr *
MI) {
3658 if (!IsHazardRecognizerMode)
3661 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3662 const SIInstrInfo *TII = ST.getInstrInfo();
3664 const int FlatScrBaseWaitStates = 10;
3666 bool ReadsFlatScrLo =
3667 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3668 bool ReadsFlatScrHi =
3669 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3675 ReadsFlatScrLo =
true;
3678 ReadsFlatScrHi =
true;
3683 const MachineRegisterInfo &
MRI = MF.getRegInfo();
3686 DenseSet<const MachineBasicBlock *> Visited;
3688 return MI.modifiesRegister(
Reg, TRI);
3693 auto IsSGPRDef = [TII, TRI, &
MRI](
const MachineInstr &
MI) ->
unsigned {
3694 if (!TII->isSALU(
MI) && !TII->isVALU(
MI))
3696 for (
const MachineOperand &MO :
MI.all_defs()) {
3697 if (TRI->isSGPRReg(
MRI, MO.getReg()))
3703 auto IsExpiredFn = [=](
const MachineInstr &
MI,
int SgprWrites) {
3704 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3705 unsigned Wait =
MI.getOperand(0).getImm();
3710 return SgprWrites >= FlatScrBaseWaitStates;
3713 return ::getWaitStatesSince(
3714 IsHazardFn,
MI->getParent(), std::next(
MI->getReverseIterator()),
3715 0,
IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3718 if ((!ReadsFlatScrLo ||
MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
3719 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3720 (!ReadsFlatScrHi ||
MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
3721 !IsRegDefHazard(AMDGPU::SGPR103)))
3725 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3736 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3737 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
unsigned const MachineRegisterInfo * MRI
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel fowarding issue Dst .
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static bool IsWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, unsigned Latency, unsigned Category)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static llvm::Error parse(DataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
static const uint32_t IV[8]
std::pair< iterator, bool > insert_as(std::pair< KeyT, ValueT > &&KV, const LookupKeyT &Val)
Alternate version of insert() which allows a different, and possibly less expensive,...
Implements a dense probed hash-table based set.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be schedule.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
unsigned PreEmitNoopsCommon(MachineInstr *)
bool ShouldPreferAnother(SUnit *SU) override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
GCNHazardRecognizer(const MachineFunction &MF)
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
Wrapper class representing physical registers. Should be passed by value.
Instructions::const_reverse_iterator const_reverse_instr_iterator
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isMAI(const MCInstrDesc &Desc)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
unsigned getOccupancy() const
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned getMaxLookAhead() const
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A SetVector that performs no allocations if smaller than a certain size.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
std::pair< iterator, bool > insert(const ValueT &V)
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
NodeAddr< UseNode * > Use
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
FunctionAddr VTableAddr Value
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
FunctionAddr VTableAddr Count
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
DWARFExpression::Operation Op
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
An information struct used to provide DenseMap with the various necessary components for a given valu...