50#define DEBUG_TYPE "si-insert-waitcnts"
54 cl::desc(
"Force all waitcnt instrs to be emitted as "
55 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
59 "amdgpu-waitcnt-load-forcezero",
60 cl::desc(
"Force all waitcnt load counters to wait until 0"),
64 "amdgpu-expert-scheduling-mode",
65 cl::desc(
"Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
70template <
typename EmitWaitcntFn>
71static void EmitExpandedWaitcnt(
unsigned Outstanding,
unsigned Target,
72 EmitWaitcntFn &&EmitWaitcnt) {
74 for (
unsigned I = Outstanding - 1;
I >
Target &&
I != ~0u; --
I)
94 TRACKINGID_RANGE_LEN = (1 << 16),
99 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
104 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
105 LDSDMA_BEGIN = REGUNITS_END,
106 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
/// Converts a register unit into a VMEMID by casting it to unsigned.
/// Declared constexpr, so the conversion can be evaluated at compile time.
110static constexpr VMEMID toVMEMID(MCRegUnit RU) {
111 return static_cast<unsigned>(RU);
138 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,
139 AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
140 AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
141 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
142 AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT};
147 switch (
MI.getOpcode()) {
148 case AMDGPU::ASYNCMARK:
149 case AMDGPU::WAIT_ASYNCMARK:
152 return MI.isMetaInstruction();
168 assert(updateVMCntOnly(Inst));
170 return VMEM_NOSAMPLER;
184 return VMEM_NOSAMPLER;
187class WaitcntBrackets;
195class WaitcntGenerator {
197 const GCNSubtarget &ST;
198 const SIInstrInfo &TII;
199 AMDGPU::IsaVersion IV;
202 bool ExpandWaitcntProfiling =
false;
203 const AMDGPU::HardwareLimits &Limits;
206 WaitcntGenerator() =
delete;
207 WaitcntGenerator(
const WaitcntGenerator &) =
delete;
208 WaitcntGenerator(
const MachineFunction &MF,
210 const AMDGPU::HardwareLimits &Limits)
211 : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
215 ExpandWaitcntProfiling(
216 MF.
getFunction().hasFnAttribute(
"amdgpu-expand-waitcnt-profiling")),
/// Accessor that returns the generator's OptNone member unchanged.
221 bool isOptNone()
const {
return OptNone; }
237 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
238 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
242 bool promoteSoftWaitCnt(MachineInstr *Waitcnt)
const;
247 virtual bool createNewWaitcnt(MachineBasicBlock &
Block,
249 AMDGPU::Waitcnt
Wait,
250 const WaitcntBrackets &ScoreBrackets) = 0;
269 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const = 0;
271 virtual ~WaitcntGenerator() =
default;
274class WaitcntGeneratorPreGFX12 final :
public WaitcntGenerator {
277 HWEventSet({HWEvent::VMEM_ACCESS, HWEvent::VMEM_SAMPLER_READ_ACCESS,
278 HWEvent::VMEM_BVH_READ_ACCESS}),
279 HWEventSet({HWEvent::SMEM_ACCESS, HWEvent::LDS_ACCESS,
280 HWEvent::GDS_ACCESS, HWEvent::SQ_MESSAGE}),
281 HWEventSet({HWEvent::EXP_GPR_LOCK, HWEvent::GDS_GPR_LOCK,
282 HWEvent::VMW_GPR_LOCK, HWEvent::EXP_PARAM_ACCESS,
283 HWEvent::EXP_POS_ACCESS, HWEvent::EXP_LDS_ACCESS}),
285 {HWEvent::VMEM_WRITE_ACCESS, HWEvent::SCRATCH_WRITE_ACCESS}),
296 using WaitcntGenerator::WaitcntGenerator;
298 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
299 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
302 bool createNewWaitcnt(MachineBasicBlock &
Block,
304 AMDGPU::Waitcnt
Wait,
305 const WaitcntBrackets &ScoreBrackets)
override;
308 return WaitEventMaskForInstPreGFX12[
T];
311 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
314class WaitcntGeneratorGFX12Plus final :
public WaitcntGenerator {
319 HWEventSet({HWEvent::VMEM_ACCESS, HWEvent::GLOBAL_INV_ACCESS}),
320 HWEventSet({HWEvent::LDS_ACCESS, HWEvent::GDS_ACCESS}),
321 HWEventSet({HWEvent::EXP_GPR_LOCK, HWEvent::GDS_GPR_LOCK,
322 HWEvent::VMW_GPR_LOCK, HWEvent::EXP_PARAM_ACCESS,
323 HWEvent::EXP_POS_ACCESS, HWEvent::EXP_LDS_ACCESS}),
325 {HWEvent::VMEM_WRITE_ACCESS, HWEvent::SCRATCH_WRITE_ACCESS}),
326 HWEventSet({HWEvent::VMEM_SAMPLER_READ_ACCESS}),
329 {HWEvent::SMEM_ACCESS, HWEvent::SQ_MESSAGE, HWEvent::SCC_WRITE}),
330 HWEventSet({HWEvent::VMEM_GROUP, HWEvent::SMEM_GROUP}),
333 HWEventSet({HWEvent::VGPR_CSMACC_WRITE, HWEvent::VGPR_DPMACC_WRITE,
334 HWEvent::VGPR_TRANS_WRITE, HWEvent::VGPR_XDL_WRITE}),
335 HWEventSet({HWEvent::VGPR_LDS_READ, HWEvent::VGPR_FLAT_READ,
336 HWEvent::VGPR_VMEM_READ})};
339 WaitcntGeneratorGFX12Plus() =
delete;
340 WaitcntGeneratorGFX12Plus(
const MachineFunction &MF,
342 const AMDGPU::HardwareLimits &Limits,
344 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
347 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
348 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
351 bool createNewWaitcnt(MachineBasicBlock &
Block,
353 AMDGPU::Waitcnt
Wait,
354 const WaitcntBrackets &ScoreBrackets)
override;
357 return WaitEventMaskForInstGFX12Plus[
T];
360 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
/// Pair of flags returned by SIInsertWaitcnts::getPreheaderFlushFlags() and
/// isPreheaderToFlush(), and passed by value to generateWaitcntInstBefore().
/// Both flags default to false.
364struct PreheaderFlushFlags {
// NOTE(review): presumably requests a vmcnt/loadcnt flush in the loop
// preheader -- confirm against getPreheaderFlushFlags().
365 bool FlushVmCnt =
false;
// NOTE(review): presumably the DS-counter counterpart of FlushVmCnt -- confirm.
366 bool FlushDsCnt =
false;
369class SIInsertWaitcnts {
370 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
371 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
372 MachineLoopInfo &MLI;
373 MachinePostDominatorTree &PDT;
378 std::unique_ptr<WaitcntBrackets> Incoming;
380 BlockInfo() =
default;
381 BlockInfo(BlockInfo &&) =
default;
382 BlockInfo &operator=(BlockInfo &&) =
default;
386 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
390 std::unique_ptr<WaitcntGenerator> WCG;
393 DenseSet<MachineInstr *> CallInsts;
394 DenseSet<MachineInstr *> ReturnInsts;
399 DenseMap<MachineInstr *, bool> EndPgmInsts;
401 AMDGPU::HardwareLimits Limits;
404 const GCNSubtarget &ST;
405 const SIInstrInfo &TII;
406 const SIRegisterInfo &TRI;
407 const MachineRegisterInfo &MRI;
410 bool IsExpertMode =
false;
412 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
414 : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
415 TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
416 MRI(MF.getRegInfo()) {}
/// Read-only access to the pass's Limits member. WaitcntBrackets reaches the
/// limits through its Context pointer (Context->getLimits().get(T)).
418 const AMDGPU::HardwareLimits &getLimits()
const {
return Limits; }
420 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *
ML,
421 const WaitcntBrackets &Brackets);
422 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &
MBB,
423 const WaitcntBrackets &ScoreBrackets);
424 bool isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const;
425 bool isDSRead(
const MachineInstr &
MI)
const;
426 bool mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const;
429 bool isAsync(
const MachineInstr &
MI)
const {
434 const MachineOperand *
Async =
435 TII.getNamedOperand(
MI, AMDGPU::OpName::IsAsync);
439 bool isNonAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
443 bool isAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
447 bool shouldUpdateAsyncMark(
const MachineInstr &
MI,
451 if (!isAsyncLdsDmaWrite(
MI))
458 bool isVmemAccess(
const MachineInstr &
MI)
const;
459 bool generateWaitcntInstBefore(MachineInstr &
MI,
460 WaitcntBrackets &ScoreBrackets,
461 MachineInstr *OldWaitcntInstr,
462 PreheaderFlushFlags FlushFlags);
463 bool generateWaitcnt(AMDGPU::Waitcnt
Wait,
465 MachineBasicBlock &
Block, WaitcntBrackets &ScoreBrackets,
466 MachineInstr *OldWaitcntInstr);
467 void updateEventWaitcntAfter(MachineInstr &Inst,
468 WaitcntBrackets *ScoreBrackets);
470 MachineBasicBlock *
Block)
const;
471 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &
Block,
472 WaitcntBrackets &ScoreBrackets);
473 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &
Block,
474 WaitcntBrackets &ScoreBrackets);
477 bool removeRedundantSoftXcnts(MachineBasicBlock &
Block);
479 bool ExpertMode)
const;
481 return WCG->getWaitEvents(
T);
484 return WCG->getCounterFromEvent(
E);
496class WaitcntBrackets {
498 WaitcntBrackets(
const SIInsertWaitcnts *Context) : Context(Context) {
499 assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
504 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
505 for (
auto &[
ID, Val] : VMem) {
509 for (
auto &[
ID, Val] : SGPRs) {
514 if (NumUnusedVmem || NumUnusedSGPRs) {
515 errs() <<
"WaitcntBracket had unused entries at destruction time: "
516 << NumUnusedVmem <<
" VMem and " << NumUnusedSGPRs
517 <<
" SGPR unused entries\n";
528 return ScoreUBs[
T] - ScoreLBs[
T];
532 return getVMemScore(
ID,
T) > getScoreLB(
T);
550 return getScoreUB(
T) - getScoreLB(
T);
554 auto It = SGPRs.find(RU);
555 return It != SGPRs.end() ? It->second.get(
T) : 0;
559 auto It = VMem.find(TID);
560 return It != VMem.end() ? It->second.Scores[
T] : 0;
567 void simplifyWaitcnt(AMDGPU::Waitcnt &
Wait)
const {
570 void simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
571 AMDGPU::Waitcnt &UpdateWait)
const;
574 void simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
575 AMDGPU::Waitcnt &UpdateWait)
const;
576 void simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
577 AMDGPU::Waitcnt &UpdateWait)
const;
580 AMDGPU::Waitcnt &
Wait,
581 const MachineInstr &
MI)
const;
582 MCPhysReg determineVGPR16Dependency(
const MachineInstr &
MI,
586 AMDGPU::Waitcnt &
Wait)
const;
587 AMDGPU::Waitcnt determineAsyncWait(
unsigned N);
588 void tryClearSCCWriteEvent(MachineInstr *Inst);
590 void applyWaitcnt(
const AMDGPU::Waitcnt &
Wait);
593 void updateByEvent(
HWEvent E, MachineInstr &
MI);
594 void recordAsyncMark(MachineInstr &
MI);
/// Returns true when PendingEvents holds at least one event.
596 bool hasPendingEvent()
const {
return !PendingEvents.empty(); }
/// Returns true when the specific event \p E is present in PendingEvents.
597 bool hasPendingEvent(
HWEvent E)
const {
return PendingEvents.contains(
E); }
599 bool HasPending = PendingEvents & Context->getWaitEvents(
T);
601 "Expected pending events iff scoreboard is not empty");
606 HWEventSet Events = PendingEvents & Context->getWaitEvents(
T);
611 bool hasPendingFlat()
const {
618 void setPendingFlat() {
623 bool hasPendingGDS()
const {
628 unsigned getPendingGDSWait()
const {
637 bool hasOtherPendingVmemTypes(
MCPhysReg Reg, VmemType V)
const {
638 for (MCRegUnit RU : regunits(
Reg)) {
639 auto It = VMem.find(toVMEMID(RU));
640 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
647 for (MCRegUnit RU : regunits(
Reg)) {
648 if (
auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
649 It->second.VMEMTypes = 0;
650 if (It->second.empty())
656 void setStateOnFunctionEntryOrReturn() {
662 ArrayRef<const MachineInstr *> getLDSDMAStores()
const {
666 bool hasPointSampleAccel(
const MachineInstr &
MI)
const;
667 bool hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
670 void print(raw_ostream &)
const;
675 void purgeEmptyTrackingData();
679 return Context->getLimits().get(
T);
689 using CounterValueArray = std::array<unsigned, AMDGPU::NUM_INST_CNTS>;
692 AMDGPU::Waitcnt &
Wait)
const;
694 static bool mergeScore(
const MergeInfo &M,
unsigned &Score,
695 unsigned OtherScore);
700 assert(
Reg != AMDGPU::SCC &&
"Shouldn't be used on SCC");
701 if (!Context->TRI.isInAllocatableClass(
Reg))
703 return Context->TRI.regunits(
Reg);
724 const SIRegisterInfo &
TRI = Context->TRI;
725 if (
Reg == AMDGPU::SCC) {
727 }
else if (
TRI.isVectorRegister(Context->MRI,
Reg)) {
728 for (MCRegUnit RU : regunits(
Reg))
729 VMem[toVMEMID(RU)].Scores[
T] = Val;
730 }
else if (
TRI.isSGPRReg(Context->MRI,
Reg)) {
731 for (MCRegUnit RU : regunits(
Reg))
732 SGPRs[RU].get(
T) = Val;
739 VMem[TID].Scores[
T] = Val;
742 void setScoreByOperand(
const MachineOperand &
Op,
745 const SIInsertWaitcnts *Context;
751 unsigned LastFlatDsCnt = 0;
752 unsigned LastFlatLoadCnt = 0;
754 unsigned LastGDS = 0;
771 CounterValueArray Scores{};
773 unsigned VMEMTypes = 0;
782 unsigned ScoreDsKmCnt = 0;
783 unsigned ScoreXCnt = 0;
/// Returns true when both ScoreDsKmCnt and ScoreXCnt are zero, i.e. this
/// record carries no score. purgeEmptyTrackingData() uses the empty()
/// predicate of map values to decide which entries to drop.
// NOTE(review): the enclosing type's header is not visible here; presumably
// this is the SGPRInfo record stored in the SGPRs map -- confirm.
799 bool empty()
const {
return !ScoreDsKmCnt && !ScoreXCnt; }
802 DenseMap<VMEMID, VMEMInfo> VMem;
803 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
806 unsigned SCCScore = 0;
808 const MachineInstr *PendingSCCWrite =
nullptr;
812 SmallVector<const MachineInstr *> LDSDMAStores;
821 static constexpr unsigned MaxAsyncMarks = 16;
825 CounterValueArray AsyncScore{};
// Out-of-line defaulted destructor for BlockInfo.
// NOTE(review): presumably defined here, after the WaitcntBrackets class
// body, because BlockInfo owns a std::unique_ptr<WaitcntBrackets> and
// WaitcntBrackets is only forward-declared where BlockInfo is declared --
// confirm before moving this definition.
828SIInsertWaitcnts::BlockInfo::~BlockInfo() =
default;
/// Default constructor: forwards the pass's static ID member to the
/// MachineFunctionPass base class and does nothing else.
833 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
835 bool runOnMachineFunction(MachineFunction &MF)
override;
837 StringRef getPassName()
const override {
838 return "SI insert wait instructions";
841 void getAnalysisUsage(AnalysisUsage &AU)
const override {
844 AU.
addRequired<MachinePostDominatorTreeWrapperPass>();
856 setRegScore(
Op.getReg().asMCReg(), CntTy, Score);
864bool WaitcntBrackets::hasPointSampleAccel(
const MachineInstr &
MI)
const {
869 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
879bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
881 if (!hasPointSampleAccel(
MI))
884 return hasOtherPendingVmemTypes(
Reg, VMEM_NOSAMPLER);
887void WaitcntBrackets::updateByEvent(
HWEvent E, MachineInstr &Inst) {
891 unsigned UB = getScoreUB(
T);
894 Context->ST.hasVOP3PX2IncrementsVaVdstTwice()) {
906 setScoreUB(
T, CurrScore);
909 const MachineRegisterInfo &MRI =
Context->MRI;
918 if (
const auto *AddrOp =
TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
922 if (
const auto *Data0 =
923 TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
925 if (
const auto *Data1 =
926 TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
930 Inst.
getOpcode() != AMDGPU::DS_CONSUME &&
931 Inst.
getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
932 for (
const MachineOperand &
Op : Inst.
all_uses()) {
933 if (
TRI.isVectorRegister(MRI,
Op.getReg()))
937 }
else if (
TII.isFLAT(Inst)) {
939 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
942 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
945 }
else if (
TII.isMIMG(Inst)) {
949 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
952 }
else if (
TII.isMTBUF(Inst)) {
955 }
else if (
TII.isMUBUF(Inst)) {
959 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
962 }
else if (
TII.isLDSDIR(Inst)) {
964 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
967 if (
TII.isEXP(Inst)) {
972 for (MachineOperand &DefMO : Inst.
all_defs()) {
973 if (
TRI.isVGPR(MRI, DefMO.getReg())) {
978 for (
const MachineOperand &
Op : Inst.
all_uses()) {
979 if (
TRI.isVectorRegister(MRI,
Op.getReg()))
985 E == HWEvent::SMEM_GROUP ? HWEvent::VMEM_GROUP : HWEvent::SMEM_GROUP;
986 if (PendingEvents.
contains(OtherEvent)) {
991 setScoreLB(
T, getScoreUB(
T) - 1);
992 PendingEvents.
remove(OtherEvent);
994 for (
const MachineOperand &
Op : Inst.
all_uses())
995 setScoreByOperand(
Op,
T, CurrScore);
999 for (
const MachineOperand &
Op : Inst.
operands()) {
1004 setScoreByOperand(
Op,
T, CurrScore);
1016 for (
const MachineOperand &
Op : Inst.
defs()) {
1019 if (!
TRI.isVectorRegister(MRI,
Op.getReg()))
1021 if (updateVMCntOnly(Inst)) {
1026 VmemType
V = getVmemType(Inst);
1027 unsigned char TypesMask = 1 <<
V;
1030 if (hasPointSampleAccel(Inst))
1031 TypesMask |= 1 << VMEM_NOSAMPLER;
1032 for (MCRegUnit RU : regunits(
Op.getReg().asMCReg()))
1033 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1036 setScoreByOperand(
Op,
T, CurrScore);
1039 (
TII.isDS(Inst) ||
Context->isNonAsyncLdsDmaWrite(Inst))) {
1048 if (!MemOp->isStore() ||
1053 auto AAI = MemOp->getAAInfo();
1059 if (!AAI || !AAI.Scope)
1061 for (
unsigned I = 0,
E = LDSDMAStores.
size();
I !=
E && !Slot; ++
I) {
1062 for (
const auto *MemOp : LDSDMAStores[
I]->memoperands()) {
1063 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1078 setVMemScore(LDSDMA_BEGIN,
T, CurrScore);
1079 if (Slot && Slot < NUM_LDSDMA)
1080 setVMemScore(LDSDMA_BEGIN + Slot,
T, CurrScore);
1083 if (
Context->shouldUpdateAsyncMark(Inst,
T)) {
1084 AsyncScore[
T] = CurrScore;
1088 setRegScore(AMDGPU::SCC,
T, CurrScore);
1089 PendingSCCWrite = &Inst;
1094void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1100 AsyncMarks.push_back(AsyncScore);
1103 dbgs() <<
"recordAsyncMark:\n" << Inst;
1104 for (
const auto &Mark : AsyncMarks) {
1111void WaitcntBrackets::print(raw_ostream &OS)
const {
1115 unsigned SR = getScoreRange(
T);
1118 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"LOAD" :
"VM") <<
"_CNT("
1122 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"DS" :
"LGKM") <<
"_CNT("
1126 OS <<
" EXP_CNT(" << SR <<
"):";
1129 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"STORE" :
"VS") <<
"_CNT("
1133 OS <<
" SAMPLE_CNT(" << SR <<
"):";
1136 OS <<
" BVH_CNT(" << SR <<
"):";
1139 OS <<
" KM_CNT(" << SR <<
"):";
1142 OS <<
" X_CNT(" << SR <<
"):";
1145 OS <<
" ASYNC_CNT(" << SR <<
"):";
1148 OS <<
" VA_VDST(" << SR <<
"): ";
1151 OS <<
" VM_VSRC(" << SR <<
"): ";
1154 OS <<
" UNKNOWN(" << SR <<
"):";
1160 unsigned LB = getScoreLB(
T);
1163 sort(SortedVMEMIDs);
1165 for (
auto ID : SortedVMEMIDs) {
1166 unsigned RegScore = VMem.at(
ID).Scores[
T];
1169 unsigned RelScore = RegScore - LB - 1;
1170 if (
ID < REGUNITS_END) {
1171 OS <<
' ' << RelScore <<
":vRU" <<
ID;
1173 assert(
ID >= LDSDMA_BEGIN &&
ID < LDSDMA_END &&
1174 "Unhandled/unexpected ID value!");
1175 OS <<
' ' << RelScore <<
":LDSDMA" <<
ID;
1180 if (isSmemCounter(
T)) {
1182 sort(SortedSMEMIDs);
1183 for (
auto ID : SortedSMEMIDs) {
1184 unsigned RegScore = SGPRs.at(
ID).get(
T);
1187 unsigned RelScore = RegScore - LB - 1;
1188 OS <<
' ' << RelScore <<
":sRU" <<
static_cast<unsigned>(
ID);
1193 OS <<
' ' << SCCScore <<
":scc";
1198 OS <<
"Pending Events: ";
1199 if (hasPendingEvent()) {
1202 if (hasPendingEvent(
E)) {
1211 OS <<
"Async score: ";
1212 if (AsyncScore.empty())
1218 OS <<
"Async marks: " << AsyncMarks.size() <<
'\n';
1220 for (
const auto &Mark : AsyncMarks) {
1222 unsigned MarkedScore = Mark[
T];
1225 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"LOAD" :
"VM")
1226 <<
"_CNT: " << MarkedScore;
1229 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"DS" :
"LGKM")
1230 <<
"_CNT: " << MarkedScore;
1233 OS <<
" EXP_CNT: " << MarkedScore;
1236 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"STORE" :
"VS")
1237 <<
"_CNT: " << MarkedScore;
1240 OS <<
" SAMPLE_CNT: " << MarkedScore;
1243 OS <<
" BVH_CNT: " << MarkedScore;
1246 OS <<
" KM_CNT: " << MarkedScore;
1249 OS <<
" X_CNT: " << MarkedScore;
1252 OS <<
" ASYNC_CNT: " << MarkedScore;
1255 OS <<
" UNKNOWN: " << MarkedScore;
1266void WaitcntBrackets::simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
1267 AMDGPU::Waitcnt &UpdateWait)
const {
1275 simplifyXcnt(CheckWait, UpdateWait);
1277 simplifyVmVsrc(CheckWait, UpdateWait);
1282 unsigned &
Count)
const {
1286 if (
Count >= getScoreRange(
T))
1290void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &
Wait,
1292 unsigned Cnt =
Wait.get(
T);
1293 simplifyWaitcnt(
T, Cnt);
1297void WaitcntBrackets::simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
1298 AMDGPU::Waitcnt &UpdateWait)
const {
1308 hasPendingEvent(HWEvent::SMEM_GROUP))
1314 hasPendingEvent(HWEvent::VMEM_GROUP) &&
1321void WaitcntBrackets::simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
1322 AMDGPU::Waitcnt &UpdateWait)
const {
1327 std::min({CheckWait.get(AMDGPU::LOAD_CNT),
1328 CheckWait.get(AMDGPU::STORE_CNT),
1329 CheckWait.get(AMDGPU::SAMPLE_CNT),
1330 CheckWait.get(AMDGPU::BVH_CNT), CheckWait.get(AMDGPU::DS_CNT)}))
/// Removes every entry from the VMem and SGPRs maps whose mapped value
/// reports empty(). Entries whose value is not empty are left in place.
1335void WaitcntBrackets::purgeEmptyTrackingData() {
// Drop VMem entries whose value's empty() is true.
1336 VMem.remove_if([](
const auto &
P) {
return P.second.empty(); });
// Same predicate applied to the per-register-unit SGPR entries.
1337 SGPRs.remove_if([](
const auto &
P) {
return P.second.empty(); });
1341 unsigned ScoreToWait,
1342 AMDGPU::Waitcnt &
Wait)
const {
1343 const unsigned LB = getScoreLB(
T);
1344 const unsigned UB = getScoreUB(
T);
1347 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1349 !
Context->ST.hasFlatLgkmVMemCountInOrder()) {
1354 }
else if (counterOutOfOrder(
T)) {
1362 unsigned NeededWait = std::min(UB - ScoreToWait, getLimit(
T) - 1);
1363 Wait.add(
T, NeededWait);
1368AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(
unsigned N) {
1370 dbgs() <<
"Need " <<
N <<
" async marks. Found " << AsyncMarks.size()
1372 for (
const auto &Mark : AsyncMarks) {
1378 if (AsyncMarks.size() == MaxAsyncMarks) {
1383 LLVM_DEBUG(
dbgs() <<
"Possible truncation. Ensuring a non-trivial wait.\n");
1384 N = std::min(
N, (
unsigned)MaxAsyncMarks - 1);
1387 AMDGPU::Waitcnt
Wait;
1388 if (AsyncMarks.size() <=
N) {
1393 size_t MarkIndex = AsyncMarks.size() -
N - 1;
1394 const auto &RequiredMark = AsyncMarks[MarkIndex];
1396 determineWaitForScore(
T, RequiredMark[
T],
Wait);
1402 dbgs() <<
"Removing " << (MarkIndex + 1)
1403 <<
" async marks after determining wait\n";
1405 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1418MCPhysReg WaitcntBrackets::determineVGPR16Dependency(
const MachineInstr &
MI,
1421 const TargetRegisterClass *RC =
Context->TRI.getPhysRegBaseClass(
Reg);
1422 unsigned Size =
Context->TRI.getRegSizeInBits(*RC);
1424 if (
Size != 16 || !
Context->ST.hasD16Writes32BitVgpr())
1434 AMDGPU::Waitcnt
Wait;
1435 for (MCRegUnit RU : regunits(OtherHalf))
1436 determineWaitForScore(
T, getVMemScore(toVMEMID(RU),
T),
Wait);
1439 if (!
Wait.hasWait())
1449 HWEventSet Events = MIEvents & OtherHalfEvents;
1457 AMDGPU::Waitcnt &
Wait,
1458 const MachineInstr &
MI)
const {
1459 if (
Reg == AMDGPU::SCC) {
1460 determineWaitForScore(
T, SCCScore,
Wait);
1464 Reg = determineVGPR16Dependency(
MI,
T,
Reg);
1465 for (MCRegUnit RU : regunits(
Reg))
1466 determineWaitForScore(
1467 T, IsVGPR ? getVMemScore(toVMEMID(RU),
T) : getSGPRScore(RU,
T),
1474 AMDGPU::Waitcnt &
Wait)
const {
1475 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1476 determineWaitForScore(
T, getVMemScore(TID,
T),
Wait);
1479void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1482 if (PendingSCCWrite &&
1483 PendingSCCWrite->
getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1485 HWEventSet SCC_WRITE_PendingEvent(HWEvent::SCC_WRITE);
1488 SCC_WRITE_PendingEvent) {
1492 PendingEvents.
remove(SCC_WRITE_PendingEvent);
1493 PendingSCCWrite =
nullptr;
1497void WaitcntBrackets::applyWaitcnt(
const AMDGPU::Waitcnt &
Wait) {
1499 applyWaitcnt(
Wait,
T);
1503 const unsigned UB = getScoreUB(
T);
1507 if (counterOutOfOrder(
T))
1509 setScoreLB(
T, std::max(getScoreLB(
T), UB -
Count));
1516 hasPendingEvent(HWEvent::SMEM_GROUP)) {
1520 PendingEvents.
remove(HWEvent::SMEM_GROUP);
1526 else if (
Count == 0)
1527 PendingEvents.
remove(HWEvent::VMEM_GROUP);
1531void WaitcntBrackets::applyWaitcnt(
const AMDGPU::Waitcnt &
Wait,
1533 unsigned Cnt =
Wait.get(
T);
1534 applyWaitcnt(
T, Cnt);
1541 if ((
T ==
Context->SmemAccessCounter &&
1542 hasPendingEvent(HWEvent::SMEM_ACCESS)) ||
1553 Events.
remove(HWEvent::GLOBAL_INV_ACCESS);
1559 return hasMixedPendingEvents(
T);
// Definition of the legacy pass's static ID member, initialised to 0. The
// SIInsertWaitcntsLegacy constructor passes this object to MachineFunctionPass.
1569char SIInsertWaitcntsLegacy::
ID = 0;
1574 return new SIInsertWaitcntsLegacy();
1579 int OpIdx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
OpName);
1584 if (NewEnc == MO.
getImm())
1591bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt)
const {
1605bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1606 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1608 assert(isNormalMode(MaxCounter));
1611 MachineInstr *WaitcntInstr =
nullptr;
1612 MachineInstr *WaitcntVsCntInstr =
nullptr;
1615 dbgs() <<
"PreGFX12::applyPreexistingWaitcnt at: ";
1617 dbgs() <<
"end of block\n";
1625 if (isNonWaitcntMetaInst(
II)) {
1631 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1635 if (Opcode == AMDGPU::S_WAITCNT) {
1636 unsigned IEnc =
II.getOperand(0).getImm();
1639 ScoreBrackets.simplifyWaitcnt(OldWait);
1643 if (WaitcntInstr || (!
Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1644 II.eraseFromParent();
1648 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1651 <<
"Before: " <<
Wait <<
'\n';);
1662 II.eraseFromParent();
1663 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1664 unsigned N =
II.getOperand(0).getImm();
1666 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(
N);
1669 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1670 assert(
II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1673 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1679 if (WaitcntVsCntInstr || (!
Wait.hasWaitStoreCnt() && TrySimplify)) {
1680 II.eraseFromParent();
1683 WaitcntVsCntInstr = &
II;
1690 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1699 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
1700 <<
"New Instr at block end: "
1701 << *WaitcntInstr <<
'\n'
1702 :
dbgs() <<
"applied pre-existing waitcnt\n"
1703 <<
"Old Instr: " << *It
1704 <<
"New Instr: " << *WaitcntInstr <<
'\n');
1707 if (WaitcntVsCntInstr) {
1711 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1717 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1718 <<
"New Instr at block end: " << *WaitcntVsCntInstr
1720 :
dbgs() <<
"applied pre-existing waitcnt\n"
1721 <<
"Old Instr: " << *It
1722 <<
"New Instr: " << *WaitcntVsCntInstr <<
'\n');
1730bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1732 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
1733 assert(isNormalMode(MaxCounter));
1740 if (
Wait.hasWaitExceptStoreCnt()) {
1742 if (ExpandWaitcntProfiling) {
1746 bool AnyOutOfOrder =
false;
1748 unsigned WaitCnt =
Wait.get(CT);
1749 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
1750 AnyOutOfOrder =
true;
1755 if (AnyOutOfOrder) {
1763 unsigned WaitCnt =
Wait.get(CT);
1767 unsigned Outstanding =
1768 std::min(ScoreBrackets.getOutstanding(CT), getLimit(CT) - 1);
1769 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](
unsigned Count) {
1781 [[maybe_unused]]
auto SWaitInst =
1786 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1787 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1791 if (
Wait.hasWaitStoreCnt()) {
1797 unsigned Outstanding =
1800 EmitExpandedWaitcnt(
1802 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1803 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1808 [[maybe_unused]]
auto SWaitInst =
1810 .
addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1815 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1816 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1824WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1825 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt &&
ST.hasVscnt() ? 0 : ~0u);
1829WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1830 unsigned ExpertVal = IsExpertMode ? 0 : ~0
u;
1831 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1833 ~0u , ExpertVal, ExpertVal);
1840bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1841 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1843 assert(!isNormalMode(MaxCounter));
1846 MachineInstr *CombinedLoadDsCntInstr =
nullptr;
1847 MachineInstr *CombinedStoreDsCntInstr =
nullptr;
1848 MachineInstr *WaitcntDepctrInstr =
nullptr;
1852 dbgs() <<
"GFX12Plus::applyPreexistingWaitcnt at: ";
1854 dbgs() <<
"end of block\n";
1860 AMDGPU::Waitcnt RequiredWait;
1865 if (isNonWaitcntMetaInst(
II)) {
1874 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1878 if (Opcode == AMDGPU::S_WAITCNT)
1881 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1883 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1888 RequiredWait = RequiredWait.combined(OldWait);
1890 if (CombinedLoadDsCntInstr ==
nullptr) {
1891 CombinedLoadDsCntInstr = &
II;
1893 II.eraseFromParent();
1896 }
else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1898 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1903 RequiredWait = RequiredWait.combined(OldWait);
1905 if (CombinedStoreDsCntInstr ==
nullptr) {
1906 CombinedStoreDsCntInstr = &
II;
1908 II.eraseFromParent();
1911 }
else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1913 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1914 AMDGPU::Waitcnt OldWait;
1918 ScoreBrackets.simplifyWaitcnt(OldWait);
1920 if (WaitcntDepctrInstr ==
nullptr) {
1921 WaitcntDepctrInstr = &
II;
1930 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1938 II.eraseFromParent();
1942 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1945 II.eraseFromParent();
1947 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1950 unsigned N =
II.getOperand(0).getImm();
1951 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(
N);
1954 std::optional<AMDGPU::InstCounterType> CT =
1958 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1960 Wait.add(CT.value(), OldCnt);
1962 RequiredWait.add(CT.value(), OldCnt);
1964 if (WaitInstrs[CT.value()] ==
nullptr) {
1965 WaitInstrs[CT.value()] = &
II;
1967 II.eraseFromParent();
1973 ScoreBrackets.simplifyWaitcnt(
Wait.combined(RequiredWait),
Wait);
1974 Wait =
Wait.combined(RequiredWait);
1976 if (CombinedLoadDsCntInstr) {
1992 AMDGPU::OpName::simm16, NewEnc);
1993 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1999 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2000 <<
"New Instr at block end: "
2001 << *CombinedLoadDsCntInstr <<
'\n'
2002 :
dbgs() <<
"applied pre-existing waitcnt\n"
2003 <<
"Old Instr: " << *It <<
"New Instr: "
2004 << *CombinedLoadDsCntInstr <<
'\n');
2011 if (CombinedStoreDsCntInstr) {
2016 AMDGPU::OpName::simm16, NewEnc);
2017 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2023 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2024 <<
"New Instr at block end: "
2025 << *CombinedStoreDsCntInstr <<
'\n'
2026 :
dbgs() <<
"applied pre-existing waitcnt\n"
2027 <<
"Old Instr: " << *It <<
"New Instr: "
2028 << *CombinedStoreDsCntInstr <<
'\n');
2058 for (MachineInstr **WI : WaitsToErase) {
2062 (*WI)->eraseFromParent();
2069 if (!WaitInstrs[CT])
2072 unsigned NewCnt =
Wait.get(CT);
2073 if (NewCnt != ~0u) {
2075 AMDGPU::OpName::simm16, NewCnt);
2076 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2078 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2082 ?
dbgs() <<
"applied pre-existing waitcnt\n"
2083 <<
"New Instr at block end: " << *WaitInstrs[CT]
2085 :
dbgs() <<
"applied pre-existing waitcnt\n"
2086 <<
"Old Instr: " << *It
2087 <<
"New Instr: " << *WaitInstrs[CT] <<
'\n');
2094 if (WaitcntDepctrInstr) {
2098 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2113 AMDGPU::OpName::simm16, Enc);
2115 <<
"New Instr at block end: "
2116 << *WaitcntDepctrInstr <<
'\n'
2117 :
dbgs() <<
"applyPreexistingWaitcnt\n"
2118 <<
"Old Instr: " << *It <<
"New Instr: "
2119 << *WaitcntDepctrInstr <<
'\n');
2130bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2132 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
2133 assert(!isNormalMode(MaxCounter));
2140 if (ExpandWaitcntProfiling) {
2147 if (ScoreBrackets.counterOutOfOrder(CT)) {
2154 unsigned Outstanding =
2155 std::min(ScoreBrackets.getOutstanding(CT), getLimit(CT) - 1);
2156 EmitExpandedWaitcnt(Outstanding,
Count, [&](
unsigned Val) {
2168 MachineInstr *SWaitInst =
nullptr;
2192 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2193 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2205 [[maybe_unused]]
auto SWaitInst =
2212 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2213 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2216 if (
Wait.hasWaitDepctr()) {
2222 [[maybe_unused]]
auto SWaitInst =
2228 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2229 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2248bool SIInsertWaitcnts::generateWaitcntInstBefore(
2249 MachineInstr &
MI, WaitcntBrackets &ScoreBrackets,
2250 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2255 AMDGPU::Waitcnt
Wait;
2256 const unsigned Opc =
MI.getOpcode();
2259 case AMDGPU::BUFFER_WBINVL1:
2260 case AMDGPU::BUFFER_WBINVL1_SC:
2261 case AMDGPU::BUFFER_WBINVL1_VOL:
2262 case AMDGPU::BUFFER_GL0_INV:
2263 case AMDGPU::BUFFER_GL1_INV: {
2271 case AMDGPU::SI_RETURN_TO_EPILOG:
2272 case AMDGPU::SI_RETURN:
2273 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2274 case AMDGPU::S_SETPC_B64_return: {
2279 AMDGPU::Waitcnt AllZeroWait =
2280 WCG->getAllZeroWaitcnt(
false);
2285 if (
ST.hasExtendedWaitCounts() &&
2286 !ScoreBrackets.hasPendingEvent(HWEvent::VMEM_ACCESS))
2291 case AMDGPU::S_ENDPGM:
2292 case AMDGPU::S_ENDPGM_SAVED: {
2303 !ScoreBrackets.hasPendingEvent(HWEvent::SCRATCH_WRITE_ACCESS);
2306 case AMDGPU::S_SENDMSG:
2307 case AMDGPU::S_SENDMSGHALT: {
2308 if (
ST.hasLegacyGeometry() &&
2323 if (
MI.modifiesRegister(AMDGPU::EXEC, &
TRI)) {
2326 if (ScoreBrackets.hasPendingEvent(HWEvent::EXP_GPR_LOCK) ||
2327 ScoreBrackets.hasPendingEvent(HWEvent::EXP_PARAM_ACCESS) ||
2328 ScoreBrackets.hasPendingEvent(HWEvent::EXP_POS_ACCESS) ||
2329 ScoreBrackets.hasPendingEvent(HWEvent::GDS_GPR_LOCK)) {
2336 if (
TII.isAlwaysGDS(
Opc) && ScoreBrackets.hasPendingGDS())
2344 Wait = AMDGPU::Waitcnt();
2346 const MachineOperand &CallAddrOp =
TII.getCalleeOperand(
MI);
2347 if (CallAddrOp.
isReg()) {
2348 ScoreBrackets.determineWaitForPhysReg(
2351 if (
const auto *RtnAddrOp =
2352 TII.getNamedOperand(
MI, AMDGPU::OpName::dst)) {
2353 ScoreBrackets.determineWaitForPhysReg(
2354 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(),
Wait,
MI);
2357 }
else if (
Opc == AMDGPU::S_BARRIER_WAIT) {
2358 ScoreBrackets.tryClearSCCWriteEvent(&
MI);
2374 for (
const MachineMemOperand *Memop :
MI.memoperands()) {
2375 const Value *Ptr = Memop->getValue();
2376 if (Memop->isStore()) {
2377 if (
auto It = SLoadAddresses.
find(Ptr); It != SLoadAddresses.
end()) {
2378 Wait.add(SmemAccessCounter, 0);
2380 SLoadAddresses.
erase(It);
2383 unsigned AS = Memop->getAddrSpace();
2387 if (
TII.mayWriteLDSThroughDMA(
MI))
2391 unsigned TID = LDSDMA_BEGIN;
2392 if (Ptr && Memop->getAAInfo()) {
2393 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2394 for (
unsigned I = 0,
E = LDSDMAStores.size();
I !=
E; ++
I) {
2395 if (
MI.mayAlias(AA, *LDSDMAStores[
I],
true)) {
2396 if ((
I + 1) >= NUM_LDSDMA) {
2411 if (Memop->isStore()) {
2417 for (
const MachineOperand &
Op :
MI.operands()) {
2422 if (
Op.isTied() &&
Op.isUse() &&
TII.doesNotReadTiedSource(
MI))
2427 const bool IsVGPR =
TRI.isVectorRegister(MRI,
Op.getReg());
2434 if (
Op.isImplicit() &&
MI.mayLoadOrStore())
2447 if (
Op.isUse() || !updateVMCntOnly(
MI) ||
2448 ScoreBrackets.hasOtherPendingVmemTypes(
Reg, getVmemType(
MI)) ||
2449 ScoreBrackets.hasPointSamplePendingVmemTypes(
MI,
Reg) ||
2450 !
ST.hasVmemWriteVgprInOrder()) {
2457 ScoreBrackets.clearVgprVmemTypes(
Reg);
2461 ScoreBrackets.hasPendingEvent(HWEvent::EXP_LDS_ACCESS)) {
2466 }
else if (
Op.getReg() == AMDGPU::SCC) {
2469 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter,
Reg,
Wait,
2473 if (
ST.hasWaitXcnt() &&
Op.isDef())
2492 if (
Opc == AMDGPU::S_BARRIER && !
ST.hasAutoWaitcntBeforeBarrier() &&
2493 !
ST.hasBackOffBarrier()) {
2494 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
true));
2501 ScoreBrackets.hasPendingEvent(HWEvent::SMEM_ACCESS)) {
2506 ScoreBrackets.simplifyWaitcnt(
Wait);
2526 Wait = WCG->getAllZeroWaitcnt(
false);
2530 if (!ForceEmitWaitcnt[
T])
2535 if (FlushFlags.FlushVmCnt) {
2541 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
AMDGPU::DS_CNT))
2547 return generateWaitcnt(
Wait,
MI.getIterator(), *
MI.getParent(), ScoreBrackets,
2551bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt
Wait,
2553 MachineBasicBlock &
Block,
2554 WaitcntBrackets &ScoreBrackets,
2555 MachineInstr *OldWaitcntInstr) {
2558 if (OldWaitcntInstr)
2562 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
Wait, It);
2567 MachineOperand *WaitExp =
TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
2577 <<
"Update Instr: " << *It);
2580 if (WCG->createNewWaitcnt(
Block, It,
Wait, ScoreBrackets))
2585 ScoreBrackets.applyWaitcnt(
Wait);
2590bool SIInsertWaitcnts::isVmemAccess(
const MachineInstr &
MI)
const {
2591 return (
TII.isFLAT(
MI) &&
TII.mayAccessVMEMThroughFlat(
MI)) ||
2598 MachineBasicBlock *
Block)
const {
2599 auto BlockEnd =
Block->getParent()->end();
2600 auto BlockIter =
Block->getIterator();
2604 if (++BlockIter != BlockEnd) {
2605 It = BlockIter->instr_begin();
2612 if (!It->isMetaInstruction())
2620 return It->getOpcode() == AMDGPU::S_ENDPGM;
2624bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2625 MachineBasicBlock &
Block,
2626 WaitcntBrackets &ScoreBrackets) {
2627 AMDGPU::Waitcnt
Wait;
2628 bool NeedsEndPGMCheck =
false;
2636 NeedsEndPGMCheck =
true;
2639 ScoreBrackets.simplifyWaitcnt(
Wait);
2642 bool Result = generateWaitcnt(
Wait, SuccessorIt,
Block, ScoreBrackets,
2645 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &
Block)) {
2653void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2654 WaitcntBrackets *ScoreBrackets) {
2659 ScoreBrackets->updateByEvent(
E, Inst);
2662 if (
TII.isDS(Inst) &&
TII.usesLGKM_CNT(Inst)) {
2664 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2665 ScoreBrackets->setPendingGDS();
2667 }
else if (
TII.isFLAT(Inst)) {
2675 ScoreBrackets->setPendingFlat();
2678 ScoreBrackets->updateByEvent(HWEvent::ASYNC_ACCESS, Inst);
2681 ScoreBrackets->updateByEvent(HWEvent::TENSOR_ACCESS, Inst);
2682 }
else if (Inst.
isCall()) {
2685 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(
false));
2686 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2687 }
else if (
TII.isVINTERP(Inst)) {
2688 int64_t
Imm =
TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
// Merges one incoming score (OtherScore) into Score, using the lower bounds
// and shift amounts carried in M.
//
// \param M          Supplies OldLB/MyShift for this side and
//                   OtherLB/OtherShift for the incoming side.
// \param Score      In/out: this side's score; overwritten with the merged
//                   value.
// \param OtherScore The incoming side's score for the same entry.
// \returns true when the shifted incoming score is strictly greater than the
//          shifted local score.
2698bool WaitcntBrackets::mergeScore(
const MergeInfo &M,
unsigned &Score,
2699 unsigned OtherScore) {
// A score at or below its side's lower bound maps to 0; any other score is
// offset by that side's shift amount.
2700 unsigned MyShifted = Score <=
M.OldLB ? 0 : Score +
M.MyShift;
// Same mapping for the incoming score, using the incoming side's bound/shift.
2701 unsigned OtherShifted =
2702 OtherScore <=
M.OtherLB ? 0 : OtherScore +
M.OtherShift;
// The merged score is the larger of the two shifted values.
2703 Score = std::max(MyShifted, OtherShifted);
// Report whether the incoming side raised the score.
2704 return OtherShifted > MyShifted;
2709 bool StrictDom =
false;
2713 if (AsyncMarks.empty() && OtherMarks.
empty()) {
2720 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.
size());
2721 MaxSize = std::min(MaxSize, MaxAsyncMarks);
2724 if (AsyncMarks.size() > MaxSize)
2725 AsyncMarks.erase(AsyncMarks.begin(),
2726 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
2732 constexpr CounterValueArray ZeroMark{};
2733 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
2736 dbgs() <<
"Before merge:\n";
2737 for (
const auto &Mark : AsyncMarks) {
2741 dbgs() <<
"Other marks:\n";
2742 for (
const auto &Mark : OtherMarks) {
2751 unsigned OtherSize = OtherMarks.size();
2752 unsigned OurSize = AsyncMarks.size();
2753 unsigned MergeCount = std::min(OtherSize, OurSize);
2757 if (MergeCount == 0)
2761 StrictDom |= mergeScore(MergeInfos[
T], AsyncMarks[OurSize - Idx][
T],
2762 OtherMarks[OtherSize - Idx][
T]);
2767 dbgs() <<
"After merge:\n";
2768 for (
const auto &Mark : AsyncMarks) {
2782bool WaitcntBrackets::merge(
const WaitcntBrackets &
Other) {
2783 bool StrictDom =
false;
2787 for (
auto K :
Other.VMem.keys())
2788 VMem.try_emplace(K);
2789 for (
auto K :
Other.SGPRs.keys())
2790 SGPRs.try_emplace(K);
2798 const HWEventSet OldEvents = PendingEvents & EventsForT;
2800 if (!OldEvents.
contains(OtherEvents))
2802 PendingEvents |= OtherEvents;
2805 const unsigned MyPending = ScoreUBs[
T] - ScoreLBs[
T];
2806 const unsigned OtherPending =
Other.ScoreUBs[
T] -
Other.ScoreLBs[
T];
2807 const unsigned NewUB = ScoreLBs[
T] + std::max(MyPending, OtherPending);
2808 if (NewUB < ScoreLBs[
T])
2811 MergeInfo &
M = MergeInfos[
T];
2812 M.OldLB = ScoreLBs[
T];
2813 M.OtherLB =
Other.ScoreLBs[
T];
2814 M.MyShift = NewUB - ScoreUBs[
T];
2815 M.OtherShift = NewUB -
Other.ScoreUBs[
T];
2817 ScoreUBs[
T] = NewUB;
2820 StrictDom |= mergeScore(M, LastFlatLoadCnt,
Other.LastFlatLoadCnt);
2823 StrictDom |= mergeScore(M, LastFlatDsCnt,
Other.LastFlatDsCnt);
2824 StrictDom |= mergeScore(M, LastGDS,
Other.LastGDS);
2828 StrictDom |= mergeScore(M, SCCScore,
Other.SCCScore);
2829 if (
Other.hasPendingEvent(HWEvent::SCC_WRITE)) {
2830 if (!OldEvents.
contains(HWEvent::SCC_WRITE)) {
2831 PendingSCCWrite =
Other.PendingSCCWrite;
2832 }
else if (PendingSCCWrite !=
Other.PendingSCCWrite) {
2833 PendingSCCWrite =
nullptr;
2838 for (
auto &[RegID, Info] : VMem)
2839 StrictDom |= mergeScore(M,
Info.Scores[
T],
Other.getVMemScore(RegID,
T));
2841 if (isSmemCounter(
T)) {
2842 for (
auto &[RegID, Info] : SGPRs) {
2843 auto It =
Other.SGPRs.find(RegID);
2844 unsigned OtherScore = (It !=
Other.SGPRs.end()) ? It->second.get(
T) : 0;
2845 StrictDom |= mergeScore(M,
Info.get(
T), OtherScore);
2850 for (
auto &[TID, Info] : VMem) {
2851 if (
auto It =
Other.VMem.find(TID); It !=
Other.VMem.end()) {
2852 unsigned char NewVmemTypes =
Info.VMEMTypes | It->second.VMEMTypes;
2853 StrictDom |= NewVmemTypes !=
Info.VMEMTypes;
2854 Info.VMEMTypes = NewVmemTypes;
2858 StrictDom |= mergeAsyncMarks(MergeInfos,
Other.AsyncMarks);
2860 StrictDom |= mergeScore(MergeInfos[
T], AsyncScore[
T],
Other.AsyncScore[
T]);
2862 purgeEmptyTrackingData();
2868 return Opcode == AMDGPU::S_WAITCNT ||
2871 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2872 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2873 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2874 Opcode == AMDGPU::WAIT_ASYNCMARK ||
2878void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &
MBB,
2880 bool ExpertMode)
const {
2884 .
addImm(ExpertMode ? 2 : 0)
2902class VCCZWorkaround {
2903 const WaitcntBrackets &ScoreBrackets;
2904 const GCNSubtarget &
ST;
2905 const SIInstrInfo &
TII;
2906 const SIRegisterInfo &
TRI;
2907 bool VCCZCorruptionBug =
false;
2908 bool VCCZNotUpdatedByPartialWrites =
false;
2911 bool MustRecomputeVCCZ =
true;
2914 VCCZWorkaround(
const WaitcntBrackets &ScoreBrackets,
const GCNSubtarget &ST,
2915 const SIInstrInfo &
TII,
const SIRegisterInfo &
TRI)
2917 VCCZCorruptionBug =
ST.hasReadVCCZBug();
2918 VCCZNotUpdatedByPartialWrites = !
ST.partialVCCWritesUpdateVCCZ();
2925 bool tryRecomputeVCCZ(MachineInstr &
MI) {
2927 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
2937 MustRecomputeVCCZ |= VCCZCorruptionBug &&
TII.isSMRD(
MI);
2943 std::optional<bool> PartiallyWritesToVCCOpt;
2944 auto PartiallyWritesToVCC = [](MachineInstr &
MI) {
2945 return MI.definesRegister(AMDGPU::VCC_LO,
nullptr) ||
2946 MI.definesRegister(AMDGPU::VCC_HI,
nullptr);
2948 if (VCCZNotUpdatedByPartialWrites) {
2949 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
2952 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
2958 if (!ScoreBrackets.hasPendingEvent(HWEvent::SMEM_ACCESS) ||
2959 !VCCZCorruptionBug) {
2961 if (!PartiallyWritesToVCCOpt)
2962 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
2963 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
2964 MI.definesRegister(AMDGPU::VCC,
nullptr);
2967 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
2968 *PartiallyWritesToVCCOpt);
2970 MustRecomputeVCCZ =
false;
2980 TII.get(
ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2983 MustRecomputeVCCZ =
false;
2993bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2994 MachineBasicBlock &
Block,
2995 WaitcntBrackets &ScoreBrackets) {
2999 dbgs() <<
"*** Begin Block: ";
3001 ScoreBrackets.dump();
3003 VCCZWorkaround VCCZW(ScoreBrackets, ST,
TII,
TRI);
3006 MachineInstr *OldWaitcntInstr =
nullptr;
3011 Iter !=
E; ++Iter) {
3012 MachineInstr &Inst = *Iter;
3013 if (isNonWaitcntMetaInst(Inst))
3018 (IsExpertMode && Inst.
getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3019 if (!OldWaitcntInstr)
3020 OldWaitcntInstr = &Inst;
3024 PreheaderFlushFlags FlushFlags;
3025 if (
Block.getFirstTerminator() == Inst)
3026 FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3029 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3031 OldWaitcntInstr =
nullptr;
3033 if (Inst.
getOpcode() == AMDGPU::ASYNCMARK) {
3037 ScoreBrackets.recordAsyncMark(Inst);
3041 if (
TII.isSMRD(Inst)) {
3042 for (
const MachineMemOperand *Memop : Inst.
memoperands()) {
3045 if (!Memop->isInvariant()) {
3046 const Value *Ptr = Memop->getValue();
3052 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3056 Modified |= insertForcedWaitAfter(Inst,
Block, ScoreBrackets);
3060 ScoreBrackets.dump();
3065 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3070 AMDGPU::Waitcnt
Wait;
3071 if (
Block.getFirstTerminator() ==
Block.end()) {
3072 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3073 if (FlushFlags.FlushVmCnt) {
3081 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
AMDGPU::DS_CNT))
3090 dbgs() <<
"*** End Block: ";
3092 ScoreBrackets.dump();
3098bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &
Block) {
3099 if (
Block.size() <= 1)
3107 MachineInstr *LastAtomicWithSoftXcnt =
nullptr;
3113 if (!IsLDS && (
MI.mayLoad() ^
MI.mayStore()))
3114 LastAtomicWithSoftXcnt =
nullptr;
3117 MI.mayLoad() &&
MI.mayStore();
3118 MachineInstr &PrevMI = *
MI.getPrevNode();
3120 if (PrevMI.
getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3123 if (LastAtomicWithSoftXcnt) {
3127 LastAtomicWithSoftXcnt = &
MI;
3135SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &
MBB,
3136 const WaitcntBrackets &ScoreBrackets) {
3137 auto [Iterator, IsInserted] =
3140 return Iterator->second;
3144 return PreheaderFlushFlags();
3148 return PreheaderFlushFlags();
3151 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3152 return Iterator->second;
3155 return PreheaderFlushFlags();
3158bool SIInsertWaitcnts::isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const {
3160 return TII.mayAccessVMEMThroughFlat(
MI);
3164bool SIInsertWaitcnts::isDSRead(
const MachineInstr &
MI)
const {
3170bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const {
3199SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *
ML,
3200 const WaitcntBrackets &Brackets) {
3201 PreheaderFlushFlags
Flags;
3202 bool HasVMemLoad =
false;
3203 bool HasVMemStore =
false;
3204 bool UsesVgprVMEMLoadedOutside =
false;
3205 bool UsesVgprDSReadOutside =
false;
3206 bool VMemInvalidated =
false;
3210 bool TrackSimpleDSOpt =
ST.hasExtendedWaitCounts();
3211 DenseSet<MCRegUnit> VgprUse;
3212 DenseSet<MCRegUnit> VgprDefVMEM;
3213 DenseSet<MCRegUnit> VgprDefDS;
3219 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3220 unsigned DSReadPosition = 0;
3221 bool IsSingleBlock =
ML->getNumBlocks() == 1;
3222 bool TrackDSFlushPoint =
ST.hasExtendedWaitCounts() && IsSingleBlock;
3223 unsigned LastDSFlushPosition = 0;
3225 for (MachineBasicBlock *
MBB :
ML->blocks()) {
3226 for (MachineInstr &
MI : *
MBB) {
3227 if (isVMEMOrFlatVMEM(
MI)) {
3228 HasVMemLoad |=
MI.mayLoad();
3229 HasVMemStore |=
MI.mayStore();
3233 if (mayStoreIncrementingDSCNT(
MI)) {
3236 if (VMemInvalidated)
3238 TrackSimpleDSOpt =
false;
3239 TrackDSFlushPoint =
false;
3241 bool IsDSRead = isDSRead(
MI);
3246 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3247 if (!TrackDSFlushPoint)
3249 if (
auto It = LastDSReadPositionMap.
find(RU);
3250 It != LastDSReadPositionMap.
end()) {
3254 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
3258 for (
const MachineOperand &
Op :
MI.all_uses()) {
3259 if (
Op.isDebug() || !
TRI.isVectorRegister(MRI,
Op.getReg()))
3262 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3266 VMemInvalidated =
true;
3270 TrackSimpleDSOpt =
false;
3273 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3277 updateDSReadFlushTracking(RU);
3282 VMEMID
ID = toVMEMID(RU);
3286 UsesVgprVMEMLoadedOutside =
true;
3291 UsesVgprDSReadOutside =
true;
3296 if (isVMEMOrFlatVMEM(
MI) &&
MI.mayLoad()) {
3297 for (
const MachineOperand &
Op :
MI.all_defs()) {
3298 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3302 VMemInvalidated =
true;
3307 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3318 if (IsDSRead || TrackDSFlushPoint) {
3319 for (
const MachineOperand &
Op :
MI.all_defs()) {
3320 if (!
TRI.isVectorRegister(MRI,
Op.getReg()))
3322 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3325 updateDSReadFlushTracking(RU);
3328 if (TrackDSFlushPoint)
3329 LastDSReadPositionMap[RU] = DSReadPosition;
3338 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3339 ((!
ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3340 (HasVMemLoad &&
ST.hasVmemWriteVgprInOrder())))
3341 Flags.FlushVmCnt =
true;
3347 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3350 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3351 bool DSFlushPointPrefetch =
3352 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3354 if (SimpleDSOpt || DSFlushPointPrefetch)
3355 Flags.FlushDsCnt =
true;
3360bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3361 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3363 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3365 if (
auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3366 AA = &AAR->getAAResults();
3368 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3380 if (!SIInsertWaitcnts(MLI, PDT,
AA, MF).
run())
3385 .preserve<AAManager>();
3388bool SIInsertWaitcnts::run() {
3396 if (ST.hasExtendedWaitCounts()) {
3397 IsExpertMode = ST.hasExpertSchedulingMode() &&
3406 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
3411 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
3415 SmemAccessCounter = getCounterFromEvent(HWEvent::SMEM_ACCESS);
3419 MachineBasicBlock &EntryBB = MF.
front();
3430 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3433 if (
ST.hasExtendedWaitCounts()) {
3442 if (!
ST.hasImageInsts() &&
3448 TII.get(instrsForExtendedCounterTypes[CT]))
3461 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
this);
3462 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3463 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3470 for (
auto *
MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3473 std::unique_ptr<WaitcntBrackets> Brackets;
3478 for (
auto BII = BlockInfos.
begin(), BIE = BlockInfos.
end(); BII != BIE;
3480 MachineBasicBlock *
MBB = BII->first;
3481 BlockInfo &BI = BII->second;
3487 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3489 *Brackets = *BI.Incoming;
3492 Brackets = std::make_unique<WaitcntBrackets>(
this);
3497 Brackets->~WaitcntBrackets();
3498 new (Brackets.get()) WaitcntBrackets(
this);
3502 if (
ST.hasWaitXcnt())
3504 Modified |= insertWaitcntInBlock(MF, *
MBB, *Brackets);
3507 if (Brackets->hasPendingEvent()) {
3508 BlockInfo *MoveBracketsToSucc =
nullptr;
3510 auto *SuccBII = BlockInfos.
find(Succ);
3511 BlockInfo &SuccBI = SuccBII->second;
3512 if (!SuccBI.Incoming) {
3513 SuccBI.Dirty =
true;
3514 if (SuccBII <= BII) {
3518 if (!MoveBracketsToSucc) {
3519 MoveBracketsToSucc = &SuccBI;
3521 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3525 dbgs() <<
"Try to merge ";
3531 if (SuccBI.Incoming->merge(*Brackets)) {
3532 SuccBI.Dirty =
true;
3533 if (SuccBII <= BII) {
3540 if (MoveBracketsToSucc)
3541 MoveBracketsToSucc->Incoming = std::move(Brackets);
3546 if (
ST.hasScalarStores()) {
3547 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3548 bool HaveScalarStores =
false;
3550 for (MachineBasicBlock &
MBB : MF) {
3551 for (MachineInstr &
MI :
MBB) {
3552 if (!HaveScalarStores &&
TII.isScalarStore(
MI))
3553 HaveScalarStores =
true;
3555 if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
3556 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3561 if (HaveScalarStores) {
3570 for (MachineBasicBlock *
MBB : EndPgmBlocks) {
3571 bool SeenDCacheWB =
false;
3575 if (
I->getOpcode() == AMDGPU::S_DCACHE_WB)
3576 SeenDCacheWB =
true;
3577 else if (
TII.isScalarStore(*
I))
3578 SeenDCacheWB =
false;
3581 if ((
I->getOpcode() == AMDGPU::S_ENDPGM ||
3582 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3598 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3600 setSchedulingMode(EntryBB,
I,
true);
3602 for (MachineInstr *
MI : CallInsts) {
3603 MachineBasicBlock &
MBB = *
MI->getParent();
3604 setSchedulingMode(
MBB,
MI,
false);
3605 setSchedulingMode(
MBB, std::next(
MI->getIterator()),
true);
3608 for (MachineInstr *
MI : ReturnInsts)
3609 setSchedulingMode(*
MI->getParent(),
MI,
false);
3620 for (
auto [
MI,
_] : EndPgmInsts) {
3622 TII.get(AMDGPU::S_ALLOC_VGPR))
3626 }
else if (!WCG->isOptNone() &&
3627 ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
3628 (MF.getFrameInfo().hasCalls() ||
3629 ST.getOccupancyWithNumVGPRs(
3630 TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
3633 for (
auto [
MI, Flag] : EndPgmInsts) {
3635 if (
ST.requiresNopBeforeDeallocVGPRs()) {
3637 TII.get(AMDGPU::S_NOP))
3641 TII.get(AMDGPU::S_SENDMSG))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
AMDGPU::HWEventSet HWEventSet
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Provides some synthesis utilities to produce sequences of values.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
A manager for alias analyses.
bool isEntryFunction() const
void remove(const HWEvent &Event)
bool contains(const HWEvent &Event) const
void insert(const HWEvent &Event)
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
size_t size() const
Get the array size.
bool empty() const
Check if the array is empty.
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of passes that operate on the MachineFunction representation.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool usesTENSOR_CNT(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
bool isDynamicVGPREnabled() const
void push_back(const T &Elt)
Target - Wrapper for Target specific information.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
@ ID_DEALLOC_VGPRS_GFX11Plus
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getHasMatrixScale(unsigned Opc)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST, bool IsExpertMode)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
static constexpr StringLiteral toString(HWEvent Event)
std::optional< AMDGPU::InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is waiting on.
HWEvent
TODO: This should be a bitmask from the start instead of having this enum.
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
iota_range< HWEvent > hw_events(HWEvent MaxEvent=HWEvent::NUM_WAIT_EVENTS)
Return an iterator over all events between FIRST_WAIT_EVENT and MaxEvent (exclusive,...
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
FunctionAddr VTableAddr Value
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
FunctionAddr VTableAddr Count
CodeGenOptLevel
Code generation optimization level.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
@ Increment
Incrementally increasing token ID.
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.