47#define DEBUG_TYPE "si-insert-waitcnts"
50 "Force emit s_waitcnt expcnt(0) instrs");
52 "Force emit s_waitcnt lgkmcnt(0) instrs");
54 "Force emit s_waitcnt vmcnt(0) instrs");
58 cl::desc(
"Force all waitcnt instrs to be emitted as "
59 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
63 "amdgpu-waitcnt-load-forcezero",
64 cl::desc(
"Force all waitcnt load counters to wait until 0"),
68 "amdgpu-expert-scheduling-mode",
69 cl::desc(
"Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
117 TRACKINGID_RANGE_LEN = (1 << 16),
122 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
127 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
128 LDSDMA_BEGIN = REGUNITS_END,
129 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
133static constexpr VMEMID toVMEMID(MCRegUnit RU) {
134 return static_cast<unsigned>(RU);
137#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
139 DECL(VMEM_SAMPLER_READ_ACCESS) \
140 DECL(VMEM_BVH_READ_ACCESS) \
141 DECL(GLOBAL_INV_ACCESS) \
142 DECL(VMEM_WRITE_ACCESS) \
143 DECL(SCRATCH_WRITE_ACCESS) \
153 DECL(EXP_POS_ACCESS) \
154 DECL(EXP_PARAM_ACCESS) \
156 DECL(EXP_LDS_ACCESS) \
157 DECL(VGPR_CSMACC_WRITE) \
158 DECL(VGPR_DPMACC_WRITE) \
159 DECL(VGPR_TRANS_WRITE) \
160 DECL(VGPR_XDL_WRITE) \
161 DECL(VGPR_LDS_READ) \
162 DECL(VGPR_FLAT_READ) \
166#define AMDGPU_EVENT_ENUM(Name) Name,
171#undef AMDGPU_EVENT_ENUM
185auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
186 return enum_seq(VMEM_ACCESS, MaxEvent);
189#define AMDGPU_EVENT_NAME(Name) #Name,
193#undef AMDGPU_EVENT_NAME
194static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
195 return WaitEventTypeName[
Event];
218 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
219 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
220 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
234 assert(updateVMCntOnly(Inst));
236 return VMEM_NOSAMPLER;
250 return VMEM_NOSAMPLER;
264 WaitEventSet() =
default;
265 explicit constexpr WaitEventSet(WaitEventType Event) {
266 static_assert(NUM_WAIT_EVENTS <=
sizeof(Mask) * 8,
267 "Not enough bits in Mask for all the events");
270 constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
271 for (
auto &
E : Events) {
275 void insert(
const WaitEventType &Event) { Mask |= 1 <<
Event; }
276 void remove(
const WaitEventType &Event) { Mask &= ~(1 <<
Event); }
277 void remove(
const WaitEventSet &
Other) { Mask &= ~Other.Mask; }
278 bool contains(
const WaitEventType &Event)
const {
279 return Mask & (1 <<
Event);
283 return (~Mask &
Other.Mask) == 0;
308 return Mask ==
Other.Mask;
311 bool empty()
const {
return Mask == 0; }
313 bool twoOrMore()
const {
return Mask & (Mask - 1); }
314 operator bool()
const {
return !
empty(); }
315 void print(raw_ostream &OS)
const {
316 ListSeparator
LS(
", ");
317 for (WaitEventType Event : wait_events()) {
319 OS <<
LS << getWaitEventTypeName(Event);
325void WaitEventSet::dump()
const {
330class WaitcntBrackets;
338class WaitcntGenerator {
340 const GCNSubtarget &ST;
341 const SIInstrInfo &
TII;
342 AMDGPU::IsaVersion
IV;
345 bool ExpandWaitcntProfiling =
false;
346 const AMDGPU::HardwareLimits *Limits =
nullptr;
349 WaitcntGenerator() =
delete;
350 WaitcntGenerator(
const WaitcntGenerator &) =
delete;
351 WaitcntGenerator(
const MachineFunction &MF,
InstCounterType MaxCounter,
352 const AMDGPU::HardwareLimits *Limits)
353 :
ST(MF.getSubtarget<GCNSubtarget>()),
TII(*
ST.getInstrInfo()),
357 ExpandWaitcntProfiling(
358 MF.
getFunction().hasFnAttribute(
"amdgpu-expand-waitcnt-profiling")),
363 bool isOptNone()
const {
return OptNone; }
365 const AMDGPU::HardwareLimits &getLimits()
const {
return *Limits; }
379 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
380 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
384 bool promoteSoftWaitCnt(MachineInstr *Waitcnt)
const;
389 virtual bool createNewWaitcnt(MachineBasicBlock &
Block,
391 AMDGPU::Waitcnt
Wait,
392 const WaitcntBrackets &ScoreBrackets) = 0;
408 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const = 0;
410 virtual ~WaitcntGenerator() =
default;
413class WaitcntGeneratorPreGFX12 final :
public WaitcntGenerator {
414 static constexpr const WaitEventSet
417 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
418 WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
419 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
420 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
421 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
430 using WaitcntGenerator::WaitcntGenerator;
432 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
433 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
436 bool createNewWaitcnt(MachineBasicBlock &
Block,
438 AMDGPU::Waitcnt
Wait,
439 const WaitcntBrackets &ScoreBrackets)
override;
442 return WaitEventMaskForInstPreGFX12[
T];
445 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
448class WaitcntGeneratorGFX12Plus final :
public WaitcntGenerator {
451 static constexpr const WaitEventSet
453 WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
454 WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
455 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
456 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
457 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
458 WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
459 WaitEventSet({VMEM_BVH_READ_ACCESS}),
460 WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
461 WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
462 WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
464 WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
467 WaitcntGeneratorGFX12Plus() =
delete;
468 WaitcntGeneratorGFX12Plus(
const MachineFunction &MF,
470 const AMDGPU::HardwareLimits *Limits,
472 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
475 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
476 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
479 bool createNewWaitcnt(MachineBasicBlock &
Block,
481 AMDGPU::Waitcnt
Wait,
482 const WaitcntBrackets &ScoreBrackets)
override;
485 return WaitEventMaskForInstGFX12Plus[
T];
488 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
492struct PreheaderFlushFlags {
493 bool FlushVmCnt =
false;
494 bool FlushDsCnt =
false;
497class SIInsertWaitcnts {
499 const GCNSubtarget *
ST;
500 const SIInstrInfo *
TII =
nullptr;
501 const SIRegisterInfo *
TRI =
nullptr;
502 const MachineRegisterInfo *MRI =
nullptr;
505 bool IsExpertMode =
false;
508 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
509 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
510 MachineLoopInfo &MLI;
511 MachinePostDominatorTree &PDT;
516 std::unique_ptr<WaitcntBrackets> Incoming;
520 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
524 std::unique_ptr<WaitcntGenerator> WCG;
527 DenseSet<MachineInstr *> CallInsts;
528 DenseSet<MachineInstr *> ReturnInsts;
533 DenseMap<MachineInstr *, bool> EndPgmInsts;
535 AMDGPU::HardwareLimits Limits;
538 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
540 : MLI(MLI), PDT(PDT), AA(AA), MF(MF) {
541 (void)ForceExpCounter;
542 (void)ForceLgkmCounter;
543 (void)ForceVMCounter;
546 const AMDGPU::HardwareLimits &getLimits()
const {
return Limits; }
548 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *
ML,
549 const WaitcntBrackets &Brackets);
550 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &
MBB,
551 const WaitcntBrackets &ScoreBrackets);
552 bool isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const;
553 bool isDSRead(
const MachineInstr &
MI)
const;
554 bool mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const;
557 void setForceEmitWaitcnt() {
563 ForceEmitWaitcnt[
EXP_CNT] =
true;
565 ForceEmitWaitcnt[
EXP_CNT] =
false;
570 ForceEmitWaitcnt[
DS_CNT] =
true;
571 ForceEmitWaitcnt[
KM_CNT] =
true;
573 ForceEmitWaitcnt[
DS_CNT] =
false;
574 ForceEmitWaitcnt[
KM_CNT] =
false;
581 ForceEmitWaitcnt[
BVH_CNT] =
true;
585 ForceEmitWaitcnt[
BVH_CNT] =
false;
588 ForceEmitWaitcnt[
VA_VDST] =
false;
589 ForceEmitWaitcnt[
VM_VSRC] =
false;
595 WaitEventType getVmemWaitEventType(
const MachineInstr &Inst)
const {
598 case AMDGPU::GLOBAL_INV:
599 return GLOBAL_INV_ACCESS;
601 case AMDGPU::GLOBAL_WB:
602 case AMDGPU::GLOBAL_WBINV:
603 return VMEM_WRITE_ACCESS;
609 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
610 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
619 if (
TII->mayAccessScratch(Inst))
620 return SCRATCH_WRITE_ACCESS;
621 return VMEM_WRITE_ACCESS;
625 return VmemReadMapping[getVmemType(Inst)];
628 std::optional<WaitEventType>
629 getExpertSchedulingEventType(
const MachineInstr &Inst)
const;
631 bool isAsync(
const MachineInstr &
MI)
const {
636 const MachineOperand *
Async =
637 TII->getNamedOperand(
MI, AMDGPU::OpName::IsAsync);
641 bool isNonAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
645 bool isAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
649 bool isVmemAccess(
const MachineInstr &
MI)
const;
650 bool generateWaitcntInstBefore(MachineInstr &
MI,
651 WaitcntBrackets &ScoreBrackets,
652 MachineInstr *OldWaitcntInstr,
653 PreheaderFlushFlags FlushFlags);
654 bool generateWaitcnt(AMDGPU::Waitcnt
Wait,
656 MachineBasicBlock &
Block, WaitcntBrackets &ScoreBrackets,
657 MachineInstr *OldWaitcntInstr);
659 WaitEventSet getEventsFor(
const MachineInstr &Inst)
const;
660 void updateEventWaitcntAfter(MachineInstr &Inst,
661 WaitcntBrackets *ScoreBrackets);
663 MachineBasicBlock *
Block)
const;
664 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &
Block,
665 WaitcntBrackets &ScoreBrackets);
666 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &
Block,
667 WaitcntBrackets &ScoreBrackets);
670 bool removeRedundantSoftXcnts(MachineBasicBlock &
Block);
672 bool ExpertMode)
const;
674 return WCG->getWaitEvents(
T);
677 return WCG->getCounterFromEvent(
E);
689class WaitcntBrackets {
697 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
698 for (
auto &[
ID, Val] : VMem) {
702 for (
auto &[
ID, Val] : SGPRs) {
707 if (NumUnusedVmem || NumUnusedSGPRs) {
708 errs() <<
"WaitcntBracket had unused entries at destruction time: "
709 << NumUnusedVmem <<
" VMem and " << NumUnusedSGPRs
710 <<
" SGPR unused entries\n";
721 assert(isSmemCounter(
T) &&
"Invalid SMEM counter");
722 return T ==
X_CNT ? 1 : 0;
726 return ScoreUBs[
T] - ScoreLBs[
T];
730 return getVMemScore(
ID,
T) > getScoreLB(
T);
748 return getScoreUB(
T) - getScoreLB(
T);
752 auto It = SGPRs.find(RU);
753 return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(
T)] : 0;
757 auto It = VMem.find(TID);
758 return It != VMem.end() ? It->second.Scores[
T] : 0;
765 void simplifyWaitcnt(AMDGPU::Waitcnt &
Wait)
const {
768 void simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
769 AMDGPU::Waitcnt &UpdateWait)
const;
772 void simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
773 AMDGPU::Waitcnt &UpdateWait)
const;
774 void simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
775 AMDGPU::Waitcnt &UpdateWait)
const;
778 AMDGPU::Waitcnt &
Wait)
const;
780 AMDGPU::Waitcnt &
Wait)
const;
781 AMDGPU::Waitcnt determineAsyncWait(
unsigned N);
782 void tryClearSCCWriteEvent(MachineInstr *Inst);
784 void applyWaitcnt(
const AMDGPU::Waitcnt &
Wait);
787 void updateByEvent(WaitEventType
E, MachineInstr &
MI);
788 void recordAsyncMark(MachineInstr &
MI);
790 bool hasPendingEvent()
const {
return !PendingEvents.empty(); }
791 bool hasPendingEvent(WaitEventType
E)
const {
792 return PendingEvents.contains(
E);
795 bool HasPending = PendingEvents &
Context->getWaitEvents(
T);
797 "Expected pending events iff scoreboard is not empty");
802 WaitEventSet Events = PendingEvents &
Context->getWaitEvents(
T);
804 return Events.twoOrMore();
807 bool hasPendingFlat()
const {
814 void setPendingFlat() {
819 bool hasPendingGDS()
const {
820 return LastGDS > ScoreLBs[
DS_CNT] && LastGDS <= ScoreUBs[
DS_CNT];
823 unsigned getPendingGDSWait()
const {
824 return std::min(getScoreUB(
DS_CNT) - LastGDS,
828 void setPendingGDS() { LastGDS = ScoreUBs[
DS_CNT]; }
832 bool hasOtherPendingVmemTypes(
MCPhysReg Reg, VmemType V)
const {
833 for (MCRegUnit RU : regunits(
Reg)) {
834 auto It = VMem.find(toVMEMID(RU));
835 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
842 for (MCRegUnit RU : regunits(
Reg)) {
843 if (
auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
844 It->second.VMEMTypes = 0;
845 if (It->second.empty())
851 void setStateOnFunctionEntryOrReturn() {
857 ArrayRef<const MachineInstr *> getLDSDMAStores()
const {
861 bool hasPointSampleAccel(
const MachineInstr &
MI)
const;
862 bool hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
865 void print(raw_ostream &)
const;
870 void purgeEmptyTrackingData();
880 using CounterValueArray = std::array<unsigned, NUM_INST_CNTS>;
883 AMDGPU::Waitcnt &
Wait)
const;
885 static bool mergeScore(
const MergeInfo &M,
unsigned &Score,
886 unsigned OtherScore);
891 assert(
Reg != AMDGPU::SCC &&
"Shouldn't be used on SCC");
894 const TargetRegisterClass *RC =
Context->TRI->getPhysRegBaseClass(
Reg);
895 unsigned Size =
Context->TRI->getRegSizeInBits(*RC);
896 if (
Size == 16 &&
Context->ST->hasD16Writes32BitVgpr())
920 if (
Reg == AMDGPU::SCC) {
923 for (MCRegUnit RU : regunits(
Reg))
924 VMem[toVMEMID(RU)].Scores[
T] = Val;
926 auto STy = getSgprScoresIdx(
T);
927 for (MCRegUnit RU : regunits(
Reg))
928 SGPRs[RU].Scores[STy] = Val;
935 VMem[TID].Scores[
T] = Val;
941 const SIInsertWaitcnts *
Context;
945 WaitEventSet PendingEvents;
949 unsigned LastGDS = 0;
966 CounterValueArray Scores{};
968 unsigned VMEMTypes = 0;
978 std::array<unsigned, 2> Scores = {0};
980 bool empty()
const {
return !Scores[0] && !Scores[1]; }
983 DenseMap<VMEMID, VMEMInfo> VMem;
984 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
987 unsigned SCCScore = 0;
989 const MachineInstr *PendingSCCWrite =
nullptr;
993 SmallVector<const MachineInstr *> LDSDMAStores;
1002 static constexpr unsigned MaxAsyncMarks = 16;
1006 CounterValueArray AsyncScore{};
1009class SIInsertWaitcntsLegacy :
public MachineFunctionPass {
1012 SIInsertWaitcntsLegacy() : MachineFunctionPass(
ID) {}
1014 bool runOnMachineFunction(MachineFunction &MF)
override;
1016 StringRef getPassName()
const override {
1017 return "SI insert wait instructions";
1020 void getAnalysisUsage(AnalysisUsage &AU)
const override {
1023 AU.
addRequired<MachinePostDominatorTreeWrapperPass>();
1032void WaitcntBrackets::setScoreByOperand(
const MachineOperand &
Op,
1034 setRegScore(
Op.getReg().asMCReg(), CntTy, Score);
1042bool WaitcntBrackets::hasPointSampleAccel(
const MachineInstr &
MI)
const {
1047 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1057bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
1059 if (!hasPointSampleAccel(
MI))
1062 return hasOtherPendingVmemTypes(
Reg, VMEM_NOSAMPLER);
1065void WaitcntBrackets::updateByEvent(WaitEventType
E, MachineInstr &Inst) {
1069 unsigned UB = getScoreUB(
T);
1070 unsigned CurrScore = UB + 1;
1076 PendingEvents.insert(
E);
1077 setScoreUB(
T, CurrScore);
1080 const MachineRegisterInfo *MRI =
Context->MRI;
1089 if (
const auto *AddrOp =
TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
1090 setScoreByOperand(*AddrOp,
EXP_CNT, CurrScore);
1093 if (
const auto *Data0 =
1094 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
1095 setScoreByOperand(*Data0,
EXP_CNT, CurrScore);
1096 if (
const auto *Data1 =
1097 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
1098 setScoreByOperand(*Data1,
EXP_CNT, CurrScore);
1100 Inst.
getOpcode() != AMDGPU::DS_APPEND &&
1101 Inst.
getOpcode() != AMDGPU::DS_CONSUME &&
1102 Inst.
getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1103 for (
const MachineOperand &
Op : Inst.
all_uses()) {
1104 if (
TRI->isVectorRegister(*MRI,
Op.getReg()))
1105 setScoreByOperand(
Op,
EXP_CNT, CurrScore);
1108 }
else if (
TII->isFLAT(Inst)) {
1110 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1113 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1116 }
else if (
TII->isMIMG(Inst)) {
1120 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1123 }
else if (
TII->isMTBUF(Inst)) {
1126 }
else if (
TII->isMUBUF(Inst)) {
1130 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1133 }
else if (
TII->isLDSDIR(Inst)) {
1135 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
1138 if (
TII->isEXP(Inst)) {
1143 for (MachineOperand &DefMO : Inst.
all_defs()) {
1144 if (
TRI->isVGPR(*MRI, DefMO.getReg())) {
1145 setScoreByOperand(DefMO,
EXP_CNT, CurrScore);
1149 for (
const MachineOperand &
Op : Inst.
all_uses()) {
1150 if (
TRI->isVectorRegister(*MRI,
Op.getReg()))
1151 setScoreByOperand(
Op,
EXP_CNT, CurrScore);
1155 WaitEventType OtherEvent =
E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1156 if (PendingEvents.contains(OtherEvent)) {
1161 setScoreLB(
T, getScoreUB(
T) - 1);
1162 PendingEvents.remove(OtherEvent);
1164 for (
const MachineOperand &
Op : Inst.
all_uses())
1165 setScoreByOperand(
Op,
T, CurrScore);
1169 for (
const MachineOperand &
Op : Inst.
operands()) {
1174 setScoreByOperand(
Op,
T, CurrScore);
1186 for (
const MachineOperand &
Op : Inst.
defs()) {
1188 if (!
TRI->isVectorRegister(*MRI,
Op.getReg()))
1190 if (updateVMCntOnly(Inst)) {
1195 VmemType
V = getVmemType(Inst);
1196 unsigned char TypesMask = 1 <<
V;
1199 if (hasPointSampleAccel(Inst))
1200 TypesMask |= 1 << VMEM_NOSAMPLER;
1201 for (MCRegUnit RU : regunits(
Op.getReg().asMCReg()))
1202 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1205 setScoreByOperand(
Op,
T, CurrScore);
1208 (
TII->isDS(Inst) ||
Context->isNonAsyncLdsDmaWrite(Inst))) {
1217 if (!MemOp->isStore() ||
1222 auto AAI = MemOp->getAAInfo();
1228 if (!AAI || !AAI.Scope)
1230 for (
unsigned I = 0,
E = LDSDMAStores.
size();
I !=
E && !Slot; ++
I) {
1231 for (
const auto *MemOp : LDSDMAStores[
I]->memoperands()) {
1232 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1247 setVMemScore(LDSDMA_BEGIN,
T, CurrScore);
1248 if (Slot && Slot < NUM_LDSDMA)
1249 setVMemScore(LDSDMA_BEGIN + Slot,
T, CurrScore);
1257 "unexpected GFX1250 instruction");
1258 AsyncScore[
T] = CurrScore;
1262 setRegScore(AMDGPU::SCC,
T, CurrScore);
1263 PendingSCCWrite = &Inst;
1268void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1274 AsyncMarks.push_back(AsyncScore);
1277 dbgs() <<
"recordAsyncMark:\n" << Inst;
1278 for (
const auto &Mark : AsyncMarks) {
1285void WaitcntBrackets::print(raw_ostream &OS)
const {
1289 unsigned SR = getScoreRange(
T);
1292 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"LOAD" :
"VM") <<
"_CNT("
1296 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"DS" :
"LGKM") <<
"_CNT("
1300 OS <<
" EXP_CNT(" << SR <<
"):";
1303 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"STORE" :
"VS") <<
"_CNT("
1307 OS <<
" SAMPLE_CNT(" << SR <<
"):";
1310 OS <<
" BVH_CNT(" << SR <<
"):";
1313 OS <<
" KM_CNT(" << SR <<
"):";
1316 OS <<
" X_CNT(" << SR <<
"):";
1319 OS <<
" VA_VDST(" << SR <<
"): ";
1322 OS <<
" VM_VSRC(" << SR <<
"): ";
1325 OS <<
" UNKNOWN(" << SR <<
"):";
1331 unsigned LB = getScoreLB(
T);
1334 sort(SortedVMEMIDs);
1336 for (
auto ID : SortedVMEMIDs) {
1337 unsigned RegScore = VMem.at(
ID).Scores[
T];
1340 unsigned RelScore = RegScore - LB - 1;
1341 if (
ID < REGUNITS_END) {
1342 OS <<
' ' << RelScore <<
":vRU" <<
ID;
1344 assert(
ID >= LDSDMA_BEGIN &&
ID < LDSDMA_END &&
1345 "Unhandled/unexpected ID value!");
1346 OS <<
' ' << RelScore <<
":LDSDMA" <<
ID;
1351 if (isSmemCounter(
T)) {
1353 sort(SortedSMEMIDs);
1354 for (
auto ID : SortedSMEMIDs) {
1355 unsigned RegScore = SGPRs.at(
ID).Scores[getSgprScoresIdx(
T)];
1358 unsigned RelScore = RegScore - LB - 1;
1359 OS <<
' ' << RelScore <<
":sRU" <<
static_cast<unsigned>(
ID);
1363 if (
T ==
KM_CNT && SCCScore > 0)
1364 OS <<
' ' << SCCScore <<
":scc";
1369 OS <<
"Pending Events: ";
1370 if (hasPendingEvent()) {
1372 for (
unsigned I = 0;
I != NUM_WAIT_EVENTS; ++
I) {
1373 if (hasPendingEvent((WaitEventType)
I)) {
1374 OS <<
LS << WaitEventTypeName[
I];
1382 OS <<
"Async score: ";
1383 if (AsyncScore.empty())
1389 OS <<
"Async marks: " << AsyncMarks.size() <<
'\n';
1391 for (
const auto &Mark : AsyncMarks) {
1393 unsigned MarkedScore = Mark[
T];
1396 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"LOAD" :
"VM")
1397 <<
"_CNT: " << MarkedScore;
1400 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"DS" :
"LGKM")
1401 <<
"_CNT: " << MarkedScore;
1404 OS <<
" EXP_CNT: " << MarkedScore;
1407 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"STORE" :
"VS")
1408 <<
"_CNT: " << MarkedScore;
1411 OS <<
" SAMPLE_CNT: " << MarkedScore;
1414 OS <<
" BVH_CNT: " << MarkedScore;
1417 OS <<
" KM_CNT: " << MarkedScore;
1420 OS <<
" X_CNT: " << MarkedScore;
1423 OS <<
" UNKNOWN: " << MarkedScore;
1434void WaitcntBrackets::simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
1435 AMDGPU::Waitcnt &UpdateWait)
const {
1436 simplifyWaitcnt(UpdateWait,
LOAD_CNT);
1437 simplifyWaitcnt(UpdateWait,
EXP_CNT);
1438 simplifyWaitcnt(UpdateWait,
DS_CNT);
1441 simplifyWaitcnt(UpdateWait,
BVH_CNT);
1442 simplifyWaitcnt(UpdateWait,
KM_CNT);
1443 simplifyXcnt(CheckWait, UpdateWait);
1444 simplifyWaitcnt(UpdateWait,
VA_VDST);
1445 simplifyVmVsrc(CheckWait, UpdateWait);
1449 unsigned &
Count)
const {
1453 if (
Count >= getScoreRange(
T))
1458 unsigned Cnt =
Wait.get(
T);
1459 simplifyWaitcnt(
T, Cnt);
1463void WaitcntBrackets::simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
1464 AMDGPU::Waitcnt &UpdateWait)
const {
1473 if (CheckWait.
get(
KM_CNT) == 0 && hasPendingEvent(SMEM_GROUP))
1478 if (CheckWait.
get(
LOAD_CNT) != ~0u && hasPendingEvent(VMEM_GROUP) &&
1482 simplifyWaitcnt(UpdateWait,
X_CNT);
1485void WaitcntBrackets::simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
1486 AMDGPU::Waitcnt &UpdateWait)
const {
1491 std::min({CheckWait.get(LOAD_CNT), CheckWait.get(STORE_CNT),
1492 CheckWait.get(SAMPLE_CNT), CheckWait.get(BVH_CNT),
1493 CheckWait.get(DS_CNT)}))
1495 simplifyWaitcnt(UpdateWait,
VM_VSRC);
1498void WaitcntBrackets::purgeEmptyTrackingData() {
1510 unsigned ScoreToWait,
1511 AMDGPU::Waitcnt &
Wait)
const {
1512 const unsigned LB = getScoreLB(
T);
1513 const unsigned UB = getScoreUB(
T);
1516 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1518 !
Context->ST->hasFlatLgkmVMemCountInOrder()) {
1522 addWait(
Wait,
T, 0);
1523 }
else if (counterOutOfOrder(
T)) {
1527 addWait(
Wait,
T, 0);
1531 unsigned NeededWait = std::min(
1532 UB - ScoreToWait, getWaitCountMax(
Context->getLimits(),
T) - 1);
1533 addWait(
Wait,
T, NeededWait);
1538AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(
unsigned N) {
1540 dbgs() <<
"Need " <<
N <<
" async marks. Found " << AsyncMarks.size()
1542 for (
const auto &Mark : AsyncMarks) {
1548 if (AsyncMarks.size() == MaxAsyncMarks) {
1553 LLVM_DEBUG(
dbgs() <<
"Possible truncation. Ensuring a non-trivial wait.\n");
1554 N = std::min(
N, (
unsigned)MaxAsyncMarks - 1);
1557 AMDGPU::Waitcnt
Wait;
1558 if (AsyncMarks.size() <=
N) {
1563 size_t MarkIndex = AsyncMarks.size() -
N - 1;
1564 const auto &RequiredMark = AsyncMarks[MarkIndex];
1566 determineWaitForScore(
T, RequiredMark[
T],
Wait);
1572 dbgs() <<
"Removing " << (MarkIndex + 1)
1573 <<
" async marks after determining wait\n";
1575 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1582 AMDGPU::Waitcnt &
Wait)
const {
1583 if (
Reg == AMDGPU::SCC) {
1584 determineWaitForScore(
T, SCCScore,
Wait);
1587 for (MCRegUnit RU : regunits(
Reg))
1588 determineWaitForScore(
1589 T, IsVGPR ? getVMemScore(toVMEMID(RU),
T) : getSGPRScore(RU,
T),
1595 AMDGPU::Waitcnt &
Wait)
const {
1596 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1597 determineWaitForScore(
T, getVMemScore(TID,
T),
Wait);
1600void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1603 if (PendingSCCWrite &&
1604 PendingSCCWrite->
getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1606 WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
1609 SCC_WRITE_PendingEvent) {
1613 PendingEvents.remove(SCC_WRITE_PendingEvent);
1614 PendingSCCWrite =
nullptr;
1618void WaitcntBrackets::applyWaitcnt(
const AMDGPU::Waitcnt &
Wait) {
1620 applyWaitcnt(
Wait,
T);
1624 const unsigned UB = getScoreUB(
T);
1628 if (counterOutOfOrder(
T))
1630 setScoreLB(
T, std::max(getScoreLB(
T), UB -
Count));
1633 PendingEvents.remove(
Context->getWaitEvents(
T));
1636 if (
T ==
KM_CNT &&
Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1637 if (!hasMixedPendingEvents(
X_CNT))
1638 applyWaitcnt(
X_CNT, 0);
1640 PendingEvents.remove(SMEM_GROUP);
1642 if (
T ==
LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1644 if (!hasMixedPendingEvents(
X_CNT))
1646 else if (
Count == 0)
1647 PendingEvents.remove(VMEM_GROUP);
1652 unsigned Cnt =
Wait.get(
T);
1653 applyWaitcnt(
T, Cnt);
1660 if ((
T ==
Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1661 (
T ==
X_CNT && hasPendingEvent(SMEM_GROUP)))
1668 unsigned Events = hasPendingEvent(
T);
1671 Events &= ~(1 << GLOBAL_INV_ACCESS);
1674 return Events & (Events - 1);
1677 return hasMixedPendingEvents(
T);
1687char SIInsertWaitcntsLegacy::
ID = 0;
1692 return new SIInsertWaitcntsLegacy();
1697 int OpIdx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
OpName);
1702 if (NewEnc == MO.
getImm())
1713 case AMDGPU::S_WAIT_LOADCNT:
1715 case AMDGPU::S_WAIT_EXPCNT:
1717 case AMDGPU::S_WAIT_STORECNT:
1719 case AMDGPU::S_WAIT_SAMPLECNT:
1721 case AMDGPU::S_WAIT_BVHCNT:
1723 case AMDGPU::S_WAIT_DSCNT:
1725 case AMDGPU::S_WAIT_KMCNT:
1727 case AMDGPU::S_WAIT_XCNT:
1734bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt)
const {
1748bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1749 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1751 assert(isNormalMode(MaxCounter));
1754 MachineInstr *WaitcntInstr =
nullptr;
1755 MachineInstr *WaitcntVsCntInstr =
nullptr;
1758 dbgs() <<
"PreGFX12::applyPreexistingWaitcnt at: ";
1760 dbgs() <<
"end of block\n";
1768 if (
II.isMetaInstruction()) {
1774 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1778 if (Opcode == AMDGPU::S_WAITCNT) {
1779 unsigned IEnc =
II.getOperand(0).getImm();
1782 ScoreBrackets.simplifyWaitcnt(OldWait);
1786 if (WaitcntInstr || (!
Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1787 II.eraseFromParent();
1791 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1794 <<
"Before: " <<
Wait <<
'\n';);
1795 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, LDSDMA_BEGIN,
Wait);
1804 II.eraseFromParent();
1805 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1806 unsigned N =
II.getOperand(0).getImm();
1808 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(
N);
1811 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1812 assert(
II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1815 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1817 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1820 if (WaitcntVsCntInstr || (!
Wait.hasWaitStoreCnt() && TrySimplify)) {
1821 II.eraseFromParent();
1824 WaitcntVsCntInstr = &
II;
1831 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1840 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
1841 <<
"New Instr at block end: "
1842 << *WaitcntInstr <<
'\n'
1843 :
dbgs() <<
"applied pre-existing waitcnt\n"
1844 <<
"Old Instr: " << *It
1845 <<
"New Instr: " << *WaitcntInstr <<
'\n');
1848 if (WaitcntVsCntInstr) {
1850 *WaitcntVsCntInstr, AMDGPU::OpName::simm16,
Wait.get(
STORE_CNT));
1851 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1857 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1858 <<
"New Instr at block end: " << *WaitcntVsCntInstr
1860 :
dbgs() <<
"applied pre-existing waitcnt\n"
1861 <<
"Old Instr: " << *It
1862 <<
"New Instr: " << *WaitcntVsCntInstr <<
'\n');
1870bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1872 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
1873 assert(isNormalMode(MaxCounter));
1881 auto EmitExpandedWaitcnt = [&](
unsigned Outstanding,
unsigned Target,
1884 EmitWaitcnt(--Outstanding);
1885 }
while (Outstanding > Target);
1891 if (
Wait.hasWaitExceptStoreCnt()) {
1893 if (ExpandWaitcntProfiling) {
1897 bool AnyOutOfOrder =
false;
1899 unsigned WaitCnt =
Wait.get(CT);
1900 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
1901 AnyOutOfOrder =
true;
1906 if (AnyOutOfOrder) {
1914 unsigned WaitCnt =
Wait.get(CT);
1918 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
1919 getWaitCountMax(getLimits(), CT) - 1);
1920 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](
unsigned Count) {
1931 [[maybe_unused]]
auto SWaitInst =
1936 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1937 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1941 if (
Wait.hasWaitStoreCnt()) {
1945 !ScoreBrackets.counterOutOfOrder(
STORE_CNT)) {
1947 unsigned Outstanding =
1948 std::min(ScoreBrackets.getOutstanding(
STORE_CNT),
1949 getWaitCountMax(getLimits(),
STORE_CNT) - 1);
1950 EmitExpandedWaitcnt(
1952 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1953 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1957 [[maybe_unused]]
auto SWaitInst =
1959 .
addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1964 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1965 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1973WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1974 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt &&
ST.hasVscnt() ? 0 : ~0u);
1978WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1979 unsigned ExpertVal = IsExpertMode ? 0 : ~0
u;
1980 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1981 ~0u , ExpertVal, ExpertVal);
1988bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1989 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1991 assert(!isNormalMode(MaxCounter));
1994 MachineInstr *CombinedLoadDsCntInstr =
nullptr;
1995 MachineInstr *CombinedStoreDsCntInstr =
nullptr;
1996 MachineInstr *WaitcntDepctrInstr =
nullptr;
2000 dbgs() <<
"GFX12Plus::applyPreexistingWaitcnt at: ";
2002 dbgs() <<
"end of block\n";
2008 AMDGPU::Waitcnt RequiredWait;
2013 if (
II.isMetaInstruction()) {
2022 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
2026 if (Opcode == AMDGPU::S_WAITCNT)
2029 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
2031 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2036 RequiredWait = RequiredWait.combined(OldWait);
2038 if (CombinedLoadDsCntInstr ==
nullptr) {
2039 CombinedLoadDsCntInstr = &
II;
2041 II.eraseFromParent();
2044 }
else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
2046 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2051 RequiredWait = RequiredWait.combined(OldWait);
2053 if (CombinedStoreDsCntInstr ==
nullptr) {
2054 CombinedStoreDsCntInstr = &
II;
2056 II.eraseFromParent();
2059 }
else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
2061 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2062 AMDGPU::Waitcnt OldWait;
2066 ScoreBrackets.simplifyWaitcnt(OldWait);
2068 if (WaitcntDepctrInstr ==
nullptr) {
2069 WaitcntDepctrInstr = &
II;
2078 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2086 II.eraseFromParent();
2090 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
2093 II.eraseFromParent();
2095 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
2101 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2103 addWait(
Wait, CT.value(), OldCnt);
2105 addWait(RequiredWait, CT.value(), OldCnt);
2107 if (WaitInstrs[CT.value()] ==
nullptr) {
2108 WaitInstrs[CT.value()] = &
II;
2110 II.eraseFromParent();
2116 ScoreBrackets.simplifyWaitcnt(
Wait.combined(RequiredWait),
Wait);
2117 Wait =
Wait.combined(RequiredWait);
2119 if (CombinedLoadDsCntInstr) {
2135 AMDGPU::OpName::simm16, NewEnc);
2136 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
2142 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2143 <<
"New Instr at block end: "
2144 << *CombinedLoadDsCntInstr <<
'\n'
2145 :
dbgs() <<
"applied pre-existing waitcnt\n"
2146 <<
"Old Instr: " << *It <<
"New Instr: "
2147 << *CombinedLoadDsCntInstr <<
'\n');
2154 if (CombinedStoreDsCntInstr) {
2159 AMDGPU::OpName::simm16, NewEnc);
2160 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2166 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2167 <<
"New Instr at block end: "
2168 << *CombinedStoreDsCntInstr <<
'\n'
2169 :
dbgs() <<
"applied pre-existing waitcnt\n"
2170 <<
"Old Instr: " << *It <<
"New Instr: "
2171 << *CombinedStoreDsCntInstr <<
'\n');
2201 for (MachineInstr **WI : WaitsToErase) {
2205 (*WI)->eraseFromParent();
2212 if (!WaitInstrs[CT])
2215 unsigned NewCnt =
Wait.get(CT);
2216 if (NewCnt != ~0u) {
2218 AMDGPU::OpName::simm16, NewCnt);
2219 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2221 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2222 setNoWait(
Wait, CT);
2225 ?
dbgs() <<
"applied pre-existing waitcnt\n"
2226 <<
"New Instr at block end: " << *WaitInstrs[CT]
2228 :
dbgs() <<
"applied pre-existing waitcnt\n"
2229 <<
"Old Instr: " << *It
2230 <<
"New Instr: " << *WaitInstrs[CT] <<
'\n');
2237 if (WaitcntDepctrInstr) {
2241 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2256 AMDGPU::OpName::simm16, Enc);
2258 <<
"New Instr at block end: "
2259 << *WaitcntDepctrInstr <<
'\n'
2260 :
dbgs() <<
"applyPreexistingWaitcnt\n"
2261 <<
"Old Instr: " << *It <<
"New Instr: "
2262 << *WaitcntDepctrInstr <<
'\n');
2273bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2275 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
2276 assert(!isNormalMode(MaxCounter));
2282 auto EmitExpandedWaitcnt = [&](
unsigned Outstanding,
unsigned Target,
2284 for (
unsigned I = Outstanding - 1;
I >
Target &&
I != ~0
u; --
I)
2286 EmitWaitcnt(Target);
2292 if (ExpandWaitcntProfiling) {
2299 if (ScoreBrackets.counterOutOfOrder(CT)) {
2306 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2307 getWaitCountMax(getLimits(), CT) - 1);
2308 EmitExpandedWaitcnt(Outstanding,
Count, [&](
unsigned Val) {
2319 MachineInstr *SWaitInst =
nullptr;
2343 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2344 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2356 [[maybe_unused]]
auto SWaitInst =
2363 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2364 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2367 if (
Wait.hasWaitDepctr()) {
2372 [[maybe_unused]]
auto SWaitInst =
2378 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2379 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2398bool SIInsertWaitcnts::generateWaitcntInstBefore(
2399 MachineInstr &
MI, WaitcntBrackets &ScoreBrackets,
2400 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2402 setForceEmitWaitcnt();
2406 AMDGPU::Waitcnt
Wait;
2407 const unsigned Opc =
MI.getOpcode();
2410 case AMDGPU::BUFFER_WBINVL1:
2411 case AMDGPU::BUFFER_WBINVL1_SC:
2412 case AMDGPU::BUFFER_WBINVL1_VOL:
2413 case AMDGPU::BUFFER_GL0_INV:
2414 case AMDGPU::BUFFER_GL1_INV: {
2422 case AMDGPU::SI_RETURN_TO_EPILOG:
2423 case AMDGPU::SI_RETURN:
2424 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2425 case AMDGPU::S_SETPC_B64_return: {
2430 AMDGPU::Waitcnt AllZeroWait =
2431 WCG->getAllZeroWaitcnt(
false);
2436 if (
ST->hasExtendedWaitCounts() &&
2437 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2442 case AMDGPU::S_ENDPGM:
2443 case AMDGPU::S_ENDPGM_SAVED: {
2452 EndPgmInsts[&
MI] = !ScoreBrackets.empty(
STORE_CNT) &&
2453 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
2456 case AMDGPU::S_SENDMSG:
2457 case AMDGPU::S_SENDMSGHALT: {
2458 if (
ST->hasLegacyGeometry() &&
2473 if (
MI.modifiesRegister(AMDGPU::EXEC,
TRI)) {
2476 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2477 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2478 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2479 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2486 if (
TII->isAlwaysGDS(
Opc) && ScoreBrackets.hasPendingGDS())
2487 addWait(
Wait,
DS_CNT, ScoreBrackets.getPendingGDSWait());
2494 Wait = AMDGPU::Waitcnt();
2496 const MachineOperand &CallAddrOp =
TII->getCalleeOperand(
MI);
2497 if (CallAddrOp.
isReg()) {
2498 ScoreBrackets.determineWaitForPhysReg(
2501 if (
const auto *RtnAddrOp =
2502 TII->getNamedOperand(
MI, AMDGPU::OpName::dst)) {
2503 ScoreBrackets.determineWaitForPhysReg(
2504 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(),
Wait);
2507 }
else if (
Opc == AMDGPU::S_BARRIER_WAIT) {
2508 ScoreBrackets.tryClearSCCWriteEvent(&
MI);
2524 for (
const MachineMemOperand *Memop :
MI.memoperands()) {
2525 const Value *Ptr = Memop->getValue();
2526 if (Memop->isStore()) {
2527 if (
auto It = SLoadAddresses.
find(Ptr); It != SLoadAddresses.
end()) {
2528 addWait(
Wait, SmemAccessCounter, 0);
2530 SLoadAddresses.
erase(It);
2533 unsigned AS = Memop->getAddrSpace();
2537 if (
TII->mayWriteLDSThroughDMA(
MI))
2541 unsigned TID = LDSDMA_BEGIN;
2542 if (Ptr && Memop->getAAInfo()) {
2543 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2544 for (
unsigned I = 0,
E = LDSDMAStores.size();
I !=
E; ++
I) {
2545 if (
MI.mayAlias(AA, *LDSDMAStores[
I],
true)) {
2546 if ((
I + 1) >= NUM_LDSDMA) {
2549 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, TID,
Wait);
2553 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, TID +
I + 1,
Wait);
2557 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, TID,
Wait);
2559 if (Memop->isStore()) {
2560 ScoreBrackets.determineWaitForLDSDMA(
EXP_CNT, TID,
Wait);
2565 for (
const MachineOperand &
Op :
MI.operands()) {
2570 if (
Op.isTied() &&
Op.isUse() &&
TII->doesNotReadTiedSource(
MI))
2575 const bool IsVGPR =
TRI->isVectorRegister(*MRI,
Op.getReg());
2582 if (
Op.isImplicit() &&
MI.mayLoadOrStore())
2594 if (
Op.isUse() || !updateVMCntOnly(
MI) ||
2595 ScoreBrackets.hasOtherPendingVmemTypes(
Reg, getVmemType(
MI)) ||
2596 ScoreBrackets.hasPointSamplePendingVmemTypes(
MI,
Reg) ||
2597 !
ST->hasVmemWriteVgprInOrder()) {
2601 ScoreBrackets.clearVgprVmemTypes(
Reg);
2604 if (
Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2608 }
else if (
Op.getReg() == AMDGPU::SCC) {
2611 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter,
Reg,
Wait);
2614 if (
ST->hasWaitXcnt() &&
Op.isDef())
2615 ScoreBrackets.determineWaitForPhysReg(
X_CNT,
Reg,
Wait);
2633 if (
Opc == AMDGPU::S_BARRIER && !
ST->hasAutoWaitcntBeforeBarrier() &&
2634 !
ST->hasBackOffBarrier()) {
2635 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
true));
2642 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2647 ScoreBrackets.simplifyWaitcnt(
Wait);
2653 if (
TII->isVALU(
MI))
2660 ScoreBrackets.applyWaitcnt(
Wait,
X_CNT);
2667 Wait = WCG->getAllZeroWaitcnt(
false);
2671 if (!ForceEmitWaitcnt[
T])
2676 if (FlushFlags.FlushVmCnt) {
2681 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
DS_CNT))
2687 return generateWaitcnt(
Wait,
MI.getIterator(), *
MI.getParent(), ScoreBrackets,
2691bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt
Wait,
2693 MachineBasicBlock &
Block,
2694 WaitcntBrackets &ScoreBrackets,
2695 MachineInstr *OldWaitcntInstr) {
2698 if (OldWaitcntInstr)
2702 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
Wait, It);
2707 MachineOperand *WaitExp =
2708 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2718 <<
"Update Instr: " << *It);
2721 if (WCG->createNewWaitcnt(
Block, It,
Wait, ScoreBrackets))
2726 ScoreBrackets.applyWaitcnt(
Wait);
2731std::optional<WaitEventType>
2732SIInsertWaitcnts::getExpertSchedulingEventType(
const MachineInstr &Inst)
const {
2733 if (
TII->isVALU(Inst)) {
2738 if (
TII->isXDL(Inst))
2739 return VGPR_XDL_WRITE;
2741 if (
TII->isTRANS(Inst))
2742 return VGPR_TRANS_WRITE;
2745 return VGPR_DPMACC_WRITE;
2747 return VGPR_CSMACC_WRITE;
2754 if (
TII->isFLAT(Inst))
2755 return VGPR_FLAT_READ;
2757 if (
TII->isDS(Inst))
2758 return VGPR_LDS_READ;
2760 if (
TII->isVMEM(Inst) ||
TII->isVIMAGE(Inst) ||
TII->isVSAMPLE(Inst))
2761 return VGPR_VMEM_READ;
2768bool SIInsertWaitcnts::isVmemAccess(
const MachineInstr &
MI)
const {
2769 return (
TII->isFLAT(
MI) &&
TII->mayAccessVMEMThroughFlat(
MI)) ||
2776 MachineBasicBlock *
Block)
const {
2777 auto BlockEnd =
Block->getParent()->end();
2778 auto BlockIter =
Block->getIterator();
2782 if (++BlockIter != BlockEnd) {
2783 It = BlockIter->instr_begin();
2790 if (!It->isMetaInstruction())
2798 return It->getOpcode() == AMDGPU::S_ENDPGM;
2802bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2803 MachineBasicBlock &
Block,
2804 WaitcntBrackets &ScoreBrackets) {
2805 AMDGPU::Waitcnt
Wait;
2806 bool NeedsEndPGMCheck =
false;
2814 NeedsEndPGMCheck =
true;
2817 ScoreBrackets.simplifyWaitcnt(
Wait);
2820 bool Result = generateWaitcnt(
Wait, SuccessorIt,
Block, ScoreBrackets,
2823 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &
Block)) {
2831WaitEventSet SIInsertWaitcnts::getEventsFor(
const MachineInstr &Inst)
const {
2832 WaitEventSet Events;
2834 if (
const auto ET = getExpertSchedulingEventType(Inst))
2838 if (
TII->isDS(Inst) &&
TII->usesLGKM_CNT(Inst)) {
2840 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2841 Events.insert(GDS_ACCESS);
2842 Events.insert(GDS_GPR_LOCK);
2844 Events.insert(LDS_ACCESS);
2846 }
else if (
TII->isFLAT(Inst)) {
2848 Events.insert(getVmemWaitEventType(Inst));
2851 if (
TII->mayAccessVMEMThroughFlat(Inst)) {
2852 if (
ST->hasWaitXcnt())
2853 Events.insert(VMEM_GROUP);
2854 Events.insert(getVmemWaitEventType(Inst));
2856 if (
TII->mayAccessLDSThroughFlat(Inst))
2857 Events.insert(LDS_ACCESS);
2861 Inst.
getOpcode() == AMDGPU::BUFFER_WBL2)) {
2865 if (
ST->hasWaitXcnt())
2866 Events.insert(VMEM_GROUP);
2867 Events.insert(getVmemWaitEventType(Inst));
2868 if (
ST->vmemWriteNeedsExpWaitcnt() &&
2870 Events.insert(VMW_GPR_LOCK);
2872 }
else if (
TII->isSMRD(Inst)) {
2873 if (
ST->hasWaitXcnt())
2874 Events.insert(SMEM_GROUP);
2875 Events.insert(SMEM_ACCESS);
2877 Events.insert(EXP_LDS_ACCESS);
2879 unsigned Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2881 Events.insert(EXP_PARAM_ACCESS);
2883 Events.insert(EXP_POS_ACCESS);
2885 Events.insert(EXP_GPR_LOCK);
2887 Events.insert(SCC_WRITE);
2890 case AMDGPU::S_SENDMSG:
2891 case AMDGPU::S_SENDMSG_RTN_B32:
2892 case AMDGPU::S_SENDMSG_RTN_B64:
2893 case AMDGPU::S_SENDMSGHALT:
2894 Events.insert(SQ_MESSAGE);
2896 case AMDGPU::S_MEMTIME:
2897 case AMDGPU::S_MEMREALTIME:
2898 case AMDGPU::S_GET_BARRIER_STATE_M0:
2899 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2900 Events.insert(SMEM_ACCESS);
2907void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2908 WaitcntBrackets *ScoreBrackets) {
2910 WaitEventSet InstEvents = getEventsFor(Inst);
2911 for (WaitEventType
E : wait_events()) {
2912 if (InstEvents.contains(
E))
2913 ScoreBrackets->updateByEvent(
E, Inst);
2916 if (
TII->isDS(Inst) &&
TII->usesLGKM_CNT(Inst)) {
2918 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2919 ScoreBrackets->setPendingGDS();
2921 }
else if (
TII->isFLAT(Inst)) {
2929 ScoreBrackets->setPendingFlat();
2930 }
else if (Inst.
isCall()) {
2932 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(
false));
2933 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2934 }
else if (
TII->isVINTERP(Inst)) {
2935 int64_t
Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2940bool WaitcntBrackets::mergeScore(
const MergeInfo &M,
unsigned &Score,
2941 unsigned OtherScore) {
2942 unsigned MyShifted = Score <=
M.OldLB ? 0 : Score +
M.MyShift;
2943 unsigned OtherShifted =
2944 OtherScore <=
M.OtherLB ? 0 : OtherScore +
M.OtherShift;
2945 Score = std::max(MyShifted, OtherShifted);
2946 return OtherShifted > MyShifted;
2951 bool StrictDom =
false;
2955 if (AsyncMarks.empty() && OtherMarks.
empty()) {
2962 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.
size());
2963 MaxSize = std::min(MaxSize, MaxAsyncMarks);
2966 if (AsyncMarks.size() > MaxSize)
2967 AsyncMarks.erase(AsyncMarks.begin(),
2968 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
2974 constexpr CounterValueArray ZeroMark{};
2975 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
2978 dbgs() <<
"Before merge:\n";
2979 for (
const auto &Mark : AsyncMarks) {
2983 dbgs() <<
"Other marks:\n";
2984 for (
const auto &Mark : OtherMarks) {
2993 unsigned OtherSize = OtherMarks.size();
2994 unsigned OurSize = AsyncMarks.size();
2995 unsigned MergeCount = std::min(OtherSize, OurSize);
2998 StrictDom |= mergeScore(MergeInfos[
T], AsyncMarks[OurSize - Idx][
T],
2999 OtherMarks[OtherSize - Idx][
T]);
3004 dbgs() <<
"After merge:\n";
3005 for (
const auto &Mark : AsyncMarks) {
3019bool WaitcntBrackets::merge(
const WaitcntBrackets &
Other) {
3020 bool StrictDom =
false;
3024 for (
auto K :
Other.VMem.keys())
3025 VMem.try_emplace(K);
3026 for (
auto K :
Other.SGPRs.keys())
3027 SGPRs.try_emplace(K);
3034 const WaitEventSet &EventsForT =
Context->getWaitEvents(
T);
3035 const WaitEventSet OldEvents = PendingEvents & EventsForT;
3036 const WaitEventSet OtherEvents =
Other.PendingEvents & EventsForT;
3037 if (!OldEvents.contains(OtherEvents))
3039 PendingEvents |= OtherEvents;
3042 const unsigned MyPending = ScoreUBs[
T] - ScoreLBs[
T];
3043 const unsigned OtherPending =
Other.ScoreUBs[
T] -
Other.ScoreLBs[
T];
3044 const unsigned NewUB = ScoreLBs[
T] + std::max(MyPending, OtherPending);
3045 if (NewUB < ScoreLBs[
T])
3048 MergeInfo &
M = MergeInfos[
T];
3049 M.OldLB = ScoreLBs[
T];
3050 M.OtherLB =
Other.ScoreLBs[
T];
3051 M.MyShift = NewUB - ScoreUBs[
T];
3052 M.OtherShift = NewUB -
Other.ScoreUBs[
T];
3054 ScoreUBs[
T] = NewUB;
3056 StrictDom |= mergeScore(M, LastFlat[
T],
Other.LastFlat[
T]);
3059 StrictDom |= mergeScore(M, LastGDS,
Other.LastGDS);
3062 StrictDom |= mergeScore(M, SCCScore,
Other.SCCScore);
3063 if (
Other.hasPendingEvent(SCC_WRITE)) {
3064 if (!OldEvents.contains(SCC_WRITE)) {
3065 PendingSCCWrite =
Other.PendingSCCWrite;
3066 }
else if (PendingSCCWrite !=
Other.PendingSCCWrite) {
3067 PendingSCCWrite =
nullptr;
3072 for (
auto &[RegID, Info] : VMem)
3073 StrictDom |= mergeScore(M,
Info.Scores[
T],
Other.getVMemScore(RegID,
T));
3075 if (isSmemCounter(
T)) {
3076 unsigned Idx = getSgprScoresIdx(
T);
3077 for (
auto &[RegID, Info] : SGPRs) {
3078 auto It =
Other.SGPRs.find(RegID);
3079 unsigned OtherScore =
3080 (It !=
Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
3081 StrictDom |= mergeScore(M,
Info.Scores[Idx], OtherScore);
3086 for (
auto &[TID, Info] : VMem) {
3087 if (
auto It =
Other.VMem.find(TID); It !=
Other.VMem.end()) {
3088 unsigned char NewVmemTypes =
Info.VMEMTypes | It->second.VMEMTypes;
3089 StrictDom |= NewVmemTypes !=
Info.VMEMTypes;
3090 Info.VMEMTypes = NewVmemTypes;
3094 StrictDom |= mergeAsyncMarks(MergeInfos,
Other.AsyncMarks);
3096 StrictDom |= mergeScore(MergeInfos[
T], AsyncScore[
T],
Other.AsyncScore[
T]);
3098 purgeEmptyTrackingData();
3104 return Opcode == AMDGPU::S_WAITCNT ||
3107 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
3108 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
3109 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
3110 Opcode == AMDGPU::WAIT_ASYNCMARK ||
3114void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &
MBB,
3116 bool ExpertMode)
const {
3120 .
addImm(ExpertMode ? 2 : 0)
3138class VCCZWorkaround {
3139 const WaitcntBrackets &ScoreBrackets;
3140 const GCNSubtarget &
ST;
3141 const SIInstrInfo &
TII;
3142 const SIRegisterInfo &
TRI;
3143 bool VCCZCorruptionBug =
false;
3144 bool VCCZNotUpdatedByPartialWrites =
false;
3147 bool MustRecomputeVCCZ =
true;
3150 VCCZWorkaround(
const WaitcntBrackets &ScoreBrackets,
const GCNSubtarget &ST,
3151 const SIInstrInfo &
TII,
const SIRegisterInfo &
TRI)
3153 VCCZCorruptionBug =
ST.hasReadVCCZBug();
3154 VCCZNotUpdatedByPartialWrites = !
ST.partialVCCWritesUpdateVCCZ();
3161 bool tryRecomputeVCCZ(MachineInstr &
MI) {
3163 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
3173 MustRecomputeVCCZ |= VCCZCorruptionBug &&
TII.isSMRD(
MI);
3179 std::optional<bool> PartiallyWritesToVCCOpt;
3180 auto PartiallyWritesToVCC = [](MachineInstr &
MI) {
3181 return MI.definesRegister(AMDGPU::VCC_LO,
nullptr) ||
3182 MI.definesRegister(AMDGPU::VCC_HI,
nullptr);
3184 if (VCCZNotUpdatedByPartialWrites) {
3185 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
3188 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
3194 if (!ScoreBrackets.hasPendingEvent(SMEM_ACCESS) || !VCCZCorruptionBug) {
3196 if (!PartiallyWritesToVCCOpt)
3197 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
3198 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
3199 MI.definesRegister(AMDGPU::VCC,
nullptr);
3202 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
3203 *PartiallyWritesToVCCOpt);
3205 MustRecomputeVCCZ =
false;
3215 TII.get(
ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
3218 MustRecomputeVCCZ =
false;
3228bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
3229 MachineBasicBlock &
Block,
3230 WaitcntBrackets &ScoreBrackets) {
3234 dbgs() <<
"*** Begin Block: ";
3236 ScoreBrackets.dump();
3238 VCCZWorkaround VCCZW(ScoreBrackets, *ST, *
TII, *
TRI);
3241 MachineInstr *OldWaitcntInstr =
nullptr;
3246 Iter !=
E; ++Iter) {
3247 MachineInstr &Inst = *Iter;
3253 (IsExpertMode && Inst.
getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3254 if (!OldWaitcntInstr)
3255 OldWaitcntInstr = &Inst;
3259 PreheaderFlushFlags FlushFlags;
3260 if (
Block.getFirstTerminator() == Inst)
3261 FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3264 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3266 OldWaitcntInstr =
nullptr;
3268 if (Inst.
getOpcode() == AMDGPU::ASYNCMARK) {
3274 assert(
ST->getGeneration() < AMDGPUSubtarget::GFX12);
3275 ScoreBrackets.recordAsyncMark(Inst);
3279 if (
TII->isSMRD(Inst)) {
3280 for (
const MachineMemOperand *Memop : Inst.
memoperands()) {
3283 if (!Memop->isInvariant()) {
3284 const Value *Ptr = Memop->getValue();
3290 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3294 Modified |= insertForcedWaitAfter(Inst,
Block, ScoreBrackets);
3298 ScoreBrackets.dump();
3303 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3308 AMDGPU::Waitcnt
Wait;
3309 if (
Block.getFirstTerminator() ==
Block.end()) {
3310 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3311 if (FlushFlags.FlushVmCnt) {
3312 if (ScoreBrackets.hasPendingEvent(
LOAD_CNT))
3314 if (ScoreBrackets.hasPendingEvent(
SAMPLE_CNT))
3316 if (ScoreBrackets.hasPendingEvent(
BVH_CNT))
3319 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
DS_CNT))
3328 dbgs() <<
"*** End Block: ";
3330 ScoreBrackets.dump();
3336bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &
Block) {
3337 if (
Block.size() <= 1)
3345 MachineInstr *LastAtomicWithSoftXcnt =
nullptr;
3350 TII->isDS(
MI) || (
TII->isFLAT(
MI) &&
TII->mayAccessLDSThroughFlat(
MI));
3351 if (!IsLDS && (
MI.mayLoad() ^
MI.mayStore()))
3352 LastAtomicWithSoftXcnt =
nullptr;
3355 MI.mayLoad() &&
MI.mayStore();
3356 MachineInstr &PrevMI = *
MI.getPrevNode();
3358 if (PrevMI.
getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3361 if (LastAtomicWithSoftXcnt) {
3365 LastAtomicWithSoftXcnt = &
MI;
3373SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &
MBB,
3374 const WaitcntBrackets &ScoreBrackets) {
3375 auto [Iterator, IsInserted] =
3378 return Iterator->second;
3382 return PreheaderFlushFlags();
3386 return PreheaderFlushFlags();
3389 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3390 return Iterator->second;
3393 return PreheaderFlushFlags();
3396bool SIInsertWaitcnts::isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const {
3398 return TII->mayAccessVMEMThroughFlat(
MI);
3402bool SIInsertWaitcnts::isDSRead(
const MachineInstr &
MI)
const {
3408bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const {
3437SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *
ML,
3438 const WaitcntBrackets &Brackets) {
3439 PreheaderFlushFlags
Flags;
3440 bool HasVMemLoad =
false;
3441 bool HasVMemStore =
false;
3442 bool UsesVgprVMEMLoadedOutside =
false;
3443 bool UsesVgprDSReadOutside =
false;
3444 bool VMemInvalidated =
false;
3448 bool TrackSimpleDSOpt =
ST->hasExtendedWaitCounts();
3449 DenseSet<MCRegUnit> VgprUse;
3450 DenseSet<MCRegUnit> VgprDefVMEM;
3451 DenseSet<MCRegUnit> VgprDefDS;
3457 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3458 unsigned DSReadPosition = 0;
3459 bool IsSingleBlock =
ML->getNumBlocks() == 1;
3460 bool TrackDSFlushPoint =
ST->hasExtendedWaitCounts() && IsSingleBlock;
3461 unsigned LastDSFlushPosition = 0;
3463 for (MachineBasicBlock *
MBB :
ML->blocks()) {
3464 for (MachineInstr &
MI : *
MBB) {
3465 if (isVMEMOrFlatVMEM(
MI)) {
3466 HasVMemLoad |=
MI.mayLoad();
3467 HasVMemStore |=
MI.mayStore();
3471 if (mayStoreIncrementingDSCNT(
MI)) {
3474 if (VMemInvalidated)
3476 TrackSimpleDSOpt =
false;
3477 TrackDSFlushPoint =
false;
3479 bool IsDSRead = isDSRead(
MI);
3484 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3485 if (!TrackDSFlushPoint)
3487 if (
auto It = LastDSReadPositionMap.
find(RU);
3488 It != LastDSReadPositionMap.
end()) {
3492 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
3496 for (
const MachineOperand &
Op :
MI.all_uses()) {
3497 if (
Op.isDebug() || !
TRI->isVectorRegister(*MRI,
Op.getReg()))
3500 for (MCRegUnit RU :
TRI->regunits(
Op.getReg().asMCReg())) {
3504 VMemInvalidated =
true;
3508 TrackSimpleDSOpt =
false;
3511 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3515 updateDSReadFlushTracking(RU);
3520 VMEMID
ID = toVMEMID(RU);
3524 UsesVgprVMEMLoadedOutside =
true;
3528 else if (Brackets.hasPendingVMEM(
ID,
DS_CNT))
3529 UsesVgprDSReadOutside =
true;
3534 if (isVMEMOrFlatVMEM(
MI) &&
MI.mayLoad()) {
3535 for (
const MachineOperand &
Op :
MI.all_defs()) {
3536 for (MCRegUnit RU :
TRI->regunits(
Op.getReg().asMCReg())) {
3540 VMemInvalidated =
true;
3545 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3556 if (IsDSRead || TrackDSFlushPoint) {
3557 for (
const MachineOperand &
Op :
MI.all_defs()) {
3558 if (!
TRI->isVectorRegister(*MRI,
Op.getReg()))
3560 for (MCRegUnit RU :
TRI->regunits(
Op.getReg().asMCReg())) {
3563 updateDSReadFlushTracking(RU);
3566 if (TrackDSFlushPoint)
3567 LastDSReadPositionMap[RU] = DSReadPosition;
3576 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3577 ((!
ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3578 (HasVMemLoad &&
ST->hasVmemWriteVgprInOrder())))
3579 Flags.FlushVmCnt =
true;
3585 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3588 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3589 bool DSFlushPointPrefetch =
3590 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3592 if (SimpleDSOpt || DSFlushPointPrefetch)
3593 Flags.FlushDsCnt =
true;
3598bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3599 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3601 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3603 if (
auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3604 AA = &AAR->getAAResults();
3606 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3618 if (!SIInsertWaitcnts(MLI, PDT,
AA, MF).
run())
3623 .preserve<AAManager>();
3626bool SIInsertWaitcnts::run() {
3628 TII = ST->getInstrInfo();
3629 TRI = &
TII->getRegisterInfo();
3638 if (ST->hasExtendedWaitCounts()) {
3639 IsExpertMode = ST->hasExpertSchedulingMode() &&
3647 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, &Limits,
3657 ForceEmitWaitcnt[
T] =
false;
3659 SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);
3664 MachineBasicBlock &EntryBB = MF.
front();
3674 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3677 if (
ST->hasExtendedWaitCounts()) {
3684 if (!
ST->hasImageInsts() &&
3689 TII->get(instrsForExtendedCounterTypes[CT]))
3702 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
this);
3703 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3704 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3711 for (
auto *
MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3714 std::unique_ptr<WaitcntBrackets> Brackets;
3719 for (
auto BII = BlockInfos.
begin(), BIE = BlockInfos.
end(); BII != BIE;
3721 MachineBasicBlock *
MBB = BII->first;
3722 BlockInfo &BI = BII->second;
3728 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3730 *Brackets = *BI.Incoming;
3733 Brackets = std::make_unique<WaitcntBrackets>(
this);
3738 Brackets->~WaitcntBrackets();
3739 new (Brackets.get()) WaitcntBrackets(
this);
3743 if (
ST->hasWaitXcnt())
3745 Modified |= insertWaitcntInBlock(MF, *
MBB, *Brackets);
      if (Brackets->hasPendingEvent()) {
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto *SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else {
            LLVM_DEBUG({
              dbgs() << "Try to merge ";
              // ...
            });
            if (SuccBI.Incoming->merge(*Brackets)) {
              SuccBI.Dirty = true;
              if (SuccBII <= BII)
                Repeat = true;
            }
          }
        }
        // Hand the current brackets to one not-yet-initialized successor to
        // avoid copying the (large) score arrays in the common case.
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);
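  // Post-pass fix-ups follow: scalar-store cache write-backs, expert
  // scheduling mode toggles, and end-of-program VGPR deallocation.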
  if (ST->hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;
    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;
        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(&MBB);
      }
    }
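    // If scalar writes are used, the cache must be flushed before the wave
    // terminates or the next wave reusing the same scratch memory could read
    // stale data. Scan each end block, tracking whether the most recent
    // scalar-memory operation was already an s_dcache_wb; only uncovered
    // program ends get a new write-back.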
    if (HaveScalarStores) {
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;
        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(*I))
            SeenDCacheWB = false;
          // FIXME: It would be better to insert this before a waitcnt if any.
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }
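  // Expert scheduling (GFX12+): the mode is enabled for the body of the
  // function but switched off around call and return boundaries, since the
  // code on the other side of the boundary may not run in the same mode.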
  if (IsExpertMode) {
    I = EntryBB.begin();
    while (I != EntryBB.end() && I->isMetaInstruction())
      ++I;
    setSchedulingMode(EntryBB, I, true);

    for (MachineInstr *MI : CallInsts) {
      MachineBasicBlock &MBB = *MI->getParent();
      setSchedulingMode(MBB, MI, false);
      setSchedulingMode(MBB, std::next(MI->getIterator()), true);
    }
    for (MachineInstr *MI : ReturnInsts)
      setSchedulingMode(*MI->getParent(), MI, false);
  }
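  // End-of-program VGPR handling: in dynamic-VGPR mode the allocation is
  // explicitly shrunk with s_alloc_vgpr; otherwise, on GFX11+, a
  // dealloc_vgprs message lets the wave release its registers early when
  // that could improve occupancy.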
  if (MFI->isDynamicVGPREnabled()) {
    for (auto [MI, _] : EndPgmInsts) {
      // ...
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
              TII->get(AMDGPU::S_ALLOC_VGPR))
          .addImm(0);
      // ...
    }
  } else if (!WCG->isOptNone() &&
             ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
             (MF.getFrameInfo().hasCalls() ||
              ST->getOccupancyWithNumVGPRs(
                  TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
                  /* ... */) < AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
    for (auto [MI, Flag] : EndPgmInsts) {
      // ...
      if (ST->requiresNopBeforeDeallocVGPRs()) {
        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                TII->get(AMDGPU::S_NOP))
            .addImm(0);
      }
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
              TII->get(AMDGPU::S_SENDMSG))
          .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
      // ...
    }
  }

  ReturnInsts.clear();
  EndPgmInsts.clear();
  PreheadersToFlush.clear();
  SLoadAddresses.clear();

  return Modified;
}