#define DEBUG_TYPE "si-insert-waitcnts"
              "Force emit s_waitcnt expcnt(0) instrs");
              "Force emit s_waitcnt lgkmcnt(0) instrs");
              "Force emit s_waitcnt vmcnt(0) instrs");
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    "amdgpu-waitcnt-load-forcezero",
    cl::desc("Force all waitcnt load counters to wait until 0"),
    "amdgpu-expert-scheduling-mode",
    cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS,
  NUM_EXTENDED_INST_CNTS,
  VA_VDST = NUM_EXTENDED_INST_CNTS,
  NUM_INST_CNTS = NUM_EXPERT_INST_CNTS
auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
  return enum_seq(LOAD_CNT, MaxCounter);
  return MI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft;
enum class AtomicRMWState {
  TRACKINGID_RANGE_LEN = (1 << 16),
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
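// Tracking IDs form one flat space: the first TRACKINGID_RANGE_LEN slots
// mirror MCRegUnit numbers directly, and the LDS-DMA slots occupy the range
// [LDSDMA_BEGIN, LDSDMA_END) immediately above the register units.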
static constexpr VMEMID toVMEMID(MCRegUnit RU) {
  return static_cast<unsigned>(RU);
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
  DECL(VMEM_SAMPLER_READ_ACCESS)                                               \
  DECL(VMEM_BVH_READ_ACCESS)                                                   \
  DECL(GLOBAL_INV_ACCESS)                                                      \
  DECL(VMEM_WRITE_ACCESS)                                                      \
  DECL(SCRATCH_WRITE_ACCESS)                                                   \
  DECL(EXP_POS_ACCESS)                                                         \
  DECL(EXP_PARAM_ACCESS)                                                       \
  DECL(EXP_LDS_ACCESS)                                                         \
  DECL(VGPR_CSMACC_WRITE)                                                      \
  DECL(VGPR_DPMACC_WRITE)                                                      \
  DECL(VGPR_TRANS_WRITE)                                                       \
  DECL(VGPR_XDL_WRITE)                                                         \
  DECL(VGPR_LDS_READ)                                                          \
  DECL(VGPR_FLAT_READ)                                                         \
#define AMDGPU_EVENT_ENUM(Name) Name,
#undef AMDGPU_EVENT_ENUM
auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
  return enum_seq(VMEM_ACCESS, MaxEvent);
#define AMDGPU_EVENT_NAME(Name) #Name,
#undef AMDGPU_EVENT_NAME
static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
  return WaitEventTypeName[Event];
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
    AMDGPU::S_WAIT_LOADCNT,  AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
    AMDGPU::S_WAIT_KMCNT,    AMDGPU::S_WAIT_XCNT};
static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;
  assert(updateVMCntOnly(Inst));
    return VMEM_NOSAMPLER;
  return VMEM_NOSAMPLER;
    return Wait.StoreCnt;
    return Wait.SampleCnt;
  unsigned &WC = getCounterRef(Wait, T);
  WC = std::min(WC, Count);
  getCounterRef(Wait, T) = ~0u;
  return getCounterRef(Wait, T);
  WaitEventSet() = default;
  explicit constexpr WaitEventSet(WaitEventType Event) {
    static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,
                  "Not enough bits in Mask for all the events");
  constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
    for (auto &E : Events) {
  void insert(const WaitEventType &Event) { Mask |= 1 << Event; }
  void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }
  void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
  bool contains(const WaitEventType &Event) const {
    return Mask & (1 << Event);
    return (~Mask & Other.Mask) == 0;
    return Mask == Other.Mask;
  bool empty() const { return Mask == 0; }
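  // Mask & (Mask - 1) clears the lowest set bit, so the result is nonzero
  // exactly when two or more event bits are set.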
  bool twoOrMore() const { return Mask & (Mask - 1); }
  operator bool() const { return !empty(); }
  void print(raw_ostream &OS) const {
    ListSeparator LS(", ");
    for (WaitEventType Event : wait_events()) {
      OS << LS << getWaitEventTypeName(Event);
void WaitEventSet::dump() const {
InstCounterType eventCounter(const WaitEventSet *masks, WaitEventType E) {
  for (auto T : inst_counter_types()) {
class WaitcntBrackets;
class WaitcntGenerator {
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  AMDGPU::IsaVersion IV;
  InstCounterType MaxCounter;
  bool ExpandWaitcntProfiling = false;
  const AMDGPU::HardwareLimits *Limits = nullptr;
  WaitcntGenerator() = delete;
  WaitcntGenerator(const WaitcntGenerator &) = delete;
  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
                   const AMDGPU::HardwareLimits *Limits)
      : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
        ExpandWaitcntProfiling(
            MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
  bool isOptNone() const { return OptNone; }
  const AMDGPU::HardwareLimits &getLimits() const { return *Limits; }
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                AMDGPU::Waitcnt Wait,
                                const WaitcntBrackets &ScoreBrackets) = 0;
  virtual const WaitEventSet *getWaitEventMask() const = 0;
  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
  virtual ~WaitcntGenerator() = default;
class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
  static constexpr const WaitEventSet
      WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
          {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
          WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
          WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
                        EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
          WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
  using WaitcntGenerator::WaitcntGenerator;
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
  bool createNewWaitcnt(MachineBasicBlock &Block,
                        AMDGPU::Waitcnt Wait,
                        const WaitcntBrackets &ScoreBrackets) override;
  const WaitEventSet *getWaitEventMask() const override {
    return WaitEventMaskForInstPreGFX12;
  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
  static constexpr const WaitEventSet
      WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
          WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
          WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
          WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
                        EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
          WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
          WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
          WaitEventSet({VMEM_BVH_READ_ACCESS}),
          WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
          WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
          WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
          WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
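  // The rows above are indexed by InstCounterType, i.e. LOAD_CNT, DS_CNT,
  // EXP_CNT, STORE_CNT, SAMPLE_CNT, BVH_CNT, KM_CNT, X_CNT and, in expert
  // scheduling mode, VA_VDST and VM_VSRC.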
  WaitcntGeneratorGFX12Plus() = delete;
  WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
                            InstCounterType MaxCounter,
                            const AMDGPU::HardwareLimits *Limits,
      : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
  bool createNewWaitcnt(MachineBasicBlock &Block,
                        AMDGPU::Waitcnt Wait,
                        const WaitcntBrackets &ScoreBrackets) override;
  const WaitEventSet *getWaitEventMask() const override {
    return WaitEventMaskForInstGFX12Plus;
  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
struct PreheaderFlushFlags {
  bool FlushVmCnt = false;
  bool FlushDsCnt = false;
class SIInsertWaitcnts {
  const GCNSubtarget *ST;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  InstCounterType SmemAccessCounter;
  InstCounterType MaxCounter;
  bool IsExpertMode = false;
  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
  DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
  MachineLoopInfo *MLI;
  MachinePostDominatorTree *PDT;
  std::unique_ptr<WaitcntBrackets> Incoming;
  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];
  std::unique_ptr<WaitcntGenerator> WCG;
  DenseSet<MachineInstr *> CallInsts;
  DenseSet<MachineInstr *> ReturnInsts;
  DenseMap<MachineInstr *, bool> EndPgmInsts;
  AMDGPU::HardwareLimits Limits;
  SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
      : MLI(MLI), PDT(PDT), AA(AA) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
  PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
                                             const WaitcntBrackets &Brackets);
  PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
                                         const WaitcntBrackets &ScoreBrackets);
  bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
  bool isDSRead(const MachineInstr &MI) const;
  bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
  bool run(MachineFunction &MF);
  void setForceEmitWaitcnt() {
      ForceEmitWaitcnt[EXP_CNT] = true;
      ForceEmitWaitcnt[EXP_CNT] = false;
      ForceEmitWaitcnt[DS_CNT] = true;
      ForceEmitWaitcnt[KM_CNT] = true;
      ForceEmitWaitcnt[DS_CNT] = false;
      ForceEmitWaitcnt[KM_CNT] = false;
      ForceEmitWaitcnt[LOAD_CNT] = true;
      ForceEmitWaitcnt[SAMPLE_CNT] = true;
      ForceEmitWaitcnt[BVH_CNT] = true;
      ForceEmitWaitcnt[LOAD_CNT] = false;
      ForceEmitWaitcnt[SAMPLE_CNT] = false;
      ForceEmitWaitcnt[BVH_CNT] = false;
    ForceEmitWaitcnt[VA_VDST] = false;
    ForceEmitWaitcnt[VM_VSRC] = false;
  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
    case AMDGPU::GLOBAL_INV:
      return GLOBAL_INV_ACCESS;
    case AMDGPU::GLOBAL_WB:
    case AMDGPU::GLOBAL_WBINV:
      return VMEM_WRITE_ACCESS;
    static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
        VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
      if (TII->mayAccessScratch(Inst))
        return SCRATCH_WRITE_ACCESS;
      return VMEM_WRITE_ACCESS;
    return VmemReadMapping[getVmemType(Inst)];
  std::optional<WaitEventType>
  getExpertSchedulingEventType(const MachineInstr &Inst) const;
  bool isVmemAccess(const MachineInstr &MI) const;
  bool generateWaitcntInstBefore(MachineInstr &MI,
                                 WaitcntBrackets &ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr,
                                 PreheaderFlushFlags FlushFlags);
  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
                       MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               WaitcntBrackets *ScoreBrackets);
                    MachineBasicBlock *Block) const;
  bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
                          bool ExpertMode) const;
  AtomicRMWState getAtomicRMWState(MachineInstr &MI,
                                   AtomicRMWState PrevState) const;
  const WaitEventSet *getWaitEventMask() const {
    return WCG->getWaitEventMask();
class WaitcntBrackets {
    unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
    for (auto &[ID, Val] : VMem) {
    for (auto &[ID, Val] : SGPRs) {
    if (NumUnusedVmem || NumUnusedSGPRs) {
      errs() << "WaitcntBracket had unused entries at destruction time: "
             << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
             << " SGPR unused entries\n";
  bool isSmemCounter(InstCounterType T) const {
    return T == Context->SmemAccessCounter || T == X_CNT;
  unsigned getSgprScoresIdx(InstCounterType T) const {
    assert(isSmemCounter(T) && "Invalid SMEM counter");
    return T == X_CNT ? 1 : 0;
  unsigned getScoreLB(InstCounterType T) const {
  unsigned getScoreUB(InstCounterType T) const {
  unsigned getScoreRange(InstCounterType T) const {
    return getScoreUB(T) - getScoreLB(T);
  unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
    auto It = SGPRs.find(RU);
    return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;
  unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
    auto It = VMem.find(TID);
    return It != VMem.end() ? It->second.Scores[T] : 0;
  bool counterOutOfOrder(InstCounterType T) const;
  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
                       AMDGPU::Waitcnt &UpdateWait) const;
  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
                    AMDGPU::Waitcnt &UpdateWait) const;
  void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
                      AMDGPU::Waitcnt &UpdateWait) const;
  void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
                               AMDGPU::Waitcnt &Wait) const;
  void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
                              AMDGPU::Waitcnt &Wait) const;
  void tryClearSCCWriteEvent(MachineInstr *Inst);
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(WaitEventType E, MachineInstr &MI);
  bool hasPendingEvent() const { return !PendingEvents.empty(); }
  bool hasPendingEvent(WaitEventType E) const {
    return PendingEvents.contains(E);
  bool hasPendingEvent(InstCounterType T) const {
    bool HasPending = PendingEvents & Context->getWaitEventMask()[T];
    assert(HasPending == (getScoreRange(T) != 0) &&
           "Expected no pending events iff scoreboard is empty");
  bool hasMixedPendingEvents(InstCounterType T) const {
    WaitEventSet Events = PendingEvents & Context->getWaitEventMask()[T];
    return Events.twoOrMore();
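  // A FLAT access may complete through either the LGKM/DS or the VM/LOAD
  // counter, so the last FLAT score is tracked against both; it is "pending"
  // while that score still lies inside the open (LB, UB] range of either one.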
  bool hasPendingFlat() const {
    return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
             LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
            (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
             LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
  void setPendingFlat() {
    LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
    LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
  bool hasPendingGDS() const {
    return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
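  // The wait needed for the last GDS access is the number of DS events issued
  // since it, clamped to the largest value the DS_CNT field can encode.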
  unsigned getPendingGDSWait() const {
    return std::min(getScoreUB(DS_CNT) - LastGDS,
                    getWaitCountMax(Context->getLimits(), DS_CNT) - 1);
  void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
  bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
    for (MCRegUnit RU : regunits(Reg)) {
      auto It = VMem.find(toVMEMID(RU));
      if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
    for (MCRegUnit RU : regunits(Reg)) {
      if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
        It->second.VMEMTypes = 0;
        if (It->second.empty())
  void setStateOnFunctionEntryOrReturn() {
    setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
                              getWaitCountMax(Context->getLimits(), STORE_CNT));
    PendingEvents |= Context->getWaitEventMask()[STORE_CNT];
  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
  bool hasPointSampleAccel(const MachineInstr &MI) const;
  bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
  void print(raw_ostream &) const;
  void purgeEmptyTrackingData();
  void determineWaitForScore(InstCounterType T, unsigned Score,
                             AMDGPU::Waitcnt &Wait) const;
  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);
    assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
    const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
    unsigned Size = Context->TRI->getRegSizeInBits(*RC);
    if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
  void setScoreLB(InstCounterType T, unsigned Val) {
  void setScoreUB(InstCounterType T, unsigned Val) {
    if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))
          ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);
  void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
    if (Reg == AMDGPU::SCC) {
      for (MCRegUnit RU : regunits(Reg))
        VMem[toVMEMID(RU)].Scores[T] = Val;
      auto STy = getSgprScoresIdx(T);
      for (MCRegUnit RU : regunits(Reg))
        SGPRs[RU].Scores[STy] = Val;
  void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
    VMem[TID].Scores[T] = Val;
  void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
  const SIInsertWaitcnts *Context;
  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  WaitEventSet PendingEvents;
  unsigned LastFlat[NUM_INST_CNTS] = {0};
  unsigned LastGDS = 0;
    std::array<unsigned, NUM_INST_CNTS> Scores = {0};
    unsigned VMEMTypes = 0;
    std::array<unsigned, 2> Scores = {0};
    bool empty() const { return !Scores[0] && !Scores[1]; }
  DenseMap<VMEMID, VMEMInfo> VMem;
  DenseMap<MCRegUnit, SGPRInfo> SGPRs;
  unsigned SCCScore = 0;
  const MachineInstr *PendingSCCWrite = nullptr;
  SmallVector<const MachineInstr *> LDSDMAStores;
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
  SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override {
    return "SI insert wait instructions";
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
                                        InstCounterType CntTy,
                                        unsigned Score) {
  setRegScore(Op.getReg().asMCReg(), CntTy, Score);
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
  if (!hasPointSampleAccel(MI))
  return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = eventCounter(Context->getWaitEventMask(), E);
  unsigned UB = getScoreUB(T);
  unsigned CurrScore = UB + 1;
  PendingEvents.insert(E);
  setScoreUB(T, CurrScore);
  const MachineRegisterInfo *MRI = Context->MRI;
      if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
        setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);
      if (const auto *Data0 =
              TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
        setScoreByOperand(*Data0, EXP_CNT, CurrScore);
      if (const auto *Data1 =
              TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
        setScoreByOperand(*Data1, EXP_CNT, CurrScore);
          Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME &&
          Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (const MachineOperand &Op : Inst.all_uses()) {
          if (TRI->isVectorRegister(*MRI, Op.getReg()))
            setScoreByOperand(Op, EXP_CNT, CurrScore);
    } else if (TII->isFLAT(Inst)) {
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
    } else if (TII->isMIMG(Inst)) {
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
    } else if (TII->isMTBUF(Inst)) {
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
    } else if (TII->isMUBUF(Inst)) {
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
    } else if (TII->isLDSDIR(Inst)) {
      setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
                        EXP_CNT, CurrScore);
      if (TII->isEXP(Inst)) {
        for (MachineOperand &DefMO : Inst.all_defs()) {
          if (TRI->isVGPR(*MRI, DefMO.getReg())) {
            setScoreByOperand(DefMO, EXP_CNT, CurrScore);
        for (const MachineOperand &Op : Inst.all_uses()) {
          if (TRI->isVectorRegister(*MRI, Op.getReg()))
            setScoreByOperand(Op, EXP_CNT, CurrScore);
    } else if (T == X_CNT) {
      WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
      if (PendingEvents.contains(OtherEvent)) {
        setScoreLB(T, getScoreUB(T) - 1);
        PendingEvents.remove(OtherEvent);
      for (const MachineOperand &Op : Inst.all_uses())
        setScoreByOperand(Op, T, CurrScore);
    } else if (T == VA_VDST || T == VM_VSRC) {
      for (const MachineOperand &Op : Inst.operands()) {
        if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
            (T == VM_VSRC && Op.isDef()))
        setScoreByOperand(Op, T, CurrScore);
      for (const MachineOperand &Op : Inst.defs()) {
        if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
          if (!TRI->isVectorRegister(*MRI, Op.getReg()))
          if (updateVMCntOnly(Inst)) {
            VmemType V = getVmemType(Inst);
            unsigned char TypesMask = 1 << V;
            if (hasPointSampleAccel(Inst))
              TypesMask |= 1 << VMEM_NOSAMPLER;
            for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
              VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
        setScoreByOperand(Op, T, CurrScore);
          (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
        if (!MemOp->isStore() ||
        auto AAI = MemOp->getAAInfo();
        if (!AAI || !AAI.Scope)
        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
        setVMemScore(LDSDMA_BEGIN, T, CurrScore);
        if (Slot && Slot < NUM_LDSDMA)
          setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
        setRegScore(AMDGPU::SCC, T, CurrScore);
        PendingSCCWrite = &Inst;
void WaitcntBrackets::print(raw_ostream &OS) const {
  for (auto T : inst_counter_types(Context->MaxCounter)) {
    unsigned SR = getScoreRange(T);
      OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
      OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
      OS << " EXP_CNT(" << SR << "):";
      OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
      OS << " SAMPLE_CNT(" << SR << "):";
      OS << " BVH_CNT(" << SR << "):";
      OS << " KM_CNT(" << SR << "):";
      OS << " X_CNT(" << SR << "):";
      OS << " VA_VDST(" << SR << "): ";
      OS << " VM_VSRC(" << SR << "): ";
      OS << " UNKNOWN(" << SR << "):";
      unsigned LB = getScoreLB(T);
      sort(SortedVMEMIDs);
      for (auto ID : SortedVMEMIDs) {
        unsigned RegScore = VMem.at(ID).Scores[T];
        unsigned RelScore = RegScore - LB - 1;
        if (ID < REGUNITS_END) {
          OS << ' ' << RelScore << ":vRU" << ID;
          assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
                 "Unhandled/unexpected ID value!");
          OS << ' ' << RelScore << ":LDSDMA" << ID;
      if (isSmemCounter(T)) {
        sort(SortedSMEMIDs);
        for (auto ID : SortedSMEMIDs) {
          unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)];
          unsigned RelScore = RegScore - LB - 1;
          OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
      if (T == KM_CNT && SCCScore > 0)
        OS << ' ' << SCCScore << ":scc";
  OS << "Pending Events: ";
  if (hasPendingEvent()) {
    for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
      if (hasPendingEvent((WaitEventType)I)) {
        OS << LS << WaitEventTypeName[I];
void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
                                      AMDGPU::Waitcnt &UpdateWait) const {
  simplifyWaitcnt(LOAD_CNT, UpdateWait.LoadCnt);
  simplifyWaitcnt(EXP_CNT, UpdateWait.ExpCnt);
  simplifyWaitcnt(DS_CNT, UpdateWait.DsCnt);
  simplifyWaitcnt(STORE_CNT, UpdateWait.StoreCnt);
  simplifyWaitcnt(SAMPLE_CNT, UpdateWait.SampleCnt);
  simplifyWaitcnt(BVH_CNT, UpdateWait.BvhCnt);
  simplifyWaitcnt(KM_CNT, UpdateWait.KmCnt);
  simplifyXcnt(CheckWait, UpdateWait);
  simplifyWaitcnt(VA_VDST, UpdateWait.VaVdst);
  simplifyVmVsrc(CheckWait, UpdateWait);
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const {
  if (Count >= getScoreRange(T))
void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
                                   AMDGPU::Waitcnt &UpdateWait) const {
  if (CheckWait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
    UpdateWait.XCnt = ~0u;
  if (CheckWait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
      !hasPendingEvent(STORE_CNT) && CheckWait.XCnt >= CheckWait.LoadCnt)
    UpdateWait.XCnt = ~0u;
  simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
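// VM_VSRC tracks VGPR sources of in-flight VMEM/LDS accesses, so an explicit
// VM_VSRC wait becomes redundant once the instruction already waits at least
// as deeply on the memory counters (LOAD/STORE/SAMPLE/BVH/DS) themselves.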
void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
                                     AMDGPU::Waitcnt &UpdateWait) const {
      std::min({CheckWait.LoadCnt, CheckWait.StoreCnt, CheckWait.SampleCnt,
                CheckWait.BvhCnt, CheckWait.DsCnt}))
    simplifyWaitcnt(VM_VSRC, UpdateWait.VmVsrc);
void WaitcntBrackets::purgeEmptyTrackingData() {
void WaitcntBrackets::determineWaitForScore(InstCounterType T,
                                            unsigned ScoreToWait,
                                            AMDGPU::Waitcnt &Wait) const {
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
        !Context->ST->hasFlatLgkmVMemCountInOrder()) {
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      addWait(Wait, T, 0);
      unsigned NeededWait = std::min(
          UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
      addWait(Wait, T, NeededWait);
void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
                                              AMDGPU::Waitcnt &Wait) const {
  if (Reg == AMDGPU::SCC) {
    determineWaitForScore(T, SCCScore, Wait);
  for (MCRegUnit RU : regunits(Reg))
    determineWaitForScore(
        T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
                                             AMDGPU::Waitcnt &Wait) const {
  assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
  determineWaitForScore(T, getVMemScore(TID, T), Wait);
void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
  if (PendingSCCWrite &&
      PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
    WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
    if ((PendingEvents & Context->getWaitEventMask()[KM_CNT]) ==
        SCC_WRITE_PendingEvent) {
      setScoreLB(KM_CNT, getScoreUB(KM_CNT));
    PendingEvents.remove(SCC_WRITE_PendingEvent);
    PendingSCCWrite = nullptr;
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(DS_CNT, Wait.DsCnt);
  applyWaitcnt(STORE_CNT, Wait.StoreCnt);
  applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  applyWaitcnt(BVH_CNT, Wait.BvhCnt);
  applyWaitcnt(KM_CNT, Wait.KmCnt);
  applyWaitcnt(X_CNT, Wait.XCnt);
  applyWaitcnt(VA_VDST, Wait.VaVdst);
  applyWaitcnt(VM_VSRC, Wait.VmVsrc);
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
    if (counterOutOfOrder(T))
    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
    PendingEvents.remove(Context->getWaitEventMask()[T]);
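  // X_CNT shadows the memory counters: once KM_CNT or LOAD_CNT is drained, any
  // SMEM/VMEM transfer group it covered has also completed, so the same wait
  // can be forwarded to X_CNT when only one kind of group event is pending.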
  if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
    if (!hasMixedPendingEvents(X_CNT))
      applyWaitcnt(X_CNT, 0);
    PendingEvents.remove(SMEM_GROUP);
  if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
      !hasPendingEvent(STORE_CNT)) {
    if (!hasMixedPendingEvents(X_CNT))
      applyWaitcnt(X_CNT, Count);
    else if (Count == 0)
      PendingEvents.remove(VMEM_GROUP);
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
      (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
  if (T == LOAD_CNT) {
    unsigned Events = hasPendingEvent(T);
    Events &= ~(1 << GLOBAL_INV_ACCESS);
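    // With only one event kind left, completions retire in order; two or more
    // distinct kinds (Events & (Events - 1) != 0) means they may not.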
    return Events & (Events - 1);
  return hasMixedPendingEvents(T);
char SIInsertWaitcntsLegacy::ID = 0;
  return new SIInsertWaitcntsLegacy();
  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
  if (NewEnc == MO.getImm())
  case AMDGPU::S_WAIT_LOADCNT:
  case AMDGPU::S_WAIT_EXPCNT:
  case AMDGPU::S_WAIT_STORECNT:
  case AMDGPU::S_WAIT_SAMPLECNT:
  case AMDGPU::S_WAIT_BVHCNT:
  case AMDGPU::S_WAIT_DSCNT:
  case AMDGPU::S_WAIT_KMCNT:
  case AMDGPU::S_WAIT_XCNT:
bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
  assert(isNormalMode(MaxCounter));
  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;
    dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
      dbgs() << "end of block\n";
    if (II.isMetaInstruction()) {
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
    if (Opcode == AMDGPU::S_WAITCNT) {
      unsigned IEnc = II.getOperand(0).getImm();
        ScoreBrackets.simplifyWaitcnt(OldWait);
      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
                 << "Before: " << Wait << '\n';);
      ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
      II.eraseFromParent();
      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
      Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        WaitcntVsCntInstr = &II;
    Modified |= promoteSoftWaitCnt(WaitcntInstr);
    ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
    ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
    ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
    LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
                                   << "New Instr at block end: "
                                   << *WaitcntInstr << '\n'
                          : dbgs() << "applied pre-existing waitcnt\n"
                                   << "Old Instr: " << *It
                                   << "New Instr: " << *WaitcntInstr << '\n');
  if (WaitcntVsCntInstr) {
                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
    Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
    ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
    Wait.StoreCnt = ~0u;
               ? dbgs() << "applied pre-existing waitcnt\n"
                        << "New Instr at block end: " << *WaitcntVsCntInstr
               : dbgs() << "applied pre-existing waitcnt\n"
                        << "Old Instr: " << *It
                        << "New Instr: " << *WaitcntVsCntInstr << '\n');
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
    AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
  assert(isNormalMode(MaxCounter));
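  // With profiling expansion enabled, a single wait is replaced by a chain of
  // waits that counts down from the number of outstanding accesses to the
  // target count, one S_WAITCNT per level.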
  auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
      EmitWaitcnt(--Outstanding);
    } while (Outstanding > Target);
  if (Wait.hasWaitExceptStoreCnt()) {
    if (ExpandWaitcntProfiling) {
      bool AnyOutOfOrder = false;
      for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
        unsigned &WaitCnt = getCounterRef(Wait, CT);
        if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
          AnyOutOfOrder = true;
      if (AnyOutOfOrder) {
        for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
          unsigned &WaitCnt = getCounterRef(Wait, CT);
          unsigned Outstanding = std::min(ScoreBrackets.getScoreUB(CT) -
                                              ScoreBrackets.getScoreLB(CT),
                                          getWaitCountMax(getLimits(), CT) - 1);
          EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
            getCounterRef(W, CT) = Count;
    [[maybe_unused]] auto SWaitInst =
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  if (Wait.hasWaitStoreCnt()) {
    if (ExpandWaitcntProfiling && Wait.StoreCnt != ~0u &&
        !ScoreBrackets.counterOutOfOrder(STORE_CNT)) {
      unsigned Outstanding =
          std::min(ScoreBrackets.getScoreUB(STORE_CNT) -
                       ScoreBrackets.getScoreLB(STORE_CNT),
                   getWaitCountMax(getLimits(), STORE_CNT) - 1);
      EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      [[maybe_unused]] auto SWaitInst =
              .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
                         ~0u, ExpertVal, ExpertVal);
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
  assert(!isNormalMode(MaxCounter));
  MachineInstr *CombinedLoadDsCntInstr = nullptr;
  MachineInstr *CombinedStoreDsCntInstr = nullptr;
  MachineInstr *WaitcntDepctrInstr = nullptr;
  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
    dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
      dbgs() << "end of block\n";
  AMDGPU::Waitcnt RequiredWait;
    if (II.isMetaInstruction()) {
    MachineInstr **UpdatableInstr;
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
    if (Opcode == AMDGPU::S_WAITCNT)
    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
        RequiredWait = RequiredWait.combined(OldWait);
      UpdatableInstr = &CombinedLoadDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
        RequiredWait = RequiredWait.combined(OldWait);
      UpdatableInstr = &CombinedStoreDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait;
        ScoreBrackets.simplifyWaitcnt(OldWait);
      UpdatableInstr = &WaitcntDepctrInstr;
    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
      II.eraseFromParent();
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
        addWait(Wait, CT.value(), OldCnt);
        addWait(RequiredWait, CT.value(), OldCnt);
      UpdatableInstr = &WaitInstrs[CT.value()];
    if (!*UpdatableInstr) {
      *UpdatableInstr = &II;
    } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
      unsigned Enc = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      II.eraseFromParent();
      II.eraseFromParent();
  ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
  Wait = Wait.combined(RequiredWait);
  if (CombinedLoadDsCntInstr) {
    if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
      ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
      LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
                                     << "New Instr at block end: "
                                     << *CombinedLoadDsCntInstr << '\n'
                            : dbgs() << "applied pre-existing waitcnt\n"
                                     << "Old Instr: " << *It << "New Instr: "
                                     << *CombinedLoadDsCntInstr << '\n');
  if (CombinedStoreDsCntInstr) {
    if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
      ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
      Wait.StoreCnt = ~0u;
      LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
                                     << "New Instr at block end: "
                                     << *CombinedStoreDsCntInstr << '\n'
                            : dbgs() << "applied pre-existing waitcnt\n"
                                     << "Old Instr: " << *It << "New Instr: "
                                     << *CombinedStoreDsCntInstr << '\n');
  if (Wait.DsCnt != ~0u) {
    if (Wait.LoadCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
    } else if (Wait.StoreCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
    for (MachineInstr **WI : WaitsToErase) {
      (*WI)->eraseFromParent();
  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    if (!WaitInstrs[CT])
    unsigned NewCnt = getWait(Wait, CT);
    if (NewCnt != ~0u) {
                                           AMDGPU::OpName::simm16, NewCnt);
      Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
      ScoreBrackets.applyWaitcnt(CT, NewCnt);
      setNoWait(Wait, CT);
                 ? dbgs() << "applied pre-existing waitcnt\n"
                          << "New Instr at block end: " << *WaitInstrs[CT]
                 : dbgs() << "applied pre-existing waitcnt\n"
                          << "Old Instr: " << *It
                          << "New Instr: " << *WaitInstrs[CT] << '\n');
  if (WaitcntDepctrInstr) {
        TII->getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
      ScoreBrackets.applyWaitcnt(VA_VDST, Wait.VaVdst);
      ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.VmVsrc);
                                           AMDGPU::OpName::simm16, Enc);
                          << "New Instr at block end: "
                          << *WaitcntDepctrInstr << '\n'
                 : dbgs() << "applyPreexistingWaitcnt\n"
                          << "Old Instr: " << *It << "New Instr: "
                          << *WaitcntDepctrInstr << '\n');
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
    AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
  assert(!isNormalMode(MaxCounter));
  auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
    for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I)
    EmitWaitcnt(Target);
  if (ExpandWaitcntProfiling) {
    for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
      if (ScoreBrackets.counterOutOfOrder(CT)) {
      unsigned Outstanding =
          std::min(ScoreBrackets.getScoreUB(CT) - ScoreBrackets.getScoreLB(CT),
                   getWaitCountMax(getLimits(), CT) - 1);
      EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
  if (Wait.DsCnt != ~0u) {
    MachineInstr *SWaitInst = nullptr;
    if (Wait.LoadCnt != ~0u) {
    } else if (Wait.StoreCnt != ~0u) {
      Wait.StoreCnt = ~0u;
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    [[maybe_unused]] auto SWaitInst =
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  if (Wait.hasWaitDepctr()) {
    [[maybe_unused]] auto SWaitInst =
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
bool SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
    MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
  setForceEmitWaitcnt();
  AMDGPU::Waitcnt Wait;
  const unsigned Opc = MI.getOpcode();
  if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
      Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
      Opc == AMDGPU::BUFFER_GL1_INV) {
  if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
      Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
      Opc == AMDGPU::S_SETPC_B64_return) {
    AMDGPU::Waitcnt AllZeroWait =
        WCG->getAllZeroWaitcnt(false);
    if (ST->hasExtendedWaitCounts() &&
        !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
  else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
    EndPgmInsts[&MI] = ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
                       !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
  else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
           ST->hasLegacyGeometry() &&
  if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
    if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
        ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
        ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
        ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
    if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
      addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
      Wait = AMDGPU::Waitcnt();
      const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI);
      if (CallAddrOp.isReg()) {
        ScoreBrackets.determineWaitForPhysReg(
        if (const auto *RtnAddrOp =
                TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
          ScoreBrackets.determineWaitForPhysReg(
              SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
    } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
      ScoreBrackets.tryClearSCCWriteEvent(&MI);
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        if (Memop->isStore()) {
          if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
            addWait(Wait, SmemAccessCounter, 0);
            SLoadAddresses.erase(It);
        unsigned AS = Memop->getAddrSpace();
        if (TII->mayWriteLDSThroughDMA(MI))
        unsigned TID = LDSDMA_BEGIN;
        if (Ptr && Memop->getAAInfo()) {
          const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
          for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
            if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
              if ((I + 1) >= NUM_LDSDMA) {
                ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
              ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
          ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
        if (Memop->isStore()) {
          ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
      for (const MachineOperand &Op : MI.operands()) {
        if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
        const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
        if (Op.isImplicit() && MI.mayLoadOrStore())
        ScoreBrackets.determineWaitForPhysReg(VA_VDST, Reg, Wait);
        ScoreBrackets.determineWaitForPhysReg(VM_VSRC, Reg, Wait);
          if (Op.isUse() || !updateVMCntOnly(MI) ||
              ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
              ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
              !ST->hasVmemWriteVgprInOrder()) {
            ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
            ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
            ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
            ScoreBrackets.clearVgprVmemTypes(Reg);
          if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
            ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
          ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
        } else if (Op.getReg() == AMDGPU::SCC) {
          ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
          ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
        if (ST->hasWaitXcnt() && Op.isDef())
          ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
  if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
      !ST->hasBackOffBarrier()) {
    Wait = Wait.combined(WCG->getAllZeroWaitcnt(true));
      ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
  ScoreBrackets.simplifyWaitcnt(Wait);
  if (TII->isVALU(MI))
  if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
    ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
    Wait = WCG->getAllZeroWaitcnt(false);
  for (InstCounterType T : inst_counter_types()) {
    if (!ForceEmitWaitcnt[T])
    getCounterRef(Wait, T) = 0;
  if (FlushFlags.FlushVmCnt) {
    for (InstCounterType T : {LOAD_CNT, SAMPLE_CNT, BVH_CNT})
      getCounterRef(Wait, T) = 0;
  if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  if (OldWaitcntInstr)
        WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
    MachineOperand *WaitExp =
        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
      ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
               << "Update Instr: " << *It);
  if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
    ScoreBrackets.applyWaitcnt(Wait);
std::optional<WaitEventType>
SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
  if (TII->isVALU(Inst)) {
    if (TII->isXDL(Inst))
      return VGPR_XDL_WRITE;
    if (TII->isTRANS(Inst))
      return VGPR_TRANS_WRITE;
      return VGPR_DPMACC_WRITE;
    return VGPR_CSMACC_WRITE;
  if (TII->isFLAT(Inst))
    return VGPR_FLAT_READ;
  if (TII->isDS(Inst))
    return VGPR_LDS_READ;
  if (TII->isVMEM(Inst) || TII->isVIMAGE(Inst) || TII->isVSAMPLE(Inst))
    return VGPR_VMEM_READ;
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
  return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
                                   MachineBasicBlock *Block) const {
  auto BlockEnd = Block->getParent()->end();
  auto BlockIter = Block->getIterator();
    if (++BlockIter != BlockEnd) {
      It = BlockIter->instr_begin();
    if (!It->isMetaInstruction())
  return It->getOpcode() == AMDGPU::S_ENDPGM;
bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
                                             MachineBasicBlock &Block,
                                             WaitcntBrackets &ScoreBrackets) {
  AMDGPU::Waitcnt Wait;
  bool NeedsEndPGMCheck = false;
    NeedsEndPGMCheck = true;
  ScoreBrackets.simplifyWaitcnt(Wait);
  bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
  if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  bool IsVMEMAccess = false;
  bool IsSMEMAccess = false;
    if (const auto ET = getExpertSchedulingEventType(Inst))
      ScoreBrackets->updateByEvent(*ET, Inst);
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
      ScoreBrackets->setPendingGDS();
      ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
  } else if (TII->isFLAT(Inst)) {
      ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
    int FlatASCount = 0;
    if (TII->mayAccessVMEMThroughFlat(Inst)) {
      IsVMEMAccess = true;
      ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
    if (TII->mayAccessLDSThroughFlat(Inst)) {
      ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
      ScoreBrackets->setPendingFlat();
    IsVMEMAccess = true;
    ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
    if (ST->vmemWriteNeedsExpWaitcnt() &&
      ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
  } else if (TII->isSMRD(Inst)) {
    IsSMEMAccess = true;
    ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
      ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(false));
      ScoreBrackets->setStateOnFunctionEntryOrReturn();
    ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
  } else if (TII->isVINTERP(Inst)) {
    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
      ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
    ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
  if (!ST->hasWaitXcnt())
    ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);
    ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
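// mergeScore rebases a score from each predecessor into the merged [LB, UB)
// window (scores at or below the old lower bound collapse to 0) and keeps the
// maximum; it returns true when the incoming score strictly dominates, which
// the caller uses to decide whether another dataflow iteration is needed.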
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) {
  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  unsigned OtherShifted =
      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted;
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;
  for (auto K : Other.VMem.keys())
    VMem.try_emplace(K);
  for (auto K : Other.SGPRs.keys())
    SGPRs.try_emplace(K);
  for (auto T : inst_counter_types(Context->MaxCounter)) {
    const WaitEventSet &EventsForT = Context->getWaitEventMask()[T];
    const WaitEventSet OldEvents = PendingEvents & EventsForT;
    const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
    if (!OldEvents.contains(OtherEvents))
    PendingEvents |= OtherEvents;
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];
    ScoreUBs[T] = NewUB;
    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
      StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
      StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
      if (Other.hasPendingEvent(SCC_WRITE)) {
        if (!OldEvents.contains(SCC_WRITE)) {
          PendingSCCWrite = Other.PendingSCCWrite;
        } else if (PendingSCCWrite != Other.PendingSCCWrite) {
          PendingSCCWrite = nullptr;
    for (auto &[RegID, Info] : VMem)
      StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
    if (isSmemCounter(T)) {
      unsigned Idx = getSgprScoresIdx(T);
      for (auto &[RegID, Info] : SGPRs) {
        auto It = Other.SGPRs.find(RegID);
        unsigned OtherScore =
            (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
        StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore);
  for (auto &[TID, Info] : VMem) {
    if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
      unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
      StrictDom |= NewVmemTypes != Info.VMEMTypes;
      Info.VMEMTypes = NewVmemTypes;
  purgeEmptyTrackingData();
  return Opcode == AMDGPU::S_WAITCNT ||
         Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
         Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
         Opcode == AMDGPU::S_WAITCNT_lds_direct ||
void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
                                         bool ExpertMode) const {
      .addImm(ExpertMode ? 2 : 0)
SIInsertWaitcnts::getAtomicRMWState(MachineInstr &MI,
                                    AtomicRMWState PrevState) const {
  if (isAtomicRMW(MI)) {
    if (PrevState == AtomicRMWState::NotInBlock)
      return AtomicRMWState::NewBlock;
    if (PrevState == AtomicRMWState::NewBlock)
      return AtomicRMWState::InsideBlock;
  if (TII->isDS(MI) || (TII->isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI)))
  if (MI.mayLoad() ^ MI.mayStore())
    return AtomicRMWState::NotInBlock;
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
    dbgs() << "*** Begin Block: ";
    ScoreBrackets.dump();
  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    VCCZCorrect = false;
  MachineInstr *OldWaitcntInstr = nullptr;
  AtomicRMWState RMWState = AtomicRMWState::NotInBlock;
    MachineInstr &Inst = *Iter;
    RMWState = getAtomicRMWState(Inst, RMWState);
        (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
      bool IsSoftXcnt = isSoftXcnt(Inst);
      if (Iter != E && IsSoftXcnt) {
        RMWState = getAtomicRMWState(*Iter, RMWState);
      if (IsSoftXcnt && RMWState == AtomicRMWState::InsideBlock) {
      } else if (!OldWaitcntInstr) {
        OldWaitcntInstr = &Inst;
    PreheaderFlushFlags FlushFlags;
    if (Block.getFirstTerminator() == Inst)
      FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
    OldWaitcntInstr = nullptr;
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          VCCZCorrect = false;
    if (TII->isSMRD(Inst)) {
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
      if (ST->hasReadVCCZBug()) {
        VCCZCorrect = false;
    updateEventWaitcntAfter(Inst, &ScoreBrackets);
    Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
      ScoreBrackets.dump();
          TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
  AMDGPU::Waitcnt Wait;
  if (Block.getFirstTerminator() == Block.end()) {
    PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
    if (FlushFlags.FlushVmCnt) {
      if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
      if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
      if (ScoreBrackets.hasPendingEvent(BVH_CNT))
    if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
    dbgs() << "*** End Block: ";
    ScoreBrackets.dump();
SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
                                     const WaitcntBrackets &ScoreBrackets) {
  auto [Iterator, IsInserted] =
    return Iterator->second;
    return PreheaderFlushFlags();
    return PreheaderFlushFlags();
  Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
  return Iterator->second;
  return PreheaderFlushFlags();
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
    return TII->mayAccessVMEMThroughFlat(MI);
bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3211SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *
ML,
3212 const WaitcntBrackets &Brackets) {
3213 PreheaderFlushFlags
Flags;
3214 bool HasVMemLoad =
false;
3215 bool HasVMemStore =
false;
3216 bool SeenDSStoreInLoop =
false;
3217 bool UsesVgprLoadedOutsideVMEM =
false;
3218 bool UsesVgprLoadedOutsideDS =
false;
3219 bool VMemInvalidated =
false;
3221 bool DSInvalidated = !
ST->hasExtendedWaitCounts();
3222 DenseSet<MCRegUnit> VgprUse;
3223 DenseSet<MCRegUnit> VgprDefVMEM;
3224 DenseSet<MCRegUnit> VgprDefDS;
3226 for (MachineBasicBlock *
MBB :
ML->blocks()) {
3227 bool SeenDSStoreInCurrMBB =
false;
3228 for (MachineInstr &
MI : *
MBB) {
3229 if (isVMEMOrFlatVMEM(MI)) {
3230 HasVMemLoad |= MI.mayLoad();
3231 HasVMemStore |= MI.mayStore();
3233 if (mayStoreIncrementingDSCNT(MI))
3234 SeenDSStoreInCurrMBB = true;
3239 if (MI.getOpcode() == AMDGPU::S_BARRIER)
3240 SeenDSStoreInCurrMBB = false;
3241 for (const MachineOperand &Op : MI.all_uses()) {
3242 if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
3245 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
3249 VMemInvalidated = true;
3253 DSInvalidated = true;
3256 if (VMemInvalidated && DSInvalidated)
3262 VMEMID ID = toVMEMID(RU);
3263 bool HasPendingVMEM =
3264 Brackets.getVMemScore(ID, LOAD_CNT) >
3265 Brackets.getScoreLB(LOAD_CNT) ||
3266 Brackets.getVMemScore(ID, SAMPLE_CNT) >
3267 Brackets.getScoreLB(SAMPLE_CNT) ||
3268 Brackets.getVMemScore(ID, BVH_CNT) > Brackets.getScoreLB(BVH_CNT);
3270 UsesVgprLoadedOutsideVMEM = true;
3274 if (!HasPendingVMEM &&
3275 Brackets.getVMemScore(ID, DS_CNT) > Brackets.getScoreLB(DS_CNT))
3276 UsesVgprLoadedOutsideDS = true;
3281 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
3282 for (const MachineOperand &Op : MI.all_defs()) {
3283 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
3287 VMemInvalidated = true;
3292 if (VMemInvalidated && DSInvalidated)
3304 for (const MachineOperand &Op : MI.all_defs()) {
3305 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
3312 SeenDSStoreInLoop |= SeenDSStoreInCurrMBB;
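// Flush the load counter in the preheader only if some VGPR used in the loop
// still has a VMEM access pending at loop entry, no in-loop VMEM definition
// has invalidated that, and either the subtarget has no separate store counter
// (stores but no loads in the loop) or VMEM writes to VGPRs complete in order.
// The DS counter is flushed under the analogous conditions, additionally
// requiring that no DS store was seen in the loop.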
3316 if (!VMemInvalidated && UsesVgprLoadedOutsideVMEM &&
3317 ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3318 (HasVMemLoad && ST->hasVmemWriteVgprInOrder())))
3319 Flags.FlushVmCnt = true;
3326 if (!DSInvalidated && !SeenDSStoreInLoop && UsesVgprLoadedOutsideDS)
3327 Flags.FlushDsCnt = true;
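// Legacy pass-manager wrapper: gather MachineLoopInfo, the post-dominator tree
// and (when available) alias analysis, then run the SIInsertWaitcnts
// implementation. The new pass-manager entry point below does the same and
// reports which analyses are preserved.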
3332bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3333 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3335 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3337 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3338 AA = &AAR->getAAResults();
3340 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
3352 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
3357 .preserve<AAManager>();
3362 TII = ST->getInstrInfo();
3363 TRI = &TII->getRegisterInfo();
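// Pick the waitcnt generator for the subtarget: targets with extended wait
// counters use the separated per-counter encoding (optionally with expert
// scheduling mode), while earlier targets use the combined legacy
// vmcnt/expcnt/lgkmcnt encoding.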
3372 if (ST->hasExtendedWaitCounts()) {
3373 IsExpertMode = ST->hasExpertSchedulingMode() &&
3379 MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
3381 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, &Limits,
3384 MaxCounter = NUM_NORMAL_INST_CNTS;
3386 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(MF, NUM_NORMAL_INST_CNTS,
3390 for (auto T : inst_counter_types())
3391 ForceEmitWaitcnt[T] = false;
3393 SmemAccessCounter = eventCounter(WCG->getWaitEventMask(), SMEM_ACCESS);
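// At the top of the entry block, skip past any meta instructions and, on
// targets with extended wait counters, emit an S_WAIT_*CNT for each individual
// counter (LOAD/DS/STORE/XCNT are excluded from this per-counter loop, as are
// the image-related counters on subtargets without image instructions). For
// non-kernel functions the entry block's incoming bracket state is then seeded
// via setStateOnFunctionEntryOrReturn(), since operations issued by the caller
// cannot be tracked here.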
3398 MachineBasicBlock &EntryBB = MF.front();
3408 while (I != EntryBB.end() && I->isMetaInstruction())
3411 if (ST->hasExtendedWaitCounts()) {
3414 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
3415 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
3418 if (!ST->hasImageInsts() &&
3419 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
3423 TII->get(instrsForExtendedCounterTypes[CT]))
3436 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
3437 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3438 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
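// Walk the blocks in reverse post-order, inserting and updating waits until
// the per-block incoming bracket states reach a fixed point; a block is marked
// dirty and revisited whenever a predecessor changes the state flowing into it.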
3445 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3448 std::unique_ptr<WaitcntBrackets> Brackets;
3453 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3455 MachineBasicBlock *MBB = BII->first;
3456 BlockInfo &BI = BII->second;
3462 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3464 *Brackets = *BI.Incoming;
3467 Brackets = std::make_unique<WaitcntBrackets>(this);
3472 Brackets->~WaitcntBrackets();
3473 new (Brackets.get()) WaitcntBrackets(this);
3477 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
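// If events are still pending at the end of the block, propagate the bracket
// state into the successors. A successor earlier in the iteration order
// (SuccBII <= BII) is marked dirty so another sweep is made; when only one
// successor needs the state it is moved rather than copied, otherwise it is
// merged into the successor's existing incoming state.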
3480 if (Brackets->hasPendingEvent()) {
3481 BlockInfo *MoveBracketsToSucc = nullptr;
3483 auto *SuccBII = BlockInfos.find(Succ);
3484 BlockInfo &SuccBI = SuccBII->second;
3485 if (!SuccBI.Incoming) {
3486 SuccBI.Dirty = true;
3487 if (SuccBII <= BII) {
3491 if (!MoveBracketsToSucc) {
3492 MoveBracketsToSucc = &SuccBI;
3494 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3496 } else if (SuccBI.Incoming->merge(*Brackets)) {
3497 SuccBI.Dirty = true;
3498 if (SuccBII <= BII) {
3504 if (MoveBracketsToSucc)
3505 MoveBracketsToSucc->Incoming = std::move(Brackets);
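// On subtargets with scalar stores, make sure the scalar cache is written back
// before the program ends: record which blocks end in S_ENDPGM or
// SI_RETURN_TO_EPILOG, and track whether a scalar store is not already
// followed by an S_DCACHE_WB so a writeback can be inserted where needed.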
3510 if (ST->hasScalarStores()) {
3511 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3512 bool HaveScalarStores = false;
3514 for (MachineBasicBlock &MBB : MF) {
3515 for (MachineInstr &MI : MBB) {
3516 if (!HaveScalarStores && TII->isScalarStore(MI))
3517 HaveScalarStores = true;
3519 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
3520 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3525 if (HaveScalarStores) {
3534 for (MachineBasicBlock *MBB : EndPgmBlocks) {
3535 bool SeenDCacheWB = false;
3539 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
3540 SeenDCacheWB = true;
3541 else if (TII->isScalarStore(*I))
3542 SeenDCacheWB = false;
3545 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
3546 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
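// When expert scheduling mode is in use, turn it on at the start of the entry
// block, switch it off immediately before each call and back on after it, and
// switch it off before each return so that callers are not affected.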
3562 while (I != EntryBB.end() && I->isMetaInstruction())
3564 setSchedulingMode(EntryBB, I, true);
3566 for (MachineInstr *MI : CallInsts) {
3567 MachineBasicBlock &MBB = *MI->getParent();
3568 setSchedulingMode(MBB, MI, false);
3569 setSchedulingMode(MBB, std::next(MI->getIterator()), true);
3572 for (MachineInstr *MI : ReturnInsts)
3573 setSchedulingMode(*MI->getParent(), MI, false);
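// Release VGPRs as the wave finishes: in dynamic-VGPR mode an S_ALLOC_VGPR is
// emitted at each program end; otherwise (GFX11+, when calls are present or
// the register count limits occupancy) a sendmsg deallocating the VGPRs is
// emitted, preceded by an S_NOP on subtargets that require one.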
3584 for (auto [MI, _] : EndPgmInsts) {
3586 TII->get(AMDGPU::S_ALLOC_VGPR))
3590 } else if (!WCG->isOptNone() &&
3591 ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
3592 (MF.getFrameInfo().hasCalls() ||
3593 ST->getOccupancyWithNumVGPRs(
3594 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
3597 for (auto [MI, Flag] : EndPgmInsts) {
3599 if (ST->requiresNopBeforeDeallocVGPRs()) {
3601 TII->get(AMDGPU::S_NOP))
3605 TII->get(AMDGPU::S_SENDMSG))
3613 ReturnInsts.clear();
3614 EndPgmInsts.clear();
3615 PreheadersToFlush.clear();
3616 SLoadAddresses.clear();