#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<bool> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(false), cl::Hidden);

static cl::opt<bool> ForceEmitZeroLoadFlag(
    "amdgpu-waitcnt-load-forcezero",
    cl::desc("Force all waitcnt load counters to wait until 0"),
    cl::init(false), cl::Hidden);

// The counters tracked by this pass. The first four exist on all targets;
// the extended set is only meaningful on targets with extended wait counts.
enum InstCounterType {
  LOAD_CNT = 0,
  DS_CNT,
  EXP_CNT,
  STORE_CNT,
  NUM_NORMAL_INST_CNTS,
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS,
  BVH_CNT,
  KM_CNT,
  X_CNT,
  NUM_EXTENDED_INST_CNTS,
  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};

auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
  return enum_seq(LOAD_CNT, MaxCounter);
}

using VMEMID = unsigned;

// Tracking-ID space used by the VMEM scoreboard: register units first,
// followed by the LDS-DMA slots.
enum : unsigned {
  TRACKINGID_RANGE_LEN = (1 << 16),
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};

static constexpr VMEMID toVMEMID(MCRegUnit RU) {
  return static_cast<unsigned>(RU);
}

struct HardwareLimits {
  unsigned LoadcntMax;
  unsigned DscntMax;
  unsigned ExpcntMax;
  unsigned StorecntMax;
  unsigned SamplecntMax;
  unsigned BvhcntMax;
  unsigned KmcntMax;
  unsigned XcntMax;
};

// The wait-event list is expanded twice below: once into the WaitEventType
// enum (AMDGPU_EVENT_ENUM) and once into the name table used for debug
// printing (AMDGPU_EVENT_NAME).
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
  DECL(VMEM_SAMPLER_READ_ACCESS)                                               \
  DECL(VMEM_BVH_READ_ACCESS)                                                   \
  DECL(GLOBAL_INV_ACCESS)                                                      \
  DECL(VMEM_WRITE_ACCESS)                                                      \
  DECL(SCRATCH_WRITE_ACCESS)                                                   \
  /* ... */                                                                    \
  DECL(EXP_POS_ACCESS)                                                         \
  DECL(EXP_PARAM_ACCESS)                                                       \
  /* ... */

#define AMDGPU_EVENT_ENUM(Name) Name,
#undef AMDGPU_EVENT_ENUM

#define AMDGPU_EVENT_NAME(Name) #Name,
#undef AMDGPU_EVENT_NAME

static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
    AMDGPU::S_WAIT_LOADCNT,  AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
    AMDGPU::S_WAIT_KMCNT,    AMDGPU::S_WAIT_XCNT};

static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;
}

static VmemType getVmemType(const MachineInstr &Inst) {
  assert(updateVMCntOnly(Inst));
  // ...
    return VMEM_NOSAMPLER;
  // ...
  return VMEM_NOSAMPLER;
}

static unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  // ...
    return Wait.StoreCnt;
  // ...
    return Wait.SampleCnt;
  // ...
}

void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  unsigned &WC = getCounterRef(Wait, T);
  WC = std::min(WC, Count);
}

void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  getCounterRef(Wait, T) = ~0u;
}

unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  return getCounterRef(Wait, T);
}

InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
  for (auto T : inst_counter_types()) {
    if (masks[T] & (1 << E))
      return T;
  }
  llvm_unreachable("event type has no associated counter");
}
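
// Note on the event/counter mapping used throughout this pass: every wait
// event is owned by exactly one counter, and that ownership is encoded in the
// per-counter bit masks returned by WaitcntGenerator::getWaitEventMask()
// below. eventCounter() simply scans the counters in enum order and returns
// the first one whose mask contains the event, so, for example, with the
// pre-GFX12 masks an SMEM_ACCESS resolves to the LGKM/DS counter.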
class WaitcntBrackets;

// This abstracts the logic for generating and updating S_WAIT* instructions
// away from the analysis that decides where they are needed.
class WaitcntGenerator {
protected:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  AMDGPU::IsaVersion IV;
  InstCounterType MaxCounter;
  bool OptNone;

public:
  WaitcntGenerator() = default;
  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
      : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
        IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
        OptNone(isOptNone(MF)) {}

  bool isOptNone() const { return OptNone; }

  // Edit an existing sequence of wait count instructions according to an
  // incoming Waitcnt value, which is itself updated to reflect any residual
  // waits not covered by the existing instructions.
  virtual bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const = 0;

  // Transform a soft waitcnt into a normal one.
  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;

  // Generate new wait count instructions according to the value of Wait,
  // returning true if any new instructions were created.
  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                MachineBasicBlock::instr_iterator It,
                                AMDGPU::Waitcnt Wait) = 0;

  // Returns an array of bit masks which can be used to map values in
  // WaitEventType to the corresponding counter in InstCounterType.
  virtual const unsigned *getWaitEventMask() const = 0;

  // Returns a new waitcnt with all counters except VScnt set to 0. If
  // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;

  virtual ~WaitcntGenerator() = default;

  // Create a mask value from the initializer list of wait event types.
  static constexpr unsigned
  eventMask(std::initializer_list<WaitEventType> Events) {
    unsigned Mask = 0;
    for (auto &E : Events)
      Mask |= 1 << E;
    return Mask;
  }
};
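
// For instance, eventMask({VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS}) folds the
// listed events into the single bit mask
// (1 << VMEM_ACCESS) | (1 << VMEM_SAMPLER_READ_ACCESS), evaluated at compile
// time; the per-target tables below are built entirely from such masks.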
class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
public:
  using WaitcntGenerator::WaitcntGenerator;

  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait) override;

  const unsigned *getWaitEventMask() const override {
    static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
        eventMask(
            {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
        eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
        0,
        0,
        0,
        0};

    return WaitEventMaskForInstPreGFX12;
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};

class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
public:
  using WaitcntGenerator::WaitcntGenerator;

  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait) override;

  const unsigned *getWaitEventMask() const override {
    static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
        eventMask({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
        eventMask({LDS_ACCESS, GDS_ACCESS}),
        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
        eventMask({VMEM_SAMPLER_READ_ACCESS}),
        eventMask({VMEM_BVH_READ_ACCESS}),
        eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
        eventMask({VMEM_GROUP, SMEM_GROUP})};

    return WaitEventMaskForInstGFX12Plus;
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
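
// The two generators reflect the two waitcnt encodings: before GFX12 a single
// S_WAITCNT immediate packs vmcnt/expcnt/lgkmcnt (plus a separate
// S_WAITCNT_VSCNT), whereas GFX12+ splits the state into the extended
// counters above and uses one S_WAIT_*CNT instruction per counter, with
// combined S_WAIT_LOADCNT_DSCNT / S_WAIT_STORECNT_DSCNT forms when both are
// needed.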
class SIInsertWaitcnts {
public:
  const GCNSubtarget *ST;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;

  InstCounterType SmemAccessCounter;
  InstCounterType MaxCounter;
  const unsigned *WaitEventMaskForInst;

private:
  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
  DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
  MachineLoopInfo *MLI;
  MachinePostDominatorTree *PDT;
  AliasAnalysis *AA = nullptr;

  struct BlockInfo {
    std::unique_ptr<WaitcntBrackets> Incoming;
    bool Dirty = true;
  };

  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;

  bool ForceEmitWaitcnt[NUM_INST_CNTS];

  // In any given run of this pass, WCG will point to one of these two
  // generator objects, which must have been re-initialised before use
  // from a value made using a subtarget constructor.
  WaitcntGeneratorPreGFX12 WCGPreGFX12;
  WaitcntGeneratorGFX12Plus WCGGFX12Plus;

  WaitcntGenerator *WCG = nullptr;

  // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
  // message.
  DenseSet<MachineInstr *> ReleaseVGPRInsts;

  HardwareLimits Limits;

public:
  SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
                   AliasAnalysis *AA)
      : MLI(MLI), PDT(PDT), AA(AA) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  unsigned getWaitCountMax(InstCounterType T) const {
    switch (T) {
    case LOAD_CNT:
      return Limits.LoadcntMax;
    case DS_CNT:
      return Limits.DscntMax;
    case EXP_CNT:
      return Limits.ExpcntMax;
    case STORE_CNT:
      return Limits.StorecntMax;
    case SAMPLE_CNT:
      return Limits.SamplecntMax;
    case BVH_CNT:
      return Limits.BvhcntMax;
    case KM_CNT:
      return Limits.KmcntMax;
    case X_CNT:
      return Limits.XcntMax;
    default:
      break;
    }
    return 0;
  }

  bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
  bool isPreheaderToFlush(MachineBasicBlock &MBB,
                          const WaitcntBrackets &ScoreBrackets);
  bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
  bool run(MachineFunction &MF);

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, query the debug counters and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[DS_CNT] = true;
      ForceEmitWaitcnt[KM_CNT] = true;
    } else {
      ForceEmitWaitcnt[DS_CNT] = false;
      ForceEmitWaitcnt[KM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[LOAD_CNT] = true;
      ForceEmitWaitcnt[SAMPLE_CNT] = true;
      ForceEmitWaitcnt[BVH_CNT] = true;
    } else {
      ForceEmitWaitcnt[LOAD_CNT] = false;
      ForceEmitWaitcnt[SAMPLE_CNT] = false;
      ForceEmitWaitcnt[BVH_CNT] = false;
    }
#endif // NDEBUG
  }

  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
  // instruction.
  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
    switch (Inst.getOpcode()) {
    case AMDGPU::GLOBAL_INV:
      return GLOBAL_INV_ACCESS;
    case AMDGPU::GLOBAL_WB:
    case AMDGPU::GLOBAL_WBINV:
      return VMEM_WRITE_ACCESS;
    default:
      break;
    }

    // Maps VMEM access types to their corresponding WaitEventType.
    static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
        VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};

    // Stores are classified as scratch or generic VMEM writes; everything
    // else is a read.
    // ...
      if (TII->mayAccessScratch(Inst))
        return SCRATCH_WRITE_ACCESS;
      return VMEM_WRITE_ACCESS;
    // ...
    return VmemReadMapping[getVmemType(Inst)];
  }
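
  // In short: cache-control pseudos are classified directly by opcode, stores
  // are split into scratch vs. other VMEM writes, and reads are classified by
  // getVmemType() so that sampler and BVH loads can be tracked separately on
  // targets where they have their own counters.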
  bool isVmemAccess(const MachineInstr &MI) const;
  bool generateWaitcntInstBefore(MachineInstr &MI,
                                 WaitcntBrackets &ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr,
                                 bool FlushVmCnt);
  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
                       MachineBasicBlock::instr_iterator It,
                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
                       MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               WaitcntBrackets *ScoreBrackets);
  bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
                    MachineBasicBlock *Block) const;
  bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
};

// This object maintains the current score bracket of each wait counter, and
// a per-register scoreboard for each wait counter.
class WaitcntBrackets {
public:
  WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
    assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
  }

#ifndef NDEBUG
  ~WaitcntBrackets() {
    unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
    for (auto &[ID, Val] : VMem) {
      if (Val.empty())
        ++NumUnusedVmem;
    }
    for (auto &[ID, Val] : SGPRs) {
      if (Val.empty())
        ++NumUnusedSGPRs;
    }
    if (NumUnusedVmem || NumUnusedSGPRs) {
      errs() << "WaitcntBracket had unused entries at destruction time: "
             << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
             << " SGPR unused entries\n";
    }
  }
#endif

  bool isSmemCounter(InstCounterType T) const {
    return T == Context->SmemAccessCounter || T == X_CNT;
  }

  unsigned getSgprScoresIdx(InstCounterType T) const {
    assert(isSmemCounter(T) && "Invalid SMEM counter");
    return T == X_CNT ? 1 : 0;
  }

  unsigned getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreLBs[T];
  }

  unsigned getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreUBs[T];
  }

  unsigned getScoreRange(InstCounterType T) const {
    return getScoreUB(T) - getScoreLB(T);
  }

  unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
    auto It = SGPRs.find(RU);
    return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;
  }

  unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
    auto It = VMem.find(TID);
    return It != VMem.end() ? It->second.Scores[T] : 0;
  }
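
  // The score "bracket" for a counter is the range (ScoreLB, ScoreUB]: every
  // outstanding event owned by that counter was assigned a score in that
  // range, in program order, and a score at or below the lower bound is
  // already known to have completed. The per-register-unit and per-tracking-ID
  // maps below remember the score of the last event that may still be writing
  // each unit, which is what determineWaitForScore() later compares against.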
  // Returns true if the counter may complete events out of order, in which
  // case a partial wait cannot be trusted.
  bool counterOutOfOrder(InstCounterType T) const;
  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
    simplifyWaitcnt(Wait, Wait);
  }
  void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
                       AMDGPU::Waitcnt &UpdateWait) const;
  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) const;
  bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) const;
  void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
                    AMDGPU::Waitcnt &UpdateWait) const;

  void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
                               AMDGPU::Waitcnt &Wait) const;
  void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
                              AMDGPU::Waitcnt &Wait) const;
  void tryClearSCCWriteEvent(MachineInstr *Inst);

  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(WaitEventType E, MachineInstr &MI);

  unsigned hasPendingEvent() const { return PendingEvents; }
  unsigned hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }
  unsigned hasPendingEvent(InstCounterType T) const {
    unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
    assert((HasPending != 0) == (getScoreRange(T) != 0));
    return HasPending;
  }

  bool hasMixedPendingEvents(InstCounterType T) const {
    unsigned Events = hasPendingEvent(T);
    // Return true if more than one bit is set in Events.
    return Events & (Events - 1);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
             LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
            (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
             LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
  }

  void setPendingFlat() {
    LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
    LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
  }

  bool hasPendingGDS() const {
    return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
  }

  unsigned getPendingGDSWait() const {
    return std::min(getScoreUB(DS_CNT) - LastGDS,
                    Context->getWaitCountMax(DS_CNT) - 1);
  }

  void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }

  // Return true if there might be pending writes to Reg by VMEM instructions
  // with types different from V.
  bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
    for (MCRegUnit RU : regunits(Reg)) {
      auto It = VMem.find(toVMEMID(RU));
      if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
        return true;
    }
    return false;
  }

  void clearVgprVmemTypes(MCPhysReg Reg) {
    for (MCRegUnit RU : regunits(Reg)) {
      if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
        It->second.VMEMTypes = 0;
        if (It->second.empty())
          VMem.erase(It);
      }
    }
  }

  void setStateOnFunctionEntryOrReturn() {
    setScoreUB(STORE_CNT,
               getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
    PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
  }

  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
    return LDSDMAStores;
  }

  bool hasPointSampleAccel(const MachineInstr &MI) const;
  bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
                                      MCPhysReg Reg) const;

  void print(raw_ostream &) const;
  void dump() const { print(dbgs()); }

  bool merge(const WaitcntBrackets &Other);

  void purgeEmptyTrackingData();

private:
  struct MergeInfo {
    unsigned OldLB;
    unsigned OtherLB;
    unsigned MyShift;
    unsigned OtherShift;
  };

  void determineWaitForScore(InstCounterType T, unsigned Score,
                             AMDGPU::Waitcnt &Wait) const;
  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);

  // Return the register units covered by Reg, mapping 16-bit registers to the
  // covering 32-bit register where the subtarget requires it.
  auto regunits(MCPhysReg Reg) const {
    assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
    if (!Context->TRI->isInAllocatableClass(Reg))
      return Context->TRI->regunits(Reg);
    const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
    unsigned Size = Context->TRI->getRegSizeInBits(*RC);
    if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
      Reg = Context->TRI->get32BitRegister(Reg);
    return Context->TRI->regunits(Reg);
  }

  void setScoreLB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreUBs[T] = Val;
    // ...
    if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
  }

  void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
    const SIRegisterInfo *TRI = Context->TRI;
    if (Reg == AMDGPU::SCC) {
      SCCScore = Val;
    } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
      for (MCRegUnit RU : regunits(Reg))
        VMem[toVMEMID(RU)].Scores[T] = Val;
    } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
      auto STy = getSgprScoresIdx(T);
      for (MCRegUnit RU : regunits(Reg))
        SGPRs[RU].Scores[STy] = Val;
    }
  }

  void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
    VMem[TID].Scores[T] = Val;
  }

  void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
                         unsigned Score);

  const SIInsertWaitcnts *Context;

  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  unsigned PendingEvents = 0;
  // Remember the last flat memory operation.
  unsigned LastFlat[NUM_INST_CNTS] = {0};
  // Remember the last GDS operation.
  unsigned LastGDS = 0;

  struct VMEMInfo {
    std::array<unsigned, NUM_INST_CNTS> Scores = {0};
    // Bitmask of the VmemTypes of VMEM instructions that might have a pending
    // write to this entry.
    unsigned VMEMTypes = 0;

    bool empty() const {
      return all_of(Scores, [](unsigned K) { return K == 0; }) && !VMEMTypes;
    }
  };

  struct SGPRInfo {
    // Two scores: one for the SMEM access counter, one for X_CNT.
    std::array<unsigned, 2> Scores = {0};

    bool empty() const { return !Scores[0] && !Scores[1]; }
  };

  DenseMap<VMEMID, VMEMInfo> VMem;
  DenseMap<MCRegUnit, SGPRInfo> SGPRs;

  // Score of the last SCC write.
  unsigned SCCScore = 0;
  // The instruction responsible for the last pending SCC write, if any.
  const MachineInstr *PendingSCCWrite = nullptr;

  // Store representative LDS DMA operations; one store is kept per unique
  // alias-scope info.
  SmallVector<const MachineInstr *> LDSDMAStores;
};

class SIInsertWaitcntsLegacy : public MachineFunctionPass {
public:
  static char ID;
  SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addUsedIfAvailable<AAResultsWrapperPass>();
    AU.addPreserved<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
                                        InstCounterType CntTy,
                                        unsigned Score) {
  setRegScore(Op.getReg().asMCReg(), CntTy, Score);
}

bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
  // ...
  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = /* ... */;
  // ...
}

bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
                                                     MCPhysReg Reg) const {
  if (!hasPointSampleAccel(MI))
    return false;

  return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
}
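
// updateByEvent() is the producer side of the scoreboard: when an instruction
// issues an event E, the owning counter's upper bound is bumped and the new
// score is recorded for every register unit (or LDS-DMA slot, or SCC) that
// the instruction may still be writing when the counter finally decrements.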
void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);

  unsigned UB = getScoreUB(T);
  unsigned CurrScore = UB + 1;
  // ...
  PendingEvents |= 1 << E;
  setScoreUB(T, CurrScore);

  const MachineRegisterInfo *MRI = Context->MRI;
  const SIRegisterInfo *TRI = Context->TRI;
  const SIInstrInfo *TII = Context->TII;
  if (T == EXP_CNT) {
    // Put the score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
      // All GDS operations must protect their address register (same as
      // export).
      if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
        setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);

      // ...
      if (const auto *Data0 =
              TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
        setScoreByOperand(*Data0, EXP_CNT, CurrScore);
      if (const auto *Data1 =
              TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
        setScoreByOperand(*Data1, EXP_CNT, CurrScore);
      // ...
    } else if (/* ... */ Inst.getOpcode() != AMDGPU::DS_CONSUME &&
               Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
      for (const MachineOperand &Op : Inst.all_uses()) {
        if (TRI->isVectorRegister(*MRI, Op.getReg()))
          setScoreByOperand(Op, EXP_CNT, CurrScore);
      }
    } else if (TII->isFLAT(Inst)) {
      // ...
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      // ...
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      // ...
    } else if (TII->isMIMG(Inst)) {
      // ...
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
      // ...
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      // ...
    } else if (TII->isMTBUF(Inst)) {
      // ...
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
    } else if (TII->isMUBUF(Inst)) {
      // ...
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
      // ...
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      // ...
    } else if (TII->isLDSDIR(Inst)) {
      // LDSDIR instructions attach the score to the destination.
      setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
                        EXP_CNT, CurrScore);
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that can be
        // used as sources after export patching, so treat them like sources
        // and set the EXP_CNT score.
        for (MachineOperand &DefMO : Inst.all_defs()) {
          if (TRI->isVGPR(*MRI, DefMO.getReg())) {
            setScoreByOperand(DefMO, EXP_CNT, CurrScore);
          }
        }
      }
      for (const MachineOperand &Op : Inst.all_uses()) {
        if (TRI->isVectorRegister(*MRI, Op.getReg()))
          setScoreByOperand(Op, EXP_CNT, CurrScore);
      }
    }
  } else if (T == X_CNT) {
    WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
    if (PendingEvents & (1 << OtherEvent)) {
      // Hardware inserts an implicit xcnt between interleaved SMEM and VMEM
      // operations, so only the merged group has to be tracked.
      setScoreLB(T, getScoreUB(T) - 1);
      PendingEvents &= ~(1 << OtherEvent);
    }
    for (const MachineOperand &Op : Inst.all_uses())
      setScoreByOperand(Op, T, CurrScore);
  } else {
    // ...
    for (const MachineOperand &Op : Inst.defs()) {
      if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
        if (!TRI->isVectorRegister(*MRI, Op.getReg()))
          continue;
        if (updateVMCntOnly(Inst)) {
          // ...
          VmemType V = getVmemType(Inst);
          unsigned char TypesMask = 1 << V;
          // If the instruction can have Point Sample Accel applied, flag this
          // as another potential dependency.
          if (hasPointSampleAccel(Inst))
            TypesMask |= 1 << VMEM_NOSAMPLER;
          for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
            VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
        }
      }
      setScoreByOperand(Op, T, CurrScore);
    }
    if (/* ... */
        (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
      // ...
      if (!MemOp->isStore() ||
          /* ... */)
        // ...
      auto AAI = MemOp->getAAInfo();
      // ...
      if (!AAI || !AAI.Scope)
        // ...
      for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
        for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
          if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
            // ...
          }
        }
      }
      // ...
      setVMemScore(LDSDMA_BEGIN, T, CurrScore);
      if (Slot && Slot < NUM_LDSDMA)
        setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
    }
    // ...
      setRegScore(AMDGPU::SCC, T, CurrScore);
      PendingSCCWrite = &Inst;
  }
}
void WaitcntBrackets::print(raw_ostream &OS) const {
  const GCNSubtarget *ST = Context->ST;

  OS << '\n';
  for (auto T : inst_counter_types(Context->MaxCounter)) {
    unsigned SR = getScoreRange(T);

    switch (T) {
    case LOAD_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
         << SR << "):";
      break;
    case DS_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
         << SR << "):";
      break;
    case EXP_CNT:
      OS << " EXP_CNT(" << SR << "):";
      break;
    case STORE_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
         << SR << "):";
      break;
    case SAMPLE_CNT:
      OS << " SAMPLE_CNT(" << SR << "):";
      break;
    case BVH_CNT:
      OS << " BVH_CNT(" << SR << "):";
      break;
    case KM_CNT:
      OS << " KM_CNT(" << SR << "):";
      break;
    case X_CNT:
      OS << " X_CNT(" << SR << "):";
      break;
    default:
      OS << " UNKNOWN(" << SR << "):";
      break;
    }

    if (SR != 0) {
      // Print vgpr scores.
      unsigned LB = getScoreLB(T);

      SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
      sort(SortedVMEMIDs);

      for (auto ID : SortedVMEMIDs) {
        unsigned RegScore = VMem.at(ID).Scores[T];
        if (RegScore <= LB)
          continue;
        unsigned RelScore = RegScore - LB - 1;
        if (ID < REGUNITS_END) {
          OS << ' ' << RelScore << ":vRU" << ID;
        } else {
          assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
                 "Unhandled/unexpected ID value!");
          OS << ' ' << RelScore << ":LDSDMA" << ID;
        }
      }

      // Also print sgpr scores if this counter tracks them.
      if (isSmemCounter(T)) {
        SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
        sort(SortedSMEMIDs);
        for (auto ID : SortedSMEMIDs) {
          unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)];
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
        }
      }

      if (T == KM_CNT && SCCScore > 0)
        OS << ' ' << SCCScore << ":scc";
    }
    OS << '\n';
  }

  OS << "Pending Events: ";
  if (hasPendingEvent()) {
    ListSeparator LS;
    for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
      if (hasPendingEvent((WaitEventType)I)) {
        OS << LS << WaitEventTypeName[I];
      }
    }
  }
  OS << '\n';
}
void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
                                      AMDGPU::Waitcnt &UpdateWait) const {
  simplifyWaitcnt(LOAD_CNT, UpdateWait.LoadCnt);
  simplifyWaitcnt(EXP_CNT, UpdateWait.ExpCnt);
  simplifyWaitcnt(DS_CNT, UpdateWait.DsCnt);
  simplifyWaitcnt(STORE_CNT, UpdateWait.StoreCnt);
  simplifyWaitcnt(SAMPLE_CNT, UpdateWait.SampleCnt);
  simplifyWaitcnt(BVH_CNT, UpdateWait.BvhCnt);
  simplifyWaitcnt(KM_CNT, UpdateWait.KmCnt);
  simplifyXcnt(CheckWait, UpdateWait);
}

void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const {
  // The number of outstanding events for this type, T, is (UB - LB). If the
  // requested Count is greater than or equal to the number of outstanding
  // events, then the wait for this counter is redundant.
  if (Count >= getScoreRange(T))
    Count = ~0u;
}

void WaitcntBrackets::purgeEmptyTrackingData() {
  // Drop VMem/SGPR entries whose scores and VMEM types have all become
  // irrelevant.
  // ...
}

void WaitcntBrackets::determineWaitForScore(InstCounterType T,
                                            unsigned ScoreToWait,
                                            AMDGPU::Waitcnt &Wait) const {
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);

  // If the score of the operand falls within the bracket, we need a wait.
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
        !Context->ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out-of-order when there are multiple
      // event types in the bracket, so emit a conservative wait of 0.
      addWait(Wait, T, 0);
    } else {
      // If a counter has been maxed out, avoid overflow by waiting for
      // MAX(CounterType) - 1 instead.
      unsigned NeededWait =
          std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
      addWait(Wait, T, NeededWait);
    }
  }
}
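
// Example: if the load counter bracket is (LB=2, UB=7] and the register being
// read was last written with score 5, the register becomes safe once the
// counter has drained down to UB - 5 = 2 outstanding loads, so the pass asks
// for loadcnt(2) rather than a full loadcnt(0), clamped to the hardware
// maximum minus one when the bracket is wider than the counter itself.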
void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
                                              AMDGPU::Waitcnt &Wait) const {
  if (Reg == AMDGPU::SCC) {
    determineWaitForScore(T, SCCScore, Wait);
    return;
  }
  const bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg);
  for (MCRegUnit RU : regunits(Reg))
    determineWaitForScore(
        T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T), Wait);
}

void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
                                             AMDGPU::Waitcnt &Wait) const {
  assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
  determineWaitForScore(T, getVMemScore(TID, T), Wait);
}

void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
  // ...
  if (PendingSCCWrite &&
      PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
      /* ... */) {
    unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
    // If this SCC_WRITE is the only pending KM_CNT event, clear the counter.
    if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
        SCC_WRITE_PendingEvent) {
      setScoreLB(KM_CNT, getScoreUB(KM_CNT));
    }
    PendingEvents &= ~SCC_WRITE_PendingEvent;
    PendingSCCWrite = nullptr;
  }
}

void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(DS_CNT, Wait.DsCnt);
  applyWaitcnt(STORE_CNT, Wait.StoreCnt);
  applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  applyWaitcnt(BVH_CNT, Wait.BvhCnt);
  applyWaitcnt(KM_CNT, Wait.KmCnt);
  applyWaitcnt(X_CNT, Wait.XCnt);
}

void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
  if (Count >= UB)
    return;
  if (Count != 0) {
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
  } else {
    setScoreLB(T, UB);
    PendingEvents &= ~Context->WaitEventMaskForInst[T];
  }

  if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
    if (!hasMixedPendingEvents(X_CNT))
      applyWaitcnt(X_CNT, 0);
    else
      PendingEvents &= ~(1 << SMEM_GROUP);
  }
  if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
      !hasPendingEvent(STORE_CNT)) {
    if (!hasMixedPendingEvents(X_CNT))
      applyWaitcnt(X_CNT, Count);
    else if (Count == 0)
      PendingEvents &= ~(1 << VMEM_GROUP);
  }
}
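
// applyWaitcnt() is the consumer side: once a wait of N is known to be in
// place, the lower bound can be raised to UB - N (or all the way to UB for a
// wait of zero, which also clears the pending-event bits for that counter).
// Out-of-order counters only benefit from a full wait, hence the early return.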
bool WaitcntBrackets::hasRedundantXCntWithKmCnt(
    const AMDGPU::Waitcnt &Wait) const {
  // A zero KmCnt with only an SMEM group pending already covers the X_CNT
  // dependency.
  return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
}

bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(
    const AMDGPU::Waitcnt &Wait) const {
  // With a pending VMEM group and no pending stores, X_CNT tracks the same
  // instructions as LOAD_CNT and the X_CNT wait can be folded into it.
  return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
         !hasPendingEvent(STORE_CNT);
}

void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
                                   AMDGPU::Waitcnt &UpdateWait) const {
  // ...
  if (hasRedundantXCntWithKmCnt(CheckWait))
    UpdateWait.XCnt = ~0u;
  if (canOptimizeXCntWithLoadCnt(CheckWait) &&
      /* ... */)
    UpdateWait.XCnt = ~0u;
  simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
}

// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory reads can always complete out of order.
  if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
      (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
    return true;

  if (T == LOAD_CNT) {
    unsigned Events = hasPendingEvent(T);
    // GLOBAL_INV is ignored when checking whether LOAD_CNT has mixed event
    // types.
    Events &= ~(1 << GLOBAL_INV_ACCESS);
    // Return true if more than one bit remains set.
    return Events & (Events - 1);
  }

  return hasMixedPendingEvents(T);
}
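
// When a counter is flagged out-of-order, determineWaitForScore() above falls
// back to waiting for 0 on it, since an intermediate count no longer
// identifies which particular outstanding operation has retired.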
char SIInsertWaitcntsLegacy::ID = 0;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcntsLegacy();
}

static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
                                     unsigned NewEnc) {
  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
  assert(OpIdx >= 0);

  MachineOperand &MO = MI.getOperand(OpIdx);

  if (NewEnc == MO.getImm())
    return false;

  MO.setImm(NewEnc);
  return true;
}

/// Determine if \p Opcode is a gfx12+ single-counter S_WAIT_*CNT instruction,
/// and if so, which counter it is waiting on.
static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_WAIT_LOADCNT:
    return LOAD_CNT;
  case AMDGPU::S_WAIT_EXPCNT:
    return EXP_CNT;
  case AMDGPU::S_WAIT_STORECNT:
    return STORE_CNT;
  case AMDGPU::S_WAIT_SAMPLECNT:
    return SAMPLE_CNT;
  case AMDGPU::S_WAIT_BVHCNT:
    return BVH_CNT;
  case AMDGPU::S_WAIT_DSCNT:
    return DS_CNT;
  case AMDGPU::S_WAIT_KMCNT:
    return KM_CNT;
  case AMDGPU::S_WAIT_XCNT:
    return X_CNT;
  default:
    return {};
  }
}

bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
  unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
  if (Opcode == Waitcnt->getOpcode())
    return false;

  Waitcnt->setDesc(TII->get(Opcode));
  return true;
}

/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
/// precede \p It and follow \p OldWaitcntInstr, and apply any extra waits
/// from \p Wait that were added due to instructions between the two.
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;

  LLVM_DEBUG({
    dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
    if (It == OldWaitcntInstr.getParent()->instr_end())
      dbgs() << "end of block\n";
    else
      dbgs() << *It;
  });

  for (auto &II : make_range(OldWaitcntInstr.getIterator(), It)) {
    if (II.isMetaInstruction()) {
      continue;
    }

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Update the required wait count. If this is a soft waitcnt (= it was
    // added by an earlier pass), it may be entirely removed.
    if (Opcode == AMDGPU::S_WAITCNT) {
      unsigned IEnc = II.getOperand(0).getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);

      // Merge consecutive waitcnts of the same type by erasing multiples.
      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntInstr = &II;
    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
      LLVM_DEBUG(dbgs() << /* ... */ "Before: " << Wait << '\n';);
      ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
      // ...
      II.eraseFromParent();
      Modified = true;
    } else {
      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

      unsigned OldVSCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
      Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);

      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntVsCntInstr = &II;
    }
  }

  if (WaitcntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
                                         AMDGPU::encodeWaitcnt(IV, Wait));
    Modified |= promoteSoftWaitCnt(WaitcntInstr);

    ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
    ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
    ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
    Wait.LoadCnt = ~0u;
    Wait.ExpCnt = ~0u;
    Wait.DsCnt = ~0u;

    LLVM_DEBUG(It == WaitcntInstr->getParent()->instr_end()
                   ? dbgs() << "applied pre-existing waitcnt\n"
                            << "New Instr at block end: " << *WaitcntInstr
                            << '\n'
                   : dbgs() << "applied pre-existing waitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntInstr << '\n');
  }

  if (WaitcntVsCntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
    Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);

    ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
    Wait.StoreCnt = ~0u;

    LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->instr_end()
                   ? dbgs() << "applied pre-existing waitcnt\n"
                            << "New Instr at block end: " << *WaitcntVsCntInstr
                            << '\n'
                   : dbgs() << "applied pre-existing waitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
  }

  return Modified;
}

/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any required
/// counters in \p Wait.
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) {
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a single
  // instruction, while VScnt has its own instruction.
  if (Wait.hasWaitExceptStoreCnt()) {
    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
            .addImm(AMDGPU::encodeWaitcnt(IV, Wait));
    Modified = true;

    LLVM_DEBUG(if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  if (Wait.hasWaitStoreCnt()) {
    assert(ST->hasVscnt());

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
            .addImm(Wait.StoreCnt);
    Modified = true;

    LLVM_DEBUG(if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}

AMDGPU::Waitcnt
WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
}

AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
                         /* XCnt */ ~0u);
}
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and follow
/// \p OldWaitcntInstr, and apply any extra waits from \p Wait that were added
/// due to instructions between the two.
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  MachineInstr *CombinedLoadDsCntInstr = nullptr;
  MachineInstr *CombinedStoreDsCntInstr = nullptr;
  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};

  LLVM_DEBUG({
    dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
    if (It == OldWaitcntInstr.getParent()->instr_end())
      dbgs() << "end of block\n";
    else
      dbgs() << *It;
  });

  // Waits that are still required after accounting for pre-existing wait
  // instructions.
  AMDGPU::Waitcnt RequiredWait;

  for (auto &II : make_range(OldWaitcntInstr.getIterator(), It)) {
    if (II.isMetaInstruction()) {
      continue;
    }

    MachineInstr **UpdatableInstr;

    // Update the required wait count. If this is a soft waitcnt (= it was
    // added by an earlier pass), it may be entirely removed.
    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    if (Opcode == AMDGPU::S_WAITCNT)
      continue;
    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
      // ...
      RequiredWait = RequiredWait.combined(OldWait);
      UpdatableInstr = &CombinedLoadDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
      // ...
      RequiredWait = RequiredWait.combined(OldWait);
      UpdatableInstr = &CombinedStoreDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
      // ...
      II.eraseFromParent();
      Modified = true;
      continue;
    } else {
      std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
      assert(CT.has_value());
      unsigned OldCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
      addWait(Wait, CT.value(), OldCnt);
      addWait(RequiredWait, CT.value(), OldCnt);
      UpdatableInstr = &WaitInstrs[CT.value()];
    }

    // Merge consecutive waitcnts of the same type by erasing multiples.
    if (!*UpdatableInstr) {
      *UpdatableInstr = &II;
    } else {
      II.eraseFromParent();
      Modified = true;
    }
  }

  ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
  Wait = Wait.combined(RequiredWait);

  if (CombinedLoadDsCntInstr) {
    // Only keep the combined form if both counters actually need waiting on;
    // otherwise the remaining single-counter wait is emitted below.
    if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
      ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->instr_end()
                     ? dbgs() << "applied pre-existing waitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedLoadDsCntInstr << '\n'
                     : dbgs() << "applied pre-existing waitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedLoadDsCntInstr << '\n');
    } else {
      // ...
    }
  }

  if (CombinedStoreDsCntInstr) {
    if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
      ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->instr_end()
                     ? dbgs() << "applied pre-existing waitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedStoreDsCntInstr << '\n'
                     : dbgs() << "applied pre-existing waitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedStoreDsCntInstr << '\n');
    } else {
      // ...
    }
  }

  // Look for an opportunity to fold single-counter waits into a combined
  // LOADCNT_DSCNT or STORECNT_DSCNT form; the originals become redundant.
  SmallVector<MachineInstr **> WaitsToErase;
  if (Wait.DsCnt != ~0u) {
    // ...
    if (Wait.LoadCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
    } else if (Wait.StoreCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
    }

    for (MachineInstr **WI : WaitsToErase) {
      if (!*WI)
        continue;
      (*WI)->eraseFromParent();
      *WI = nullptr;
      Modified = true;
    }
  }

  // Update or erase the remaining single-counter wait instructions.
  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    if (!WaitInstrs[CT])
      continue;

    unsigned NewCnt = getWait(Wait, CT);
    if (NewCnt != ~0u) {
      Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
                                           AMDGPU::OpName::simm16, NewCnt);
      Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);

      ScoreBrackets.applyWaitcnt(CT, NewCnt);
      setNoWait(Wait, CT);

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->instr_end()
                     ? dbgs() << "applied pre-existing waitcnt\n"
                              << "New Instr at block end: " << *WaitInstrs[CT]
                              << '\n'
                     : dbgs() << "applied pre-existing waitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitInstrs[CT] << '\n');
    } else {
      WaitInstrs[CT]->eraseFromParent();
      Modified = true;
    }
  }

  return Modified;
}

/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait.
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) {
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(It);

  // Check for opportunities to use combined wait instructions.
  if (Wait.DsCnt != ~0u) {
    MachineInstr *SWaitInst = nullptr;

    if (Wait.LoadCnt != ~0u) {
      unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);

      SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
                      .addImm(Enc);

      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;
    } else if (Wait.StoreCnt != ~0u) {
      unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);

      SWaitInst =
          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
              .addImm(Enc);

      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;
    }

    if (SWaitInst) {
      Modified = true;

      LLVM_DEBUG(if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  // Generate an instruction for any remaining counter that needs waiting for.
  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    unsigned Count = getWait(Wait, CT);
    if (Count == ~0u)
      continue;

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(Count);

    Modified = true;

    LLVM_DEBUG(if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
/// Generate the s_waitcnt (or S_WAIT_*CNT) instructions to be placed before
/// \p MI, based on the current score brackets.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                                 WaitcntBrackets &ScoreBrackets,
                                                 MachineInstr *OldWaitcntInstr,
                                                 bool FlushVmCnt) {
  setForceEmitWaitcnt();

  AMDGPU::Waitcnt Wait;
  const unsigned Opc = MI.getOpcode();

  // Buffer invalidates need the outstanding VMEM loads to complete first.
  if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
      Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
      Opc == AMDGPU::BUFFER_GL1_INV) {
    Wait.LoadCnt = 0;
  }

  // All waits must be resolved at call return.
  if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
      Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
      Opc == AMDGPU::S_SETPC_B64_return) {
    AMDGPU::Waitcnt AllZeroWait =
        WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
    if (ST->hasExtendedWaitCounts() &&
        !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
      AllZeroWait.XCnt = ~0u;
    Wait = Wait.combined(AllZeroWait);
  }
  // Identify S_ENDPGM instructions which may have to wait for outstanding
  // VMEM stores. In this case it can be useful to send a message to
  // explicitly release all VGPRs before the stores have completed.
  else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
    if (!WCG->isOptNone() &&
        (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
         (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
          ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
          !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
      ReleaseVGPRInsts.insert(&MI);
  }
  // Resolve vm waits before gs-done.
  else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
           ST->hasLegacyGeometry() &&
           /* ... */) {
    // ...
  } else {
    // Export & GDS instructions do not read the EXEC mask until after the
    // export is granted; all pending EXP operations must be flushed before
    // EXEC is overwritten.
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

    // Wait for any pending GDS instruction to complete before any
    // "Always GDS" instruction.
    if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
      addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());

    if (/* ... */) {
      // The callee is going to insert a wait on everything in its prologue;
      // we only need to protect the call address and the return address.
      Wait = AMDGPU::Waitcnt();

      const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI);
      if (CallAddrOp.isReg()) {
        ScoreBrackets.determineWaitForPhysReg(
            SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);

        if (const auto *RtnAddrOp =
                TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
          ScoreBrackets.determineWaitForPhysReg(
              SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
        }
      }
    } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
      ScoreBrackets.tryClearSCCWriteEvent(&MI);
    } else {
      // Look at the source and destination operands of this instruction to
      // see whether any of them depend on a previous memory operation that is
      // still outstanding.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        if (Memop->isStore()) {
          if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
            addWait(Wait, SmemAccessCounter, 0);
            // ...
            SLoadAddresses.erase(It);
          }
        }
        unsigned AS = Memop->getAddrSpace();
        // ...
        if (TII->mayWriteLDSThroughDMA(MI))
          continue;

        unsigned TID = LDSDMA_BEGIN;
        if (Ptr && Memop->getAAInfo()) {
          const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
          for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
            if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
              if ((I + 1) >= NUM_LDSDMA) {
                // Out of dedicated slots: fall back to the generic slot.
                ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
                break;
              }
              ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
            }
          }
        } else {
          ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
        }
        if (Memop->isStore()) {
          ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
        }
      }

      // Loop over use and def operands.
      for (const MachineOperand &Op : MI.operands()) {
        if (!Op.isReg())
          continue;

        // If the instruction does not read tied source, skip the operand.
        if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
          continue;

        MCPhysReg Reg = Op.getReg().asMCReg();
        const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());

        if (IsVGPR) {
          // Implicit VGPR defs and uses are never part of the memory
          // instruction's description; they usually only account for
          // super-register liveness.
          if (Op.isImplicit() && MI.mayLoadOrStore())
            continue;

          // RAW always needs a wait. WAW needs a wait unless the previous
          // write and this write are the same type of VMEM instruction (and
          // the target writes VGPRs in order), and likewise for instructions
          // where Point Sample Acceleration might apply.
          if (Op.isUse() || !updateVMCntOnly(MI) ||
              ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
              ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
              !ST->hasVmemWriteVgprInOrder()) {
            ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
            ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
            ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
            ScoreBrackets.clearVgprVmemTypes(Reg);
          }

          if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
            ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
          }
          ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
        } else if (Op.getReg() == AMDGPU::SCC) {
          ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
        } else {
          ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
        }

        if (ST->hasWaitXCnt() && Op.isDef())
          ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
      }
    }
  }

  // Barriers may need an explicit wait-for-everything if the target neither
  // waits automatically nor supports backing off the barrier.
  if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
      !ST->supportsBackOffBarrier()) {
    Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
  }

  // ...
  if (/* ... */
      ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
    // ...
  }

  // Verify that the wait is actually needed.
  ScoreBrackets.simplifyWaitcnt(Wait);

  if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
    ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
    // ...
  }

  // ...
    Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);

  if (ForceEmitWaitcnt[LOAD_CNT])
    Wait.LoadCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[DS_CNT])
    Wait.DsCnt = 0;
  if (ForceEmitWaitcnt[SAMPLE_CNT])
    Wait.SampleCnt = 0;
  if (ForceEmitWaitcnt[BVH_CNT])
    Wait.BvhCnt = 0;
  if (ForceEmitWaitcnt[KM_CNT])
    Wait.KmCnt = 0;
  if (ForceEmitWaitcnt[X_CNT])
    Wait.XCnt = 0;

  if (FlushVmCnt) {
    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
      Wait.LoadCnt = 0;
    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
      Wait.SampleCnt = 0;
    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
      Wait.BvhCnt = 0;
  }

  // ...
  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
                         OldWaitcntInstr);
}
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock::instr_iterator It,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  bool Modified = false;

  if (OldWaitcntInstr)
    // Try to merge the required wait with pre-existing waitcnt instructions.
    // Also erase redundant waitcnts.
    Modified =
        WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);

  // Any counts that could have been applied to any existing waitcnt
  // instructions will have been done so; now deal with any remaining.
  ScoreBrackets.applyWaitcnt(Wait);

  // ExpCnt can be merged into VINTERP.
  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
      SIInstrInfo::isVINTERP(*It)) {
    MachineOperand *WaitExp =
        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
    // ...

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                      << "Update Instr: " << *It);
  }

  if (WCG->createNewWaitcnt(Block, It, Wait))
    Modified = true;

  return Modified;
}

bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
  return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
         /* ... */;
}

bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
                                    MachineBasicBlock *Block) const {
  auto BlockEnd = Block->getParent()->end();
  auto BlockIter = Block->getIterator();

  // Skip to the next non-meta instruction, crossing into the next block if
  // necessary.
  // ...
      if (++BlockIter != BlockEnd) {
        It = BlockIter->instr_begin();
        // ...
      }
  // ...
    if (!It->isMetaInstruction())
      break;
  // ...

  return It->getOpcode() == AMDGPU::S_ENDPGM;
}

bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
                                             MachineBasicBlock &Block,
                                             WaitcntBrackets &ScoreBrackets) {
  AMDGPU::Waitcnt Wait;
  bool NeedsEndPGMCheck = false;
  // ...
    NeedsEndPGMCheck = true;
  // ...
  ScoreBrackets.simplifyWaitcnt(Wait);

  auto SuccessorIt = std::next(Inst.getIterator());
  bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
                                /*OldWaitcntInstr=*/nullptr);

  if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
    // ...
  }

  return Result;
}

void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Look at the instruction opcode. If it is a memory access instruction,
  // update the upper bound of the appropriate counter's bracket and the
  // destination operand scores.
  bool IsVMEMAccess = false;
  bool IsSMEMAccess = false;
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (/* ... */
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
      ScoreBrackets->setPendingGDS();
    } else {
      ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    // ...
      ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
    // ...
    int FlatASCount = 0;

    if (TII->mayAccessVMEMThroughFlat(Inst)) {
      ++FlatASCount;
      IsVMEMAccess = true;
      ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
    }

    if (TII->mayAccessLDSThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
    }

    // A flat memory operation that accesses both VMEM and LDS requires both
    // counters to be flushed to zero when a dependency occurs.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (/* ... */) {
    IsVMEMAccess = true;
    ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);

    if (ST->vmemWriteNeedsExpWaitcnt() &&
        /* ... */) {
      ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    IsSMEMAccess = true;
    ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    // ...
      ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
      ScoreBrackets->setStateOnFunctionEntryOrReturn();
    // ...
  } else if (SIInstrInfo::isLDSDIR(Inst)) {
    ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
  } else if (TII->isVINTERP(Inst)) {
    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
  } else if (SIInstrInfo::isEXP(Inst)) {
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (/* parameter export target */)
      ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
    else if (/* position export target */)
      ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
  } else if (/* ... */) {
    ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
      break;
    }
  }

  if (!ST->hasWaitXCnt())
    return;

  if (IsVMEMAccess)
    ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);

  if (IsSMEMAccess)
    ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
}
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) {
  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  unsigned OtherShifted =
      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted;
}

bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  // Make sure that all keys present in Other are also present here, so the
  // scores can be merged in place below.
  for (auto K : Other.VMem.keys())
    VMem.try_emplace(K);
  for (auto K : Other.SGPRs.keys())
    SGPRs.try_emplace(K);

  for (auto T : inst_counter_types(Context->MaxCounter)) {
    // Merge event flags for this counter.
    const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
    if (OtherEvents & ~OldEvents)
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter.
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])
      // ...

    MergeInfo M;
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

    if (T == DS_CNT)
      StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);

    if (T == KM_CNT) {
      StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
      if (Other.hasPendingEvent(SCC_WRITE)) {
        unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
        if (!OldEventsHasSCCWrite) {
          PendingSCCWrite = Other.PendingSCCWrite;
        } else if (PendingSCCWrite != Other.PendingSCCWrite) {
          PendingSCCWrite = nullptr;
        }
      }
    }

    for (auto &[RegID, Info] : VMem)
      StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));

    if (isSmemCounter(T)) {
      unsigned Idx = getSgprScoresIdx(T);
      for (auto &[RegID, Info] : SGPRs) {
        auto It = Other.SGPRs.find(RegID);
        unsigned OtherScore =
            (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
        StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore);
      }
    }
  }

  for (auto &[TID, Info] : VMem) {
    if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
      unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
      StrictDom |= NewVmemTypes != Info.VMEMTypes;
      Info.VMEMTypes = NewVmemTypes;
    }
  }

  purgeEmptyTrackingData();

  return StrictDom;
}
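
// merge() is the join operation of the block-level dataflow: brackets from a
// predecessor are widened into the current ones by shifting both score ranges
// onto a common upper bound, and the return value ("strict dominance") tells
// the driver loop in run() that the successor has to be revisited.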
static bool isWaitInstr(MachineInstr &Inst) {
  unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
  return Opcode == AMDGPU::S_WAITCNT ||
         (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
          Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
         Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
         Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
         Opcode == AMDGPU::S_WAITCNT_lds_direct ||
         counterTypeForInstr(Opcode).has_value();
}

// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Begin Block: ";
    // ...
    ScoreBrackets.dump();
  });

  // Track the correctness of vccz through this basic block; see
  // ST->hasReadVCCZBug() and ST->partialVCCWritesUpdateVCCZ() for why it may
  // become incorrect.
  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    // vccz could be incorrect at a basic block boundary if a predecessor
    // wrote to vcc and then issued an smem load.
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    // vccz could be incorrect at a basic block boundary if a predecessor
    // wrote to vcc_lo or vcc_hi.
    VCCZCorrect = false;
  }

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;
    // ...
    if (isWaitInstr(Inst)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
                      isPreheaderToFlush(Block, ScoreBrackets);

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushVmCnt);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it is not known to be correct already.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      // ...
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
      // ...
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          // Writes to vcc while an smem read is outstanding may be clobbered
          // as soon as any read completes.
          VCCZCorrect = false;
        }
      // ...
    }

    if (TII->isSMRD(Inst)) {
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
        }
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

    Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // Restore the vccz bit by copying vcc back onto itself, which updates
    // vccz as a side effect.
    // ...
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    // ...

    ++Iter;
  }

  // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block
  // if needed.
  AMDGPU::Waitcnt Wait;
  if (Block.getFirstTerminator() == Block.end() &&
      isPreheaderToFlush(Block, ScoreBrackets)) {
    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
      Wait.LoadCnt = 0;
    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
      Wait.SampleCnt = 0;
    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
      Wait.BvhCnt = 0;
  }

  // Combine or remove any redundant waitcnts at the end of the block.
  Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                              OldWaitcntInstr);

  LLVM_DEBUG({
    dbgs() << "*** End Block: ";
    // ...
    ScoreBrackets.dump();
  });

  return Modified;
}

// Return true if the given machine basic block is a preheader of a loop in
// which we want to flush the vmcnt counter, and false otherwise.
bool SIInsertWaitcnts::isPreheaderToFlush(
    MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
  auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
  if (!IsInserted)
    return Iterator->second;

  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
  if (!Succ)
    return false;

  MachineLoop *Loop = MLI->getLoopFor(Succ);
  if (!Loop)
    return false;

  if (Loop->getLoopPreheader() == &MBB &&
      shouldFlushVmCnt(Loop, ScoreBrackets)) {
    Iterator->second = true;
    return true;
  }

  return false;
}

bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
  if (SIInstrInfo::isFLAT(MI))
    return TII->mayAccessVMEMThroughFlat(MI);
  return SIInstrInfo::isVMEM(MI);
}

// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        const WaitcntBrackets &Brackets) {
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprLoadedOutside = false;
  DenseSet<MCRegUnit> VgprUse;
  DenseSet<MCRegUnit> VgprDef;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        HasVMemLoad |= MI.mayLoad();
        HasVMemStore |= MI.mayStore();
      }
      for (const MachineOperand &Op : MI.all_uses()) {
        if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
          continue;
        for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
          // VGPR use: if the register was loaded inside the loop, bail out.
          if (VgprDef.contains(RU))
            return false;
          VgprUse.insert(RU);
          // If at least one of Op's register units is in the score brackets,
          // the value was likely loaded outside of the loop.
          VMEMID ID = toVMEMID(RU);
          if (Brackets.getVMemScore(ID, LOAD_CNT) >
                  Brackets.getScoreLB(LOAD_CNT) ||
              Brackets.getVMemScore(ID, SAMPLE_CNT) >
                  Brackets.getScoreLB(SAMPLE_CNT) ||
              Brackets.getVMemScore(ID, BVH_CNT) >
                  Brackets.getScoreLB(BVH_CNT)) {
            UsesVgprLoadedOutside = true;
            break;
          }
        }
      }

      // VMEM load with a VGPR def.
      if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
        for (const MachineOperand &Op : MI.all_defs()) {
          for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
            if (VgprUse.contains(RU))
              return false;
            VgprDef.insert(RU);
          }
        }
      }
    }
  }
  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  return HasVMemLoad && UsesVgprLoadedOutside &&
         ST->hasVmemWriteVgprInOrder();
}
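
// The preheader flush is a heuristic: if a loop reads VGPRs that were loaded
// by VMEM before the loop and never redefines them inside it, waiting for
// loadcnt 0 once in the preheader is cheaper than re-waiting on every
// iteration; as soon as a register is both loaded and consumed inside the
// loop the analysis above bails out.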
bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
  auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  auto *PDT =
      &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  AliasAnalysis *AA = nullptr;
  if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
    AA = &AAR->getAAResults();

  return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
}

PreservedAnalyses
SIInsertWaitcntsPass::run(MachineFunction &MF,
                          MachineFunctionAnalysisManager &MFAM) {
  // ...
  if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
    return PreservedAnalyses::all();

  return getMachineFunctionPassPreservedAnalyses()
      .preserveSet<CFGAnalyses>()
      .preserve<AAManager>();
}

bool SIInsertWaitcnts::run(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  // ...

  if (ST->hasExtendedWaitCounts()) {
    MaxCounter = NUM_EXTENDED_INST_CNTS;
    WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
    WCG = &WCGGFX12Plus;
  } else {
    MaxCounter = NUM_NORMAL_INST_CNTS;
    WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, MaxCounter);
    WCG = &WCGPreGFX12;
  }

  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  WaitEventMaskForInst = WCG->getWaitEventMask();

  SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);

  if (ST->hasExtendedWaitCounts()) {
    // Initialize Limits from the extended counter widths.
    // ...
  } else {
    // Initialize Limits from the legacy vmcnt/expcnt/lgkmcnt widths.
    // ...
  }

  bool Modified = false;

  MachineBasicBlock &EntryBB = MF.front();
  MachineBasicBlock::iterator I = EntryBB.begin();

  // At the start of a kernel, wait for the counters that may cover loads of
  // the kernel's input registers.
  // ...
    while (I != EntryBB.end() && I->isMetaInstruction())
      ++I;

    if (ST->hasExtendedWaitCounts()) {
      // ...
      for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
          continue;

        if (!ST->hasImageInsts() &&
            (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
          continue;

        BuildMI(EntryBB, I, DebugLoc(),
                TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(0);
      }
    } else {
      // ...
    }

  // For functions that are not kernel entry points, assume an unknown number
  // of outstanding stores on entry.
  // ...
    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
    NonKernelInitialState->setStateOnFunctionEntryOrReturn();
    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
  // ...

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fix point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.try_emplace(MBB);

  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      MachineBasicBlock *MBB = BII->first;
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(this);
        else
          *Brackets = WaitcntBrackets(this);
      }

      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPendingEvent()) {
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto *SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII) {
              Repeat = true;
            }
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else if (SuccBI.Incoming->merge(*Brackets)) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII) {
              Repeat = true;
            }
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  if (ST->hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(&MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed before program
      // end, or the next wave reusing the same scratch memory can be
      // clobbered.
      // ...
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(*I))
            SeenDCacheWB = false;

          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            // ...
          }
        }
      }
    }
  }

  // Release VGPRs before the S_ENDPGM instructions recorded earlier.
  // ...
    for (MachineInstr *MI : ReleaseVGPRInsts) {
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
              TII->get(AMDGPU::S_ALLOC_VGPR))
          .addImm(0);
      Modified = true;
    }
  // ...
  if (!ReleaseVGPRInsts.empty() &&
      (MF.getFrameInfo().hasCalls() ||
       ST->getOccupancyWithNumVGPRs(
           TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
           /* ... */))) {
    for (MachineInstr *MI : ReleaseVGPRInsts) {
      if (ST->requiresNopBeforeDeallocVGPRs()) {
        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                TII->get(AMDGPU::S_NOP))
            .addImm(0);
      }
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
              TII->get(AMDGPU::S_SENDMSG))
          .addImm(/* DEALLOC_VGPRS message */);
      Modified = true;
    }
  }
  ReleaseVGPRInsts.clear();
  PreheadersToFlush.clear();
  SLoadAddresses.clear();

  return Modified;
}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
#define AMDGPU_EVENT_NAME(Name)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
Provides some synthesis utilities to produce sequences of values.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
A manager for alias analyses.
bool isEntryFunction() const
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Represents analyses that only rely on functions' control flow.
static bool shouldExecute(CounterInfo &Counter)
static bool isCounterSet(CounterInfo &Info)
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
instr_iterator instr_end()
iterator_range< succ_iterator > successors()
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
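A hedged sketch of how addImm/addReg combine with BuildMI (listed later in this index) to materialize wait instructions; the insertion point, DebugLoc, and immediate encoding are assumed to be computed elsewhere:

// Plain s_waitcnt: a single immediate operand holding the packed counts.
BuildMI(MBB, It, DL, TII->get(AMDGPU::S_WAITCNT))
    .addImm(Enc);
// A form that also takes a register operand; the register is marked undef
// because only the immediate count matters.
BuildMI(MBB, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
    .addImm(0);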
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
bool isMetaInstruction(QueryType Type=IgnoreBundle) const
Return true if this instruction doesn't produce any output in the form of executable instructions.
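A short sketch combining the MachineInstr queries above to decide whether an instruction is interesting for counter tracking; recordDef is a hypothetical bookkeeping helper:

bool Interesting = !MI.isMetaInstruction() &&
                   (MI.mayLoadOrStore() || MI.isCall());
if (Interesting)
  for (const MachineOperand &MO : MI.all_defs())         // explicit + implicit register defs
    recordDef(MO.getReg());                              // hypothetical score-bracket update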
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
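As an illustration of the operand accessors above, an update-if-different shape similar in spirit to the helper listed near the top of this index; the operand index and NewEnc value are assumptions:

MachineOperand &Op = MI.getOperand(0);     // assumed immediate operand
if (Op.isImm() && Op.getImm() != NewEnc) {
  Op.setImm(NewEnc);                       // rewrite the packed wait encoding in place
  Changed = true;
}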
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
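A minimal sketch of the new-pass-manager entry-point shape suggested by the entries above; the pass name and the insertWaits worker are assumptions:

PreservedAnalyses WaitcntSketchPass::run(MachineFunction &MF,
                                         MachineFunctionAnalysisManager &MFAM) {
  bool Changed = insertWaits(MF);                        // hypothetical worker
  if (!Changed)
    return PreservedAnalyses::all();                     // nothing invalidated
  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();                         // the CFG is untouched
  return PA;
}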
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
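A rough classification sketch using a few of the static SIInstrInfo predicates above; the bucket names are informal and not the pass's actual event categories:

enum class TrafficKind { Memory, Export, LDS, Other };   // hypothetical buckets
TrafficKind classify(const MachineInstr &MI) {
  if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI))
    return TrafficKind::Memory;          // load/store-style counter traffic
  if (SIInstrInfo::isEXP(MI))
    return TrafficKind::Export;          // export traffic
  if (SIInstrInfo::isLDSDMA(MI) || SIInstrInfo::isGWS(MI))
    return TrafficKind::LDS;             // LDS DMA / GWS style events
  return TrafficKind::Other;
}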
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
bool isDynamicVGPREnabled() const
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs with the length computed at compile time.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
@ ID_DEALLOC_VGPRS_GFX11Plus
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values into Vmcnt, Expcnt and Lgkmcnt respectively.
unsigned getStorecntBitMask(const IsaVersion &Version)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
unsigned getSamplecntBitMask(const IsaVersion &Version)
unsigned getKmcntBitMask(const IsaVersion &Version)
unsigned getVmcntBitMask(const IsaVersion &Version)
unsigned getXcntBitMask(const IsaVersion &Version)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getBvhcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned getLoadcntBitMask(const IsaVersion &Version)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned getDscntBitMask(const IsaVersion &Version)
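A hedged round-trip sketch of the legacy encode/decode helpers above; the count values are arbitrary and ST is an assumed subtarget reference:

AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
unsigned Enc = AMDGPU::encodeWaitcnt(IV, /*Vmcnt=*/0,
                                     /*Expcnt=*/AMDGPU::getExpcntBitMask(IV),  // field max = "no wait"
                                     /*Lgkmcnt=*/0);
unsigned Vm, Exp, Lgkm;
AMDGPU::decodeWaitcnt(IV, Enc, Vm, Exp, Lgkm);           // recovers 0, field max, 0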
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ Undef
Value of the register doesn't matter.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
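A small sketch of enum_seq; the enum here is hypothetical and, because it does not specialize enum_iteration_traits (the is_iterable flag listed at the end of this index), it is iterated with the explicit opt-in tag:

enum Stage : unsigned { Fetch, Decode, Execute, NumStages };
for (Stage S : enum_seq(Fetch, NumStages, force_iteration_on_noniterable_enum))
  handleStage(S);                                        // hypothetical per-stage hook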
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
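A short sketch of the early-increment range above, which is the usual way to erase instructions while walking a block; the opcode test is arbitrary:

for (MachineInstr &MI : make_early_inc_range(MBB.instrs()))
  if (MI.getOpcode() == AMDGPU::S_NOP)                   // arbitrary example condition
    MI.eraseFromParent();                                // safe: the iterator has already advanced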
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
CodeGenOptLevel
Code generation optimization level.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
DWARFExpression::Operation Op
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
Instruction set architecture version.
Represents the counter values to wait for in an s_waitcnt instruction.
static constexpr bool is_iterable