LLVM 23.0.0git
SIInsertWaitcnts.cpp
Go to the documentation of this file.
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "AMDGPUWaitcntUtils.h"
28#include "GCNSubtarget.h"
32#include "llvm/ADT/MapVector.h"
34#include "llvm/ADT/Sequence.h"
40#include "llvm/IR/Dominators.h"
44
45using namespace llvm;
46
47#define DEBUG_TYPE "si-insert-waitcnts"
48
// Debug counters for forcing zero waits on individual hardware counters.
// In builds with assertions enabled, SIInsertWaitcnts::setForceEmitWaitcnt()
// queries these and sets the matching ForceEmitWaitcnt entries.
49DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
50 "Force emit s_waitcnt expcnt(0) instrs");
51DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
52 "Force emit s_waitcnt lgkmcnt(0) instrs");
53DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
54 "Force emit s_waitcnt vmcnt(0) instrs");
55
56static cl::opt<bool>
57 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
58 cl::desc("Force all waitcnt instrs to be emitted as "
59 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
60 cl::init(false), cl::Hidden);
61
63 "amdgpu-waitcnt-load-forcezero",
64 cl::desc("Force all waitcnt load counters to wait until 0"),
65 cl::init(false), cl::Hidden);
66
68 "amdgpu-expert-scheduling-mode",
69 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
70 cl::init(false), cl::Hidden);
71
72namespace {
73// Get the maximum wait count value for a given counter type.
74static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
76 switch (T) {
78 return Limits.LoadcntMax;
79 case AMDGPU::DS_CNT:
80 return Limits.DscntMax;
81 case AMDGPU::EXP_CNT:
82 return Limits.ExpcntMax;
84 return Limits.StorecntMax;
86 return Limits.SamplecntMax;
87 case AMDGPU::BVH_CNT:
88 return Limits.BvhcntMax;
89 case AMDGPU::KM_CNT:
90 return Limits.KmcntMax;
91 case AMDGPU::X_CNT:
92 return Limits.XcntMax;
93 case AMDGPU::VA_VDST:
94 return Limits.VaVdstMax;
95 case AMDGPU::VM_VSRC:
96 return Limits.VmVsrcMax;
97 default:
98 return 0;
99 }
100}
101
102/// Integer IDs used to track vector memory locations we may have to wait on.
103/// Encoded as u16 chunks:
104///
105/// [0, REGUNITS_END ): MCRegUnit
106/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs
107///
108/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
109/// It gives (1 << 16) entries per category, which is more than enough
110/// for all register units. MCPhysReg is u16 so we don't even support >u16
111/// physical register numbers at this time, let alone >u16 register units.
112/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
113/// is enough for all register units.
114using VMEMID = uint32_t;
115
116enum : VMEMID {
117 TRACKINGID_RANGE_LEN = (1 << 16),
118
119 // Important: MCRegUnits must always be tracked starting from 0, as we
120 // need to be able to convert between a MCRegUnit and a VMEMID freely.
121 REGUNITS_BEGIN = 0,
122 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
123
124 // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
125 // entry, which is updated for all LDS DMA operations encountered.
126 // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
127 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
128 LDSDMA_BEGIN = REGUNITS_END,
129 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
130};
131
132/// Convert a MCRegUnit to a VMEMID.
133static constexpr VMEMID toVMEMID(MCRegUnit RU) {
134 return static_cast<unsigned>(RU);
135}
136
137#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
138 DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \
139 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
140 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
141 DECL(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */ \
142 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
143 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
144 DECL(VMEM_GROUP) /* vmem group */ \
145 DECL(LDS_ACCESS) /* lds read & write */ \
146 DECL(GDS_ACCESS) /* gds read & write */ \
147 DECL(SQ_MESSAGE) /* send message */ \
148 DECL(SCC_WRITE) /* write to SCC from barrier */ \
149 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
150 DECL(SMEM_GROUP) /* scalar-memory group */ \
151 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
152 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
153 DECL(EXP_POS_ACCESS) /* write to export position */ \
154 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
155 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
156 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */ \
157 DECL(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */ \
158 DECL(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */ \
159 DECL(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */ \
160 DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \
161 DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \
162 DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \
163 DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */ \
164 DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */
165
166// clang-format off
167#define AMDGPU_EVENT_ENUM(Name) Name,
168enum WaitEventType {
170 NUM_WAIT_EVENTS
171};
172#undef AMDGPU_EVENT_ENUM
173} // namespace
174
175namespace llvm {
176template <> struct enum_iteration_traits<WaitEventType> {
177 static constexpr bool is_iterable = true;
178};
179} // namespace llvm
180
181namespace {
182
183/// Return an iterator over all events between VMEM_ACCESS (the first event)
184/// and \c MaxEvent (exclusive, default value yields an enumeration over
185/// all counters).
186auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
187 return enum_seq(VMEM_ACCESS, MaxEvent);
188}
189
190#define AMDGPU_EVENT_NAME(Name) #Name,
191static constexpr StringLiteral WaitEventTypeName[] = {
193};
194#undef AMDGPU_EVENT_NAME
195static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
196 return WaitEventTypeName[Event];
197}
198// clang-format on
199
200// Enumerate different types of result-returning VMEM operations. Although
201// s_waitcnt orders them all with a single vmcnt counter, in the absence of
202// s_waitcnt only instructions of the same VmemType are guaranteed to write
203// their results in order -- so there is no need to insert an s_waitcnt between
204// two instructions of the same type that write the same vgpr.
205enum VmemType {
206 // BUF instructions and MIMG instructions without a sampler.
207 VMEM_NOSAMPLER,
208 // MIMG instructions with a sampler.
209 VMEM_SAMPLER,
210 // BVH instructions
211 VMEM_BVH,
212 NUM_VMEM_TYPES
213};
214
215// Maps values of InstCounterType to the instruction that waits on that
216// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
217// returns true, and does not cover VA_VDST or VM_VSRC.
218static const unsigned
219 instrsForExtendedCounterTypes[AMDGPU::NUM_EXTENDED_INST_CNTS] = {
220 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,
221 AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
222 AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
223 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
224 AMDGPU::S_WAIT_ASYNCCNT};
225
226// ASYNCMARK and WAIT_ASYNCMARK are meta instructions that emit no hardware
227// code but still need to be processed by this pass for async vmcnt tracking.
228static bool isNonWaitcntMetaInst(const MachineInstr &MI) {
229 switch (MI.getOpcode()) {
230 case AMDGPU::ASYNCMARK:
231 case AMDGPU::WAIT_ASYNCMARK:
232 return false;
233 default:
234 return MI.isMetaInstruction();
235 }
236}
237
238static bool updateVMCntOnly(const MachineInstr &Inst) {
239 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
241}
242
243#ifndef NDEBUG
244static bool isNormalMode(AMDGPU::InstCounterType MaxCounter) {
245 return MaxCounter == AMDGPU::NUM_NORMAL_INST_CNTS;
246}
247#endif // NDEBUG
248
249VmemType getVmemType(const MachineInstr &Inst) {
250 assert(updateVMCntOnly(Inst));
251 if (!SIInstrInfo::isImage(Inst))
252 return VMEM_NOSAMPLER;
253 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
254 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
255 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
256
257 if (BaseInfo->BVH)
258 return VMEM_BVH;
259
260 // We have to make an additional check for isVSAMPLE here since some
261 // instructions don't have a sampler, but are still classified as sampler
262 // instructions for the purposes of e.g. waitcnt.
263 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
264 return VMEM_SAMPLER;
265
266 return VMEM_NOSAMPLER;
267}
268
269void addWait(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T, unsigned Count) {
270 Wait.set(T, std::min(Wait.get(T), Count));
271}
272
274 Wait.set(T, ~0u);
275}
276
277/// A small set of events.
278class WaitEventSet {
279 unsigned Mask = 0;
280
281public:
282 WaitEventSet() = default;
283 explicit constexpr WaitEventSet(WaitEventType Event) {
284 static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,
285 "Not enough bits in Mask for all the events");
286 Mask |= 1 << Event;
287 }
288 constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
289 for (auto &E : Events) {
290 Mask |= 1 << E;
291 }
292 }
293 void insert(const WaitEventType &Event) { Mask |= 1 << Event; }
294 void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }
295 void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
296 bool contains(const WaitEventType &Event) const {
297 return Mask & (1 << Event);
298 }
299 /// \Returns true if this set contains all elements of \p Other.
300 bool contains(const WaitEventSet &Other) const {
301 return (~Mask & Other.Mask) == 0;
302 }
303 /// \Returns the intersection of this and \p Other.
304 WaitEventSet operator&(const WaitEventSet &Other) const {
305 auto Copy = *this;
306 Copy.Mask &= Other.Mask;
307 return Copy;
308 }
309 /// \Returns the union of this and \p Other.
310 WaitEventSet operator|(const WaitEventSet &Other) const {
311 auto Copy = *this;
312 Copy.Mask |= Other.Mask;
313 return Copy;
314 }
315 /// This set becomes the union of this and \p Other.
316 WaitEventSet &operator|=(const WaitEventSet &Other) {
317 Mask |= Other.Mask;
318 return *this;
319 }
320 /// This set becomes the intersection of this and \p Other.
321 WaitEventSet &operator&=(const WaitEventSet &Other) {
322 Mask &= Other.Mask;
323 return *this;
324 }
325 bool operator==(const WaitEventSet &Other) const {
326 return Mask == Other.Mask;
327 }
328 bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }
329 bool empty() const { return Mask == 0; }
330 /// \Returns true if the set contains more than one element.
331 bool twoOrMore() const { return Mask & (Mask - 1); }
332 operator bool() const { return !empty(); }
333 void print(raw_ostream &OS) const {
334 ListSeparator LS(", ");
335 for (WaitEventType Event : wait_events()) {
336 if (contains(Event))
337 OS << LS << getWaitEventTypeName(Event);
338 }
339 }
340 LLVM_DUMP_METHOD void dump() const;
341};
342
343void WaitEventSet::dump() const {
344 print(dbgs());
345 dbgs() << "\n";
346}
347
348class WaitcntBrackets;
349
350// This abstracts the logic for generating and updating S_WAIT* instructions
351// away from the analysis that determines where they are needed. This was
352// done because the set of counters and instructions for waiting on them
353// underwent a major shift with gfx12, sufficiently so that having this
354// abstraction allows the main analysis logic to be simpler than it would
355// otherwise have had to become.
356class WaitcntGenerator {
357protected:
358 const GCNSubtarget &ST;
359 const SIInstrInfo &TII;
360 AMDGPU::IsaVersion IV;
361 AMDGPU::InstCounterType MaxCounter;
362 bool OptNone;
363 bool ExpandWaitcntProfiling = false;
364 const AMDGPU::HardwareLimits &Limits;
365
366public:
367 WaitcntGenerator() = delete;
368 WaitcntGenerator(const WaitcntGenerator &) = delete;
369 WaitcntGenerator(const MachineFunction &MF,
370 AMDGPU::InstCounterType MaxCounter,
371 const AMDGPU::HardwareLimits &Limits)
372 : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
373 IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
374 OptNone(MF.getFunction().hasOptNone() ||
375 MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
376 ExpandWaitcntProfiling(
377 MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
378 Limits(Limits) {}
379
380 // Return true if the current function should be compiled with no
381 // optimization.
382 bool isOptNone() const { return OptNone; }
383
384 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
385
386 // Edits an existing sequence of wait count instructions according
387 // to an incoming Waitcnt value, which is itself updated to reflect
388 // any new wait count instructions which may need to be generated by
389 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
390 // were made.
391 //
392 // This editing will usually be merely updated operands, but it may also
393 // delete instructions if the incoming Wait value indicates they are not
394 // needed. It may also remove existing instructions for which a wait
395 // is needed if it can be determined that it is better to generate new
396 // instructions later, as can happen on gfx12.
397 virtual bool
398 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
399 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
401
402 // Transform a soft waitcnt into a normal one.
403 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
404
405 // Generates new wait count instructions according to the value of
406 // Wait, returning true if any new instructions were created.
407 // ScoreBrackets is used for profiling expansion.
408 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
410 AMDGPU::Waitcnt Wait,
411 const WaitcntBrackets &ScoreBrackets) = 0;
412
413 // Returns the WaitEventSet that corresponds to counter \p T.
414 virtual const WaitEventSet &
415 getWaitEvents(AMDGPU::InstCounterType T) const = 0;
416
417 /// \returns the counter that corresponds to event \p E.
418 AMDGPU::InstCounterType getCounterFromEvent(WaitEventType E) const {
419 for (auto T : AMDGPU::inst_counter_types()) {
420 if (getWaitEvents(T).contains(E))
421 return T;
422 }
423 llvm_unreachable("event type has no associated counter");
424 }
425
426 // Returns a new waitcnt with all counters except VScnt set to 0. If
427 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
428 // AsyncCnt always defaults to ~0u (don't wait for it). It is only updated
429 // when a call to @llvm.amdgcn.wait.asyncmark() is processed.
430 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
431
432 virtual ~WaitcntGenerator() = default;
433};
434
435class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
436 static constexpr const WaitEventSet
437 WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
438 WaitEventSet(
439 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
440 WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
441 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
442 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
443 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
444 WaitEventSet(),
445 WaitEventSet(),
446 WaitEventSet(),
447 WaitEventSet(),
448 WaitEventSet(),
449 WaitEventSet()};
450
451public:
452 using WaitcntGenerator::WaitcntGenerator;
453 bool
454 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
455 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
456 MachineBasicBlock::instr_iterator It) const override;
457
458 bool createNewWaitcnt(MachineBasicBlock &Block,
460 AMDGPU::Waitcnt Wait,
461 const WaitcntBrackets &ScoreBrackets) override;
462
463 const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
464 return WaitEventMaskForInstPreGFX12[T];
465 }
466
467 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
468};
469
470class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
471protected:
472 bool IsExpertMode;
473 static constexpr const WaitEventSet
474 WaitEventMaskForInstGFX12Plus[AMDGPU::NUM_INST_CNTS] = {
475 WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
476 WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
477 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
478 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
479 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
480 WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
481 WaitEventSet({VMEM_BVH_READ_ACCESS}),
482 WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
483 WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
484 WaitEventSet({ASYNC_ACCESS}),
485 WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
486 VGPR_XDL_WRITE}),
487 WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
488
489public:
490 WaitcntGeneratorGFX12Plus() = delete;
491 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
492 AMDGPU::InstCounterType MaxCounter,
493 const AMDGPU::HardwareLimits &Limits,
494 bool IsExpertMode)
495 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
496
497 bool
498 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
499 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
500 MachineBasicBlock::instr_iterator It) const override;
501
502 bool createNewWaitcnt(MachineBasicBlock &Block,
504 AMDGPU::Waitcnt Wait,
505 const WaitcntBrackets &ScoreBrackets) override;
506
507 const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
508 return WaitEventMaskForInstGFX12Plus[T];
509 }
510
511 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
512};
513
// Records which counters should be flushed in a loop preheader. Both flags
// start out false.
struct PreheaderFlushFlags {
  bool FlushVmCnt{false};
  bool FlushDsCnt{false};
};
519
520class SIInsertWaitcnts {
521 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
522 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
523 MachineLoopInfo &MLI;
524 MachinePostDominatorTree &PDT;
525 AliasAnalysis *AA = nullptr;
526 MachineFunction &MF;
527
528 struct BlockInfo {
529 std::unique_ptr<WaitcntBrackets> Incoming;
530 bool Dirty = true;
531 BlockInfo() = default;
532 BlockInfo(BlockInfo &&) = default;
533 BlockInfo &operator=(BlockInfo &&) = default;
534 ~BlockInfo();
535 };
536
537 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
538
539 bool ForceEmitWaitcnt[AMDGPU::NUM_INST_CNTS] = {};
540
541 std::unique_ptr<WaitcntGenerator> WCG;
542
543 // Remember call and return instructions in the function.
544 DenseSet<MachineInstr *> CallInsts;
545 DenseSet<MachineInstr *> ReturnInsts;
546
547 // Remember all S_ENDPGM instructions. The boolean flag is true if there might
548 // be outstanding stores but definitely no outstanding scratch stores, to help
549 // with insertion of DEALLOC_VGPRS messages.
550 DenseMap<MachineInstr *, bool> EndPgmInsts;
551
552 AMDGPU::HardwareLimits Limits;
553
554public:
555 const GCNSubtarget &ST;
556 const SIInstrInfo &TII;
557 const SIRegisterInfo &TRI;
558 const MachineRegisterInfo &MRI;
559 AMDGPU::InstCounterType SmemAccessCounter;
560 AMDGPU::InstCounterType MaxCounter;
561 bool IsExpertMode = false;
562
563 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
564 AliasAnalysis *AA, MachineFunction &MF)
565 : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
566 TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
567 MRI(MF.getRegInfo()) {
568 (void)ForceExpCounter;
569 (void)ForceLgkmCounter;
570 (void)ForceVMCounter;
571 }
572
573 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
574
575 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
576 const WaitcntBrackets &Brackets);
577 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
578 const WaitcntBrackets &ScoreBrackets);
579 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
580 bool isDSRead(const MachineInstr &MI) const;
581 bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
582 bool run();
583
584 void setForceEmitWaitcnt() {
585// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
586// For debug builds, get the debug counter info and adjust if need be
587#ifndef NDEBUG
588 if (DebugCounter::isCounterSet(ForceExpCounter) &&
589 DebugCounter::shouldExecute(ForceExpCounter)) {
590 ForceEmitWaitcnt[AMDGPU::EXP_CNT] = true;
591 } else {
592 ForceEmitWaitcnt[AMDGPU::EXP_CNT] = false;
593 }
594
595 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
596 DebugCounter::shouldExecute(ForceLgkmCounter)) {
597 ForceEmitWaitcnt[AMDGPU::DS_CNT] = true;
598 ForceEmitWaitcnt[AMDGPU::KM_CNT] = true;
599 } else {
600 ForceEmitWaitcnt[AMDGPU::DS_CNT] = false;
601 ForceEmitWaitcnt[AMDGPU::KM_CNT] = false;
602 }
603
604 if (DebugCounter::isCounterSet(ForceVMCounter) &&
605 DebugCounter::shouldExecute(ForceVMCounter)) {
606 ForceEmitWaitcnt[AMDGPU::LOAD_CNT] = true;
607 ForceEmitWaitcnt[AMDGPU::SAMPLE_CNT] = true;
608 ForceEmitWaitcnt[AMDGPU::BVH_CNT] = true;
609 } else {
610 ForceEmitWaitcnt[AMDGPU::LOAD_CNT] = false;
611 ForceEmitWaitcnt[AMDGPU::SAMPLE_CNT] = false;
612 ForceEmitWaitcnt[AMDGPU::BVH_CNT] = false;
613 }
614
615 ForceEmitWaitcnt[AMDGPU::VA_VDST] = false;
616 ForceEmitWaitcnt[AMDGPU::VM_VSRC] = false;
617#endif // NDEBUG
618 }
619
620 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
621 // instruction.
622 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
623 switch (Inst.getOpcode()) {
624 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
625 case AMDGPU::GLOBAL_INV:
626 return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
627 // VGPRs
628 case AMDGPU::GLOBAL_WB:
629 case AMDGPU::GLOBAL_WBINV:
630 return VMEM_WRITE_ACCESS; // tracked using storecnt
631 default:
632 break;
633 }
634
635 // Maps VMEM access types to their corresponding WaitEventType.
636 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
637 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
638
640 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
641 // these should use VM_CNT.
642 if (!ST.hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
643 return VMEM_ACCESS;
644 if (Inst.mayStore() &&
645 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
646 if (TII.mayAccessScratch(Inst))
647 return SCRATCH_WRITE_ACCESS;
648 return VMEM_WRITE_ACCESS;
649 }
650 if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
651 return VMEM_ACCESS;
652 return VmemReadMapping[getVmemType(Inst)];
653 }
654
655 std::optional<WaitEventType>
656 getExpertSchedulingEventType(const MachineInstr &Inst) const;
657
658 bool isAsync(const MachineInstr &MI) const {
660 return false;
662 return true;
663 const MachineOperand *Async =
664 TII.getNamedOperand(MI, AMDGPU::OpName::IsAsync);
665 return Async && (Async->getImm());
666 }
667
668 bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
669 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
670 }
671
672 bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
673 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
674 }
675
676 bool shouldUpdateAsyncMark(const MachineInstr &MI,
678 if (!isAsyncLdsDmaWrite(MI))
679 return false;
681 return T == AMDGPU::ASYNC_CNT;
682 return T == AMDGPU::LOAD_CNT;
683 }
684
685 bool isVmemAccess(const MachineInstr &MI) const;
686 bool generateWaitcntInstBefore(MachineInstr &MI,
687 WaitcntBrackets &ScoreBrackets,
688 MachineInstr *OldWaitcntInstr,
689 PreheaderFlushFlags FlushFlags);
690 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
692 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
693 MachineInstr *OldWaitcntInstr);
694 /// \returns all events that correspond to \p Inst.
695 WaitEventSet getEventsFor(const MachineInstr &Inst) const;
696 void updateEventWaitcntAfter(MachineInstr &Inst,
697 WaitcntBrackets *ScoreBrackets);
698 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
699 MachineBasicBlock *Block) const;
700 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
701 WaitcntBrackets &ScoreBrackets);
702 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
703 WaitcntBrackets &ScoreBrackets);
704 /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
705 /// Legalizer. Returns true if block was modified.
706 bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
707 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
708 bool ExpertMode) const;
709 const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const {
710 return WCG->getWaitEvents(T);
711 }
712 AMDGPU::InstCounterType getCounterFromEvent(WaitEventType E) const {
713 return WCG->getCounterFromEvent(E);
714 }
715};
716
717// This object maintains the current score brackets of each wait counter, and
718// a per-register scoreboard for each wait counter.
719//
720// We also maintain the latest score for every event type that can change the
721// waitcnt in order to know if there are multiple types of events within
722// the brackets. When multiple types of event happen in the bracket,
723// wait count may get decreased out of order, therefore we need to put in
724// "s_waitcnt 0" before use.
725class WaitcntBrackets {
726public:
727 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
728 assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
729 }
730
731#ifndef NDEBUG
732 ~WaitcntBrackets() {
733 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
734 for (auto &[ID, Val] : VMem) {
735 if (Val.empty())
736 ++NumUnusedVmem;
737 }
738 for (auto &[ID, Val] : SGPRs) {
739 if (Val.empty())
740 ++NumUnusedSGPRs;
741 }
742
743 if (NumUnusedVmem || NumUnusedSGPRs) {
744 errs() << "WaitcntBracket had unused entries at destruction time: "
745 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
746 << " SGPR unused entries\n";
747 std::abort();
748 }
749 }
750#endif
751
752 bool isSmemCounter(AMDGPU::InstCounterType T) const {
753 return T == Context->SmemAccessCounter || T == AMDGPU::X_CNT;
754 }
755
756 unsigned getOutstanding(AMDGPU::InstCounterType T) const {
757 return ScoreUBs[T] - ScoreLBs[T];
758 }
759
760 bool hasPendingVMEM(VMEMID ID, AMDGPU::InstCounterType T) const {
761 return getVMemScore(ID, T) > getScoreLB(T);
762 }
763
764 /// \Return true if we have no score entries for counter \p T.
765 bool empty(AMDGPU::InstCounterType T) const { return getScoreRange(T) == 0; }
766
767private:
768 unsigned getScoreLB(AMDGPU::InstCounterType T) const {
770 return ScoreLBs[T];
771 }
772
773 unsigned getScoreUB(AMDGPU::InstCounterType T) const {
775 return ScoreUBs[T];
776 }
777
778 unsigned getScoreRange(AMDGPU::InstCounterType T) const {
779 return getScoreUB(T) - getScoreLB(T);
780 }
781
782 unsigned getSGPRScore(MCRegUnit RU, AMDGPU::InstCounterType T) const {
783 auto It = SGPRs.find(RU);
784 return It != SGPRs.end() ? It->second.get(T) : 0;
785 }
786
787 unsigned getVMemScore(VMEMID TID, AMDGPU::InstCounterType T) const {
788 auto It = VMem.find(TID);
789 return It != VMem.end() ? It->second.Scores[T] : 0;
790 }
791
792public:
793 bool merge(const WaitcntBrackets &Other);
794
795 bool counterOutOfOrder(AMDGPU::InstCounterType T) const;
796 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
797 simplifyWaitcnt(Wait, Wait);
798 }
799 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
800 AMDGPU::Waitcnt &UpdateWait) const;
801 void simplifyWaitcnt(AMDGPU::InstCounterType T, unsigned &Count) const;
802 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T) const;
803 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
804 AMDGPU::Waitcnt &UpdateWait) const;
805 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
806 AMDGPU::Waitcnt &UpdateWait) const;
807
808 void determineWaitForPhysReg(AMDGPU::InstCounterType T, MCPhysReg Reg,
809 AMDGPU::Waitcnt &Wait,
810 const MachineInstr &MI) const;
811 MCPhysReg determineVGPR16Dependency(const MachineInstr &MI,
813 MCPhysReg Reg) const;
814 void determineWaitForLDSDMA(AMDGPU::InstCounterType T, VMEMID TID,
815 AMDGPU::Waitcnt &Wait) const;
816 AMDGPU::Waitcnt determineAsyncWait(unsigned N);
817 void tryClearSCCWriteEvent(MachineInstr *Inst);
818
819 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
820 void applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count);
821 void applyWaitcnt(const AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T);
822 void updateByEvent(WaitEventType E, MachineInstr &MI);
823 void recordAsyncMark(MachineInstr &MI);
824
825 bool hasPendingEvent() const { return !PendingEvents.empty(); }
826 bool hasPendingEvent(WaitEventType E) const {
827 return PendingEvents.contains(E);
828 }
829 bool hasPendingEvent(AMDGPU::InstCounterType T) const {
830 bool HasPending = PendingEvents & Context->getWaitEvents(T);
831 assert(HasPending == !empty(T) &&
832 "Expected pending events iff scoreboard is not empty");
833 return HasPending;
834 }
835
836 bool hasMixedPendingEvents(AMDGPU::InstCounterType T) const {
837 WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
838 // Return true if more than one bit is set in Events.
839 return Events.twoOrMore();
840 }
841
842 bool hasPendingFlat() const {
843 return ((LastFlatDsCnt > ScoreLBs[AMDGPU::DS_CNT] &&
844 LastFlatDsCnt <= ScoreUBs[AMDGPU::DS_CNT]) ||
845 (LastFlatLoadCnt > ScoreLBs[AMDGPU::LOAD_CNT] &&
846 LastFlatLoadCnt <= ScoreUBs[AMDGPU::LOAD_CNT]));
847 }
848
849 void setPendingFlat() {
850 LastFlatLoadCnt = ScoreUBs[AMDGPU::LOAD_CNT];
851 LastFlatDsCnt = ScoreUBs[AMDGPU::DS_CNT];
852 }
853
854 bool hasPendingGDS() const {
855 return LastGDS > ScoreLBs[AMDGPU::DS_CNT] &&
856 LastGDS <= ScoreUBs[AMDGPU::DS_CNT];
857 }
858
859 unsigned getPendingGDSWait() const {
860 return std::min(getScoreUB(AMDGPU::DS_CNT) - LastGDS,
861 getWaitCountMax(Context->getLimits(), AMDGPU::DS_CNT) - 1);
862 }
863
864 void setPendingGDS() { LastGDS = ScoreUBs[AMDGPU::DS_CNT]; }
865
866 // Return true if there might be pending writes to the vgpr-interval by VMEM
867 // instructions with types different from V.
868 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
869 for (MCRegUnit RU : regunits(Reg)) {
870 auto It = VMem.find(toVMEMID(RU));
871 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
872 return true;
873 }
874 return false;
875 }
876
877 void clearVgprVmemTypes(MCPhysReg Reg) {
878 for (MCRegUnit RU : regunits(Reg)) {
879 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
880 It->second.VMEMTypes = 0;
881 if (It->second.empty())
882 VMem.erase(It);
883 }
884 }
885 }
886
887 void setStateOnFunctionEntryOrReturn() {
888 setScoreUB(AMDGPU::STORE_CNT,
889 getScoreUB(AMDGPU::STORE_CNT) +
890 getWaitCountMax(Context->getLimits(), AMDGPU::STORE_CNT));
891 PendingEvents |= Context->getWaitEvents(AMDGPU::STORE_CNT);
892 }
893
// Read-only view of the representative LDS DMA store instructions recorded
// in LDSDMAStores.
894 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
895 return LDSDMAStores;
896 }
897
898 bool hasPointSampleAccel(const MachineInstr &MI) const;
899 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
900 MCPhysReg RU) const;
901
// Write a textual description of this bracket state to the given stream.
902 void print(raw_ostream &) const;
// Convenience wrapper: print this state to the debug stream.
903 void dump() const { print(dbgs()); }
904
905 // Free up memory by removing empty entries from the DenseMap that track event
906 // scores.
907 void purgeEmptyTrackingData();
908
909private:
// Per-counter parameters handed to mergeScore() / mergeAsyncMarks().
// NOTE(review): the field descriptions below are inferred from the names
// only; the merge implementation is not part of this excerpt, so confirm them
// against it.
910 struct MergeInfo {
// presumably this state's lower bound before merging
911 unsigned OldLB;
// presumably the other state's lower bound
912 unsigned OtherLB;
// presumably the offset applied to this state's scores
913 unsigned MyShift;
// presumably the offset applied to the other state's scores
914 unsigned OtherShift;
915 };
916
917 using CounterValueArray = std::array<unsigned, AMDGPU::NUM_INST_CNTS>;
918
919 void determineWaitForScore(AMDGPU::InstCounterType T, unsigned Score,
920 AMDGPU::Waitcnt &Wait) const;
921
922 static bool mergeScore(const MergeInfo &M, unsigned &Score,
923 unsigned OtherScore);
924 bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
925 ArrayRef<CounterValueArray> OtherMarks);
926
928 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
929 if (!Context->TRI.isInAllocatableClass(Reg))
930 return {{}, {}};
931 return Context->TRI.regunits(Reg);
932 }
933
934 void setScoreLB(AMDGPU::InstCounterType T, unsigned Val) {
936 ScoreLBs[T] = Val;
937 }
938
939 void setScoreUB(AMDGPU::InstCounterType T, unsigned Val) {
941 ScoreUBs[T] = Val;
942
943 if (T != AMDGPU::EXP_CNT)
944 return;
945
946 if (getScoreRange(AMDGPU::EXP_CNT) >
947 getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT))
948 ScoreLBs[AMDGPU::EXP_CNT] =
949 ScoreUBs[AMDGPU::EXP_CNT] -
950 getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT);
951 }
952
953 void setRegScore(MCPhysReg Reg, AMDGPU::InstCounterType T, unsigned Val) {
954 const SIRegisterInfo &TRI = Context->TRI;
955 if (Reg == AMDGPU::SCC) {
956 SCCScore = Val;
957 } else if (TRI.isVectorRegister(Context->MRI, Reg)) {
958 for (MCRegUnit RU : regunits(Reg))
959 VMem[toVMEMID(RU)].Scores[T] = Val;
960 } else if (TRI.isSGPRReg(Context->MRI, Reg)) {
961 for (MCRegUnit RU : regunits(Reg))
962 SGPRs[RU].get(T) = Val;
963 } else {
964 llvm_unreachable("Register cannot be tracked/unknown register!");
965 }
966 }
967
968 void setVMemScore(VMEMID TID, AMDGPU::InstCounterType T, unsigned Val) {
969 VMem[TID].Scores[T] = Val;
970 }
971
972 void setScoreByOperand(const MachineOperand &Op,
973 AMDGPU::InstCounterType CntTy, unsigned Val);
974
975 const SIInsertWaitcnts *Context;
976
977 unsigned ScoreLBs[AMDGPU::NUM_INST_CNTS] = {0};
978 unsigned ScoreUBs[AMDGPU::NUM_INST_CNTS] = {0};
979 WaitEventSet PendingEvents;
980 // Remember the last flat memory operation.
981 unsigned LastFlatDsCnt = 0;
982 unsigned LastFlatLoadCnt = 0;
983 // Remember the last GDS operation.
984 unsigned LastGDS = 0;
985
986 // The score tracking logic is fragmented as follows:
987 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
988 // - SGPRs: SGPR RegUnits
989 // - SCC: Non-allocatable and not general purpose: not a SGPR.
990 //
991 // For the VMem case, if the key is within the range of LDS DMA IDs,
992 // then the corresponding index into the `LDSDMAStores` vector below is:
993 // Key - LDSDMA_BEGIN - 1
994 // This is because LDSDMA_BEGIN is a generic entry and does not have an
995 // associated MachineInstr.
996 //
997 // TODO: Could we track SCC alongside SGPRs so it's not longer a special case?
998
999 struct VMEMInfo {
1000 // Scores for all instruction counters. Zero-initialized.
1001 CounterValueArray Scores{};
1002 // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
1003 unsigned VMEMTypes = 0;
1004
1005 bool empty() const { return all_of(Scores, equal_to(0)) && !VMEMTypes; }
1006 };
1007
1008 /// Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
1009 /// pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
1010 class SGPRInfo {
1011 /// Either DS_CNT or KM_CNT score.
1012 unsigned ScoreDsKmCnt = 0;
1013 unsigned ScoreXCnt = 0;
1014
1015 public:
1016 unsigned get(AMDGPU::InstCounterType T) const {
1017 assert(
1018 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
1019 "Invalid counter");
1020 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
1021 }
1022 unsigned &get(AMDGPU::InstCounterType T) {
1023 assert(
1024 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
1025 "Invalid counter");
1026 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
1027 }
1028
1029 bool empty() const { return !ScoreDsKmCnt && !ScoreXCnt; }
1030 };
1031
1032 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
1033 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
1034
1035 // Reg score for SCC.
1036 unsigned SCCScore = 0;
1037 // The unique instruction that has an SCC write pending, if there is one.
1038 const MachineInstr *PendingSCCWrite = nullptr;
1039
1040 // Store representative LDS DMA operations. The only useful info here is
1041 // alias info. One store is kept per unique AAInfo.
1042 SmallVector<const MachineInstr *> LDSDMAStores;
1043
1044 // State of all counters at each async mark encountered so far.
1046
1047 // But in the rare pathological case, a nest of loops that pushes marks
1048 // without waiting on any mark can cause AsyncMarks to grow very large. We cap
1049 // it to a reasonable limit. We can tune this later or potentially introduce a
1050 // user option to control the value.
1051 static constexpr unsigned MaxAsyncMarks = 16;
1052
1053 // Track the upper bound score for async operations that are not part of a
1054 // mark yet. Initialized to all zeros.
1055 CounterValueArray AsyncScore{};
1056};
1057
1058SIInsertWaitcnts::BlockInfo::~BlockInfo() = default;
1059
1060class SIInsertWaitcntsLegacy : public MachineFunctionPass {
1061public:
1062 static char ID;
1063 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
1064
1065 bool runOnMachineFunction(MachineFunction &MF) override;
1066
1067 StringRef getPassName() const override {
1068 return "SI insert wait instructions";
1069 }
1070
1071 void getAnalysisUsage(AnalysisUsage &AU) const override {
1072 AU.setPreservesCFG();
1073 AU.addRequired<MachineLoopInfoWrapperPass>();
1074 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
1075 AU.addUsedIfAvailable<AAResultsWrapperPass>();
1076 AU.addPreserved<AAResultsWrapperPass>();
1078 }
1079};
1080
1081} // end anonymous namespace
1082
1083void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
1085 unsigned Score) {
1086 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
1087}
1088
1089// Return true if the subtarget is one that enables Point Sample Acceleration
1090// and the MachineInstr passed in is one to which it might be applied (the
1091// hardware makes this decision based on several factors, but we can't determine
1092// this at compile time, so we have to assume it might be applied if the
1093// instruction supports it).
1094bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
1095 if (!Context->ST.hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
1096 return false;
1097
1098 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
1099 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1101 return BaseInfo->PointSampleAccel;
1102}
1103
1104// Return true if the subtarget enables Point Sample Acceleration, the supplied
1105// MachineInstr is one to which it might be applied and the supplied interval is
1106// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
1107// (this is the type that a point sample accelerated instruction effectively
1108// becomes)
1109bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
1110 MCPhysReg Reg) const {
1111 if (!hasPointSampleAccel(MI))
1112 return false;
1113
1114 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
1115}
1116
// Record that Inst triggered wait event E: look up the counter T that E
// counts against, advance that counter's upper-bound score, mark E as
// pending, and stamp the new score onto whatever the instruction makes
// busy for that counter (registers, or LDS DMA slots). Which operands receive
// the score depends on T; see the per-counter branches below.
1117 void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
1118 AMDGPU::InstCounterType T = Context->getCounterFromEvent(E);
1119 assert(T < Context->MaxCounter);
1120
1121 unsigned UB = getScoreUB(T);
1122 unsigned Increment = 1;
// NOTE(review): the embedded listing number jumps from 1122 to 1124 here, so
// the condition that opens this brace-delimited block appears to be missing
// from this excerpt; restore it from the upstream file before building.
1124 // V_WMMA_SCALE instructions use VOP3PX2 encoding. Hardware treats this as
1125 // two VOP3P instructions and increments VA_VDST twice.
1126 Increment = 2;
1127 }
1128 unsigned CurrScore = UB + Increment;
// The unsigned score wrapped around to zero: treated as fatal.
1129 if (CurrScore == 0)
1130 report_fatal_error("InsertWaitcnt score wraparound");
1131 // PendingEvents and ScoreUB need to be update regardless if this event
1132 // changes the score of a register or not.
1133 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
1134 PendingEvents.insert(E);
1135 setScoreUB(T, CurrScore);
1136
1137 const SIRegisterInfo &TRI = Context->TRI;
1138 const MachineRegisterInfo &MRI = Context->MRI;
1139 const SIInstrInfo &TII = Context->TII;
1140
1141 if (T == AMDGPU::EXP_CNT) {
1142 // Put score on the source vgprs. If this is a store, just use those
1143 // specific register(s).
1144 if (TII.isDS(Inst) && Inst.mayLoadOrStore()) {
1145 // All GDS operations must protect their address register (same as
1146 // export.)
1147 if (const auto *AddrOp = TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
1148 setScoreByOperand(*AddrOp, AMDGPU::EXP_CNT, CurrScore);
1149
1150 if (Inst.mayStore()) {
1151 if (const auto *Data0 =
1152 TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
1153 setScoreByOperand(*Data0, AMDGPU::EXP_CNT, CurrScore);
1154 if (const auto *Data1 =
1155 TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
1156 setScoreByOperand(*Data1, AMDGPU::EXP_CNT, CurrScore);
1157 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
1158 Inst.getOpcode() != AMDGPU::DS_APPEND &&
1159 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
1160 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1161 for (const MachineOperand &Op : Inst.all_uses()) {
1162 if (TRI.isVectorRegister(MRI, Op.getReg()))
1163 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
1164 }
1165 }
1166 } else if (TII.isFLAT(Inst)) {
1167 if (Inst.mayStore()) {
1168 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1169 AMDGPU::EXP_CNT, CurrScore);
1170 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1171 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1172 AMDGPU::EXP_CNT, CurrScore);
1173 }
1174 } else if (TII.isMIMG(Inst)) {
1175 if (Inst.mayStore()) {
1176 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
1177 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1178 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1179 AMDGPU::EXP_CNT, CurrScore);
1180 }
1181 } else if (TII.isMTBUF(Inst)) {
1182 if (Inst.mayStore())
1183 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
1184 } else if (TII.isMUBUF(Inst)) {
1185 if (Inst.mayStore()) {
1186 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
1187 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1188 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1189 AMDGPU::EXP_CNT, CurrScore);
1190 }
1191 } else if (TII.isLDSDIR(Inst)) {
1192 // LDSDIR instructions attach the score to the destination.
1193 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
1194 AMDGPU::EXP_CNT, CurrScore);
1195 } else {
1196 if (TII.isEXP(Inst)) {
1197 // For export the destination registers are really temps that
1198 // can be used as the actual source after export patching, so
1199 // we need to treat them like sources and set the EXP_CNT
1200 // score.
1201 for (MachineOperand &DefMO : Inst.all_defs()) {
1202 if (TRI.isVGPR(MRI, DefMO.getReg())) {
1203 setScoreByOperand(DefMO, AMDGPU::EXP_CNT, CurrScore);
1204 }
1205 }
1206 }
1207 for (const MachineOperand &Op : Inst.all_uses()) {
1208 if (TRI.isVectorRegister(MRI, Op.getReg()))
1209 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
1210 }
1211 }
1212 } else if (T == AMDGPU::X_CNT) {
1213 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1214 if (PendingEvents.contains(OtherEvent)) {
1215 // Hardware inserts an implicit xcnt between interleaved
1216 // SMEM and VMEM operations. So there will never be
1217 // outstanding address translations for both SMEM and
1218 // VMEM at the same time.
1219 setScoreLB(T, getScoreUB(T) - 1);
1220 PendingEvents.remove(OtherEvent);
1221 }
1222 for (const MachineOperand &Op : Inst.all_uses())
1223 setScoreByOperand(Op, T, CurrScore);
1224 } else if (T == AMDGPU::VA_VDST || T == AMDGPU::VM_VSRC) {
1225 // Match the score to the VGPR destination or source registers as
1226 // appropriate
1227 for (const MachineOperand &Op : Inst.operands()) {
// VA_VDST only scores defs and VM_VSRC only scores uses; skip the rest.
1228 if (!Op.isReg() || (T == AMDGPU::VA_VDST && Op.isUse()) ||
1229 (T == AMDGPU::VM_VSRC && Op.isDef()))
1230 continue;
1231 if (TRI.isVectorRegister(Context->MRI, Op.getReg()))
1232 setScoreByOperand(Op, T, CurrScore);
1233 }
1234 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1235 // Match the score to the destination registers.
1236 //
1237 // Check only explicit operands. Stores, especially spill stores, include
1238 // implicit uses and defs of their super registers which would create an
1239 // artificial dependency, while these are there only for register liveness
1240 // accounting purposes.
1241 //
1242 // Special cases where implicit register defs exists, such as M0 or VCC,
1243 // but none with memory instructions.
1244 for (const MachineOperand &Op : Inst.defs()) {
1245 if (T == AMDGPU::LOAD_CNT || T == AMDGPU::SAMPLE_CNT ||
1246 T == AMDGPU::BVH_CNT) {
1247 if (!TRI.isVectorRegister(MRI, Op.getReg())) // TODO: add wrapper
1248 continue;
1249 if (updateVMCntOnly(Inst)) {
1250 // updateVMCntOnly should only leave us with VGPRs
1251 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1252 // defs. That's required for a sane index into `VgprMemTypes` below
1253 assert(TRI.isVectorRegister(MRI, Op.getReg()));
1254 VmemType V = getVmemType(Inst);
1255 unsigned char TypesMask = 1 << V;
1256 // If instruction can have Point Sample Accel applied, we have to flag
1257 // this with another potential dependency
1258 if (hasPointSampleAccel(Inst))
1259 TypesMask |= 1 << VMEM_NOSAMPLER;
1260 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1261 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1262 }
1263 }
1264 setScoreByOperand(Op, T, CurrScore);
1265 }
1266 if (Inst.mayStore() &&
1267 (TII.isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
1268 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1269 // written can be accessed. A load from LDS to VMEM does not need a wait.
1270 //
1271 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1272 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1273 // store. The "Slot" is the index into LDSDMAStores + 1.
1274 unsigned Slot = 0;
// Every path through this loop body ends in `continue` or `break`, so only
// the first LDS store memoperand is ever examined.
1275 for (const auto *MemOp : Inst.memoperands()) {
1276 if (!MemOp->isStore() ||
1277 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1278 continue;
1279 // Comparing just AA info does not guarantee memoperands are equal
1280 // in general, but this is so for LDS DMA in practice.
1281 auto AAI = MemOp->getAAInfo();
1282 // Alias scope information gives a way to definitely identify an
1283 // original memory object and practically produced in the module LDS
1284 // lowering pass. If there is no scope available we will not be able
1285 // to disambiguate LDS aliasing as after the module lowering all LDS
1286 // is squashed into a single big object.
1287 if (!AAI || !AAI.Scope)
1288 break;
// Look for an already recorded store with identical AA info and reuse its
// slot. Note the inner MemOp shadows the outer loop variable.
1289 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1290 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1291 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1292 Slot = I + 1;
1293 break;
1294 }
1295 }
1296 }
1297 if (Slot)
1298 break;
1299 // The slot may not be valid because it can be >= NUM_LDSDMA which
1300 // means the scoreboard cannot track it. We still want to preserve the
1301 // MI in order to check alias information, though.
1302 LDSDMAStores.push_back(&Inst);
1303 Slot = LDSDMAStores.size();
1304 break;
1305 }
// The generic LDSDMA_BEGIN entry is always updated; the specific slot only
// when it is non-zero and below NUM_LDSDMA.
1306 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1307 if (Slot && Slot < NUM_LDSDMA)
1308 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1309 }
1310
1311 if (Context->shouldUpdateAsyncMark(Inst, T)) {
1312 AsyncScore[T] = CurrScore;
1313 }
1314
// NOTE(review): the embedded listing number jumps from 1314 to 1316 here, so
// the condition guarding the SCC update below appears to be missing from this
// excerpt; restore it from the upstream file before building.
1316 setRegScore(AMDGPU::SCC, T, CurrScore);
1317 PendingSCCWrite = &Inst;
1318 }
1319 }
1320 }
1321
1322void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1323 // In the absence of loops, AsyncMarks can grow linearly with the program
1324 // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a
1325 // limit every time we push a new mark, but that seems like unnecessary work
1326 // in practical cases. We do separately truncate the array when processing a
1327 // loop, which should be sufficient.
1328 AsyncMarks.push_back(AsyncScore);
1329 AsyncScore = {};
1330 LLVM_DEBUG({
1331 dbgs() << "recordAsyncMark:\n" << Inst;
1332 for (const auto &Mark : AsyncMarks) {
1333 llvm::interleaveComma(Mark, dbgs());
1334 dbgs() << '\n';
1335 }
1336 });
1337}
1338
1339void WaitcntBrackets::print(raw_ostream &OS) const {
1340 const GCNSubtarget &ST = Context->ST;
1341
1342 for (auto T : inst_counter_types(Context->MaxCounter)) {
1343 unsigned SR = getScoreRange(T);
1344 switch (T) {
1345 case AMDGPU::LOAD_CNT:
1346 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1347 << SR << "):";
1348 break;
1349 case AMDGPU::DS_CNT:
1350 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1351 << SR << "):";
1352 break;
1353 case AMDGPU::EXP_CNT:
1354 OS << " EXP_CNT(" << SR << "):";
1355 break;
1356 case AMDGPU::STORE_CNT:
1357 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1358 << SR << "):";
1359 break;
1360 case AMDGPU::SAMPLE_CNT:
1361 OS << " SAMPLE_CNT(" << SR << "):";
1362 break;
1363 case AMDGPU::BVH_CNT:
1364 OS << " BVH_CNT(" << SR << "):";
1365 break;
1366 case AMDGPU::KM_CNT:
1367 OS << " KM_CNT(" << SR << "):";
1368 break;
1369 case AMDGPU::X_CNT:
1370 OS << " X_CNT(" << SR << "):";
1371 break;
1372 case AMDGPU::ASYNC_CNT:
1373 OS << " ASYNC_CNT(" << SR << "):";
1374 break;
1375 case AMDGPU::VA_VDST:
1376 OS << " VA_VDST(" << SR << "): ";
1377 break;
1378 case AMDGPU::VM_VSRC:
1379 OS << " VM_VSRC(" << SR << "): ";
1380 break;
1381 default:
1382 OS << " UNKNOWN(" << SR << "):";
1383 break;
1384 }
1385
1386 if (SR != 0) {
1387 // Print vgpr scores.
1388 unsigned LB = getScoreLB(T);
1389
1390 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1391 sort(SortedVMEMIDs);
1392
1393 for (auto ID : SortedVMEMIDs) {
1394 unsigned RegScore = VMem.at(ID).Scores[T];
1395 if (RegScore <= LB)
1396 continue;
1397 unsigned RelScore = RegScore - LB - 1;
1398 if (ID < REGUNITS_END) {
1399 OS << ' ' << RelScore << ":vRU" << ID;
1400 } else {
1401 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1402 "Unhandled/unexpected ID value!");
1403 OS << ' ' << RelScore << ":LDSDMA" << ID;
1404 }
1405 }
1406
1407 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1408 if (isSmemCounter(T)) {
1409 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1410 sort(SortedSMEMIDs);
1411 for (auto ID : SortedSMEMIDs) {
1412 unsigned RegScore = SGPRs.at(ID).get(T);
1413 if (RegScore <= LB)
1414 continue;
1415 unsigned RelScore = RegScore - LB - 1;
1416 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1417 }
1418 }
1419
1420 if (T == AMDGPU::KM_CNT && SCCScore > 0)
1421 OS << ' ' << SCCScore << ":scc";
1422 }
1423 OS << '\n';
1424 }
1425
1426 OS << "Pending Events: ";
1427 if (hasPendingEvent()) {
1428 ListSeparator LS;
1429 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1430 if (hasPendingEvent((WaitEventType)I)) {
1431 OS << LS << WaitEventTypeName[I];
1432 }
1433 }
1434 } else {
1435 OS << "none";
1436 }
1437 OS << '\n';
1438
1439 OS << "Async score: ";
1440 if (AsyncScore.empty())
1441 OS << "none";
1442 else
1443 llvm::interleaveComma(AsyncScore, OS);
1444 OS << '\n';
1445
1446 OS << "Async marks: " << AsyncMarks.size() << '\n';
1447
1448 for (const auto &Mark : AsyncMarks) {
1449 for (auto T : AMDGPU::inst_counter_types()) {
1450 unsigned MarkedScore = Mark[T];
1451 switch (T) {
1452 case AMDGPU::LOAD_CNT:
1453 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM")
1454 << "_CNT: " << MarkedScore;
1455 break;
1456 case AMDGPU::DS_CNT:
1457 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM")
1458 << "_CNT: " << MarkedScore;
1459 break;
1460 case AMDGPU::EXP_CNT:
1461 OS << " EXP_CNT: " << MarkedScore;
1462 break;
1463 case AMDGPU::STORE_CNT:
1464 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS")
1465 << "_CNT: " << MarkedScore;
1466 break;
1467 case AMDGPU::SAMPLE_CNT:
1468 OS << " SAMPLE_CNT: " << MarkedScore;
1469 break;
1470 case AMDGPU::BVH_CNT:
1471 OS << " BVH_CNT: " << MarkedScore;
1472 break;
1473 case AMDGPU::KM_CNT:
1474 OS << " KM_CNT: " << MarkedScore;
1475 break;
1476 case AMDGPU::X_CNT:
1477 OS << " X_CNT: " << MarkedScore;
1478 break;
1479 case AMDGPU::ASYNC_CNT:
1480 OS << " ASYNC_CNT: " << MarkedScore;
1481 break;
1482 default:
1483 OS << " UNKNOWN: " << MarkedScore;
1484 break;
1485 }
1486 }
1487 OS << '\n';
1488 }
1489 OS << '\n';
1490}
1491
1492/// Simplify \p UpdateWait by removing waits that are redundant based on the
1493/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1494void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1495 AMDGPU::Waitcnt &UpdateWait) const {
1496 simplifyWaitcnt(UpdateWait, AMDGPU::LOAD_CNT);
1497 simplifyWaitcnt(UpdateWait, AMDGPU::EXP_CNT);
1498 simplifyWaitcnt(UpdateWait, AMDGPU::DS_CNT);
1499 simplifyWaitcnt(UpdateWait, AMDGPU::STORE_CNT);
1500 simplifyWaitcnt(UpdateWait, AMDGPU::SAMPLE_CNT);
1501 simplifyWaitcnt(UpdateWait, AMDGPU::BVH_CNT);
1502 simplifyWaitcnt(UpdateWait, AMDGPU::KM_CNT);
1503 simplifyXcnt(CheckWait, UpdateWait);
1504 simplifyWaitcnt(UpdateWait, AMDGPU::VA_VDST);
1505 simplifyVmVsrc(CheckWait, UpdateWait);
1506 simplifyWaitcnt(UpdateWait, AMDGPU::ASYNC_CNT);
1507}
1508
1509void WaitcntBrackets::simplifyWaitcnt(AMDGPU::InstCounterType T,
1510 unsigned &Count) const {
1511 // The number of outstanding events for this type, T, can be calculated
1512 // as (UB - LB). If the current Count is greater than or equal to the number
1513 // of outstanding events, then the wait for this counter is redundant.
1514 if (Count >= getScoreRange(T))
1515 Count = ~0u;
1516}
1517
1518void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait,
1519 AMDGPU::InstCounterType T) const {
1520 unsigned Cnt = Wait.get(T);
1521 simplifyWaitcnt(T, Cnt);
1522 Wait.set(T, Cnt);
1523}
1524
1525void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1526 AMDGPU::Waitcnt &UpdateWait) const {
1527 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1528 // optimizations. On entry to a block with multiple predescessors, there may
1529 // be pending SMEM and VMEM events active at the same time.
1530 // In such cases, only clear one active event at a time.
1531 // TODO: Revisit xcnt optimizations for gfx1250.
1532 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1533 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1534 // zero.
1535 if (CheckWait.get(AMDGPU::KM_CNT) == 0 && hasPendingEvent(SMEM_GROUP))
1536 UpdateWait.set(AMDGPU::X_CNT, ~0u);
1537 // If we have pending store we cannot optimize XCnt because we do not wait for
1538 // stores. VMEM loads retun in order, so if we only have loads XCnt is
1539 // decremented to the same number as LOADCnt.
1540 if (CheckWait.get(AMDGPU::LOAD_CNT) != ~0u && hasPendingEvent(VMEM_GROUP) &&
1541 !hasPendingEvent(AMDGPU::STORE_CNT) &&
1542 CheckWait.get(AMDGPU::X_CNT) >= CheckWait.get(AMDGPU::LOAD_CNT))
1543 UpdateWait.set(AMDGPU::X_CNT, ~0u);
1544 simplifyWaitcnt(UpdateWait, AMDGPU::X_CNT);
1545}
1546
1547void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1548 AMDGPU::Waitcnt &UpdateWait) const {
1549 // Waiting for some counters implies waiting for VM_VSRC, since an
1550 // instruction that decrements a counter on completion would have
1551 // decremented VM_VSRC once its VGPR operands had been read.
1552 if (CheckWait.get(AMDGPU::VM_VSRC) >=
1553 std::min({CheckWait.get(AMDGPU::LOAD_CNT),
1554 CheckWait.get(AMDGPU::STORE_CNT),
1555 CheckWait.get(AMDGPU::SAMPLE_CNT),
1556 CheckWait.get(AMDGPU::BVH_CNT), CheckWait.get(AMDGPU::DS_CNT)}))
1557 UpdateWait.set(AMDGPU::VM_VSRC, ~0u);
1558 simplifyWaitcnt(UpdateWait, AMDGPU::VM_VSRC);
1559}
1560
1561void WaitcntBrackets::purgeEmptyTrackingData() {
1562 VMem.remove_if([](const auto &P) { return P.second.empty(); });
1563 SGPRs.remove_if([](const auto &P) { return P.second.empty(); });
1564}
1565
1566void WaitcntBrackets::determineWaitForScore(AMDGPU::InstCounterType T,
1567 unsigned ScoreToWait,
1568 AMDGPU::Waitcnt &Wait) const {
1569 const unsigned LB = getScoreLB(T);
1570 const unsigned UB = getScoreUB(T);
1571
1572 // If the score falls within the bracket, we need a waitcnt.
1573 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1574 if ((T == AMDGPU::LOAD_CNT || T == AMDGPU::DS_CNT) && hasPendingFlat() &&
1575 !Context->ST.hasFlatLgkmVMemCountInOrder()) {
1576 // If there is a pending FLAT operation, and this is a VMem or LGKM
1577 // waitcnt and the target can report early completion, then we need
1578 // to force a waitcnt 0.
1579 addWait(Wait, T, 0);
1580 } else if (counterOutOfOrder(T)) {
1581 // Counter can get decremented out-of-order when there
1582 // are multiple types event in the bracket. Also emit an s_wait counter
1583 // with a conservative value of 0 for the counter.
1584 addWait(Wait, T, 0);
1585 } else {
1586 // If a counter has been maxed out avoid overflow by waiting for
1587 // MAX(CounterType) - 1 instead.
1588 unsigned NeededWait = std::min(
1589 UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
1590 addWait(Wait, T, NeededWait);
1591 }
1592 }
1593}
1594
1595AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
1596 LLVM_DEBUG({
1597 dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
1598 << ":\n";
1599 for (const auto &Mark : AsyncMarks) {
1600 llvm::interleaveComma(Mark, dbgs());
1601 dbgs() << '\n';
1602 }
1603 });
1604
1605 if (AsyncMarks.size() == MaxAsyncMarks) {
1606 // Enforcing MaxAsyncMarks here is unnecessary work because the size of
1607 // MaxAsyncMarks is linear when traversing straightline code. But we do
1608 // need to check if truncation may have occured at a merge, and adjust N
1609 // to ensure that a wait is generated.
1610 LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
1611 N = std::min(N, (unsigned)MaxAsyncMarks - 1);
1612 }
1613
1614 AMDGPU::Waitcnt Wait;
1615 if (AsyncMarks.size() <= N) {
1616 LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
1617 return Wait;
1618 }
1619
1620 size_t MarkIndex = AsyncMarks.size() - N - 1;
1621 const auto &RequiredMark = AsyncMarks[MarkIndex];
1623 determineWaitForScore(T, RequiredMark[T], Wait);
1624
1625 // Immediately remove the waited mark and all older ones
1626 // This happens BEFORE the wait is actually inserted, which is fine
1627 // because we've already extracted the wait requirements
1628 LLVM_DEBUG({
1629 dbgs() << "Removing " << (MarkIndex + 1)
1630 << " async marks after determining wait\n";
1631 });
1632 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1633
1634 LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
1635 return Wait;
1636}
1637
1638// With D16Write32BitVgpr, D16 inst might be clobbered by events running on the
1639// other half 16bit.
1640//
1641// Replace VGPR16 to VGPR32 for wait check if:
1642// 1. MI is a VALU, and there is a wait event on the other half
1643// 2. MI is a LdSt, and there is a wait event on the other half from different
1644// order group
1645MCPhysReg WaitcntBrackets::determineVGPR16Dependency(const MachineInstr &MI,
1647 MCPhysReg Reg) const {
1648 const TargetRegisterClass *RC = Context->TRI.getPhysRegBaseClass(Reg);
1649 unsigned Size = Context->TRI.getRegSizeInBits(*RC);
1650
1651 if (Size != 16 || !Context->ST.hasD16Writes32BitVgpr())
1652 return Reg;
1653
1654 // With D16Writes32BitVgpr, D16 Inst might clobber the whole vgpr32
1655 // check dependency on the other half
1656 Register Reg32 = Context->TRI.get32BitRegister(Reg);
1657 Register OtherHalf = Context->TRI.getSubReg(
1658 Reg32,
1659 AMDGPU::isHi16Reg(Reg, Context->TRI) ? AMDGPU::lo16 : AMDGPU::hi16);
1660
1661 AMDGPU::Waitcnt Wait;
1662 for (MCRegUnit RU : regunits(OtherHalf))
1663 determineWaitForScore(T, getVMemScore(toVMEMID(RU), T), Wait);
1664
1665 // No wait on otherhalf
1666 if (!Wait.hasWait())
1667 return Reg;
1668
1669 if (Context->TII.isVALU(MI))
1670 return Reg32;
1671
1672 // If hi/lo16 mixed events
1673 WaitEventSet MIEvents = Context->getEventsFor(MI);
1674 WaitEventSet OtherHalfEvents = Context->getWaitEvents(T);
1675 WaitEventSet Events = MIEvents & OtherHalfEvents;
1676 if (Events.twoOrMore())
1677 return Reg32;
1678 return Reg;
1679}
1680
1681void WaitcntBrackets::determineWaitForPhysReg(AMDGPU::InstCounterType T,
1682 MCPhysReg Reg,
1683 AMDGPU::Waitcnt &Wait,
1684 const MachineInstr &MI) const {
1685 if (Reg == AMDGPU::SCC) {
1686 determineWaitForScore(T, SCCScore, Wait);
1687 } else {
1688 bool IsVGPR = Context->TRI.isVectorRegister(Context->MRI, Reg);
1689 if (IsVGPR)
1690 Reg = determineVGPR16Dependency(MI, T, Reg);
1691 for (MCRegUnit RU : regunits(Reg))
1692 determineWaitForScore(
1693 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1694 Wait);
1695 }
1696}
1697
1698void WaitcntBrackets::determineWaitForLDSDMA(AMDGPU::InstCounterType T,
1699 VMEMID TID,
1700 AMDGPU::Waitcnt &Wait) const {
1701 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1702 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1703}
1704
1705void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1706 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1707 // SCC has landed
1708 if (PendingSCCWrite &&
1709 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1710 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1711 WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
1712 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1713 if ((PendingEvents & Context->getWaitEvents(AMDGPU::KM_CNT)) ==
1714 SCC_WRITE_PendingEvent) {
1715 setScoreLB(AMDGPU::KM_CNT, getScoreUB(AMDGPU::KM_CNT));
1716 }
1717
1718 PendingEvents.remove(SCC_WRITE_PendingEvent);
1719 PendingSCCWrite = nullptr;
1720 }
1721}
1722
1723void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1725 applyWaitcnt(Wait, T);
1726}
1727
1728void WaitcntBrackets::applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count) {
1729 const unsigned UB = getScoreUB(T);
1730 if (Count >= UB)
1731 return;
1732 if (Count != 0) {
1733 if (counterOutOfOrder(T))
1734 return;
1735 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1736 } else {
1737 setScoreLB(T, UB);
1738 PendingEvents.remove(Context->getWaitEvents(T));
1739 }
1740
1741 if (T == AMDGPU::KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1742 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1743 applyWaitcnt(AMDGPU::X_CNT, 0);
1744 else
1745 PendingEvents.remove(SMEM_GROUP);
1746 }
1747 if (T == AMDGPU::LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1748 !hasPendingEvent(AMDGPU::STORE_CNT)) {
1749 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1750 applyWaitcnt(AMDGPU::X_CNT, Count);
1751 else if (Count == 0)
1752 PendingEvents.remove(VMEM_GROUP);
1753 }
1754}
1755
1756void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait,
1758 unsigned Cnt = Wait.get(T);
1759 applyWaitcnt(T, Cnt);
1760}
1761
1762// Where there are multiple types of event in the bracket of a counter,
1763// the decrement may go out of order.
1764bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
1765 // Scalar memory read always can go out of order.
1766 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1767 (T == AMDGPU::X_CNT && hasPendingEvent(SMEM_GROUP)))
1768 return true;
1769
1770 // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
1771 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
1772 // out-of-order completion.
1773 if (T == AMDGPU::LOAD_CNT) {
1774 WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
1775 // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
1776 // events
1777 Events.remove(GLOBAL_INV_ACCESS);
1778 // Return true only if there are still multiple event types after removing
1779 // GLOBAL_INV
1780 return Events.twoOrMore();
1781 }
1782
1783 return hasMixedPendingEvents(T);
1784}
1785
// Legacy pass-manager registration for SIInsertWaitcntsLegacy under the
// DEBUG_TYPE name, with the description "SI Insert Waitcnts".
// NOTE(review): the lines between the BEGIN and END macros and the END
// macro's trailing arguments are not visible in this listing.
1786INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1787 false, false)
1790INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1792
// Storage for the pass ID.
1793char SIInsertWaitcntsLegacy::ID = 0;
1794
// Exported reference to the pass ID.
1795char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1796
// Factory body: returns a newly allocated legacy pass object.
// NOTE(review): the function signature line is not visible in this listing.
1798 return new SIInsertWaitcntsLegacy();
1799}
1800
1801static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1802 unsigned NewEnc) {
1803 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1804 assert(OpIdx >= 0);
1805
1806 MachineOperand &MO = MI.getOperand(OpIdx);
1807
1808 if (NewEnc == MO.getImm())
1809 return false;
1810
1811 MO.setImm(NewEnc);
1812 return true;
1813}
1814
1815/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1816/// and if so, which counter it is waiting on.
1817static std::optional<AMDGPU::InstCounterType>
1818counterTypeForInstr(unsigned Opcode) {
1819 switch (Opcode) {
1820 case AMDGPU::S_WAIT_LOADCNT:
1821 return AMDGPU::LOAD_CNT;
1822 case AMDGPU::S_WAIT_EXPCNT:
1823 return AMDGPU::EXP_CNT;
1824 case AMDGPU::S_WAIT_STORECNT:
1825 return AMDGPU::STORE_CNT;
1826 case AMDGPU::S_WAIT_SAMPLECNT:
1827 return AMDGPU::SAMPLE_CNT;
1828 case AMDGPU::S_WAIT_BVHCNT:
1829 return AMDGPU::BVH_CNT;
1830 case AMDGPU::S_WAIT_DSCNT:
1831 return AMDGPU::DS_CNT;
1832 case AMDGPU::S_WAIT_KMCNT:
1833 return AMDGPU::KM_CNT;
1834 case AMDGPU::S_WAIT_XCNT:
1835 return AMDGPU::X_CNT;
1836 case AMDGPU::S_WAIT_ASYNCCNT:
1837 return AMDGPU::ASYNC_CNT;
1838 default:
1839 return {};
1840 }
1841}
1842
1843bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1844 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1845 if (Opcode == Waitcnt->getOpcode())
1846 return false;
1847
1848 Waitcnt->setDesc(TII.get(Opcode));
1849 return true;
1850}
1851
1852/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1853/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1854/// from \p Wait that were added by previous passes. Currently this pass
1855/// conservatively assumes that these preexisting waits are required for
1856/// correctness.
1857bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1858 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1859 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1860 assert(isNormalMode(MaxCounter));
1861
1862 bool Modified = false;
1863 MachineInstr *WaitcntInstr = nullptr;
1864 MachineInstr *WaitcntVsCntInstr = nullptr;
1865
1866 LLVM_DEBUG({
1867 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1868 if (It.isEnd())
1869 dbgs() << "end of block\n";
1870 else
1871 dbgs() << *It;
1872 });
1873
1874 for (auto &II :
1875 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1876 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1877 if (isNonWaitcntMetaInst(II)) {
1878 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1879 continue;
1880 }
1881
1882 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1883 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1884
1885 // Update required wait count. If this is a soft waitcnt (= it was added
1886 // by an earlier pass), it may be entirely removed.
1887 if (Opcode == AMDGPU::S_WAITCNT) {
1888 unsigned IEnc = II.getOperand(0).getImm();
1889 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1890 if (TrySimplify)
1891 ScoreBrackets.simplifyWaitcnt(OldWait);
1892 Wait = Wait.combined(OldWait);
1893
1894 // Merge consecutive waitcnt of the same type by erasing multiples.
1895 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1896 II.eraseFromParent();
1897 Modified = true;
1898 } else
1899 WaitcntInstr = &II;
1900 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1901 assert(ST.hasVMemToLDSLoad());
1902 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1903 << "Before: " << Wait << '\n';);
1904 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, LDSDMA_BEGIN,
1905 Wait);
1906 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1907
1908 // It is possible (but unlikely) that this is the only wait instruction,
1909 // in which case, we exit this loop without a WaitcntInstr to consume
1910 // `Wait`. But that works because `Wait` was passed in by reference, and
1911 // the callee eventually calls createNewWaitcnt on it. We test this
1912 // possibility in an articial MIR test since such a situation cannot be
1913 // recreated by running the memory legalizer.
1914 II.eraseFromParent();
1915 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1916 unsigned N = II.getOperand(0).getImm();
1917 LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
1918 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1919 Wait = Wait.combined(OldWait);
1920 } else {
1921 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1922 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1923
1924 unsigned OldVSCnt =
1925 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1926 if (TrySimplify)
1927 ScoreBrackets.simplifyWaitcnt(AMDGPU::STORE_CNT, OldVSCnt);
1929 std::min(Wait.get(AMDGPU::STORE_CNT), OldVSCnt));
1930
1931 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1932 II.eraseFromParent();
1933 Modified = true;
1934 } else
1935 WaitcntVsCntInstr = &II;
1936 }
1937 }
1938
1939 if (WaitcntInstr) {
1940 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1942 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1943
1944 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::LOAD_CNT);
1945 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
1946 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
1947 Wait.set(AMDGPU::LOAD_CNT, ~0u);
1948 Wait.set(AMDGPU::EXP_CNT, ~0u);
1949 Wait.set(AMDGPU::DS_CNT, ~0u);
1950
1951 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1952 << "New Instr at block end: "
1953 << *WaitcntInstr << '\n'
1954 : dbgs() << "applied pre-existing waitcnt\n"
1955 << "Old Instr: " << *It
1956 << "New Instr: " << *WaitcntInstr << '\n');
1957 }
1958
1959 if (WaitcntVsCntInstr) {
1960 Modified |=
1961 updateOperandIfDifferent(*WaitcntVsCntInstr, AMDGPU::OpName::simm16,
1962 Wait.get(AMDGPU::STORE_CNT));
1963 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1964
1965 ScoreBrackets.applyWaitcnt(AMDGPU::STORE_CNT, Wait.get(AMDGPU::STORE_CNT));
1966 Wait.set(AMDGPU::STORE_CNT, ~0u);
1967
1968 LLVM_DEBUG(It.isEnd()
1969 ? dbgs() << "applied pre-existing waitcnt\n"
1970 << "New Instr at block end: " << *WaitcntVsCntInstr
1971 << '\n'
1972 : dbgs() << "applied pre-existing waitcnt\n"
1973 << "Old Instr: " << *It
1974 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1975 }
1976
1977 return Modified;
1978}
1979
1980/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1981/// required counters in \p Wait
1982bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1983 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1984 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
1985 assert(isNormalMode(MaxCounter));
1986
1987 bool Modified = false;
1988 const DebugLoc &DL = Block.findDebugLoc(It);
1989
1990 // Helper to emit expanded waitcnt sequence for profiling.
1991 // Emits waitcnts from (Outstanding-1) down to Target.
1992 // The EmitWaitcnt callback emits a single waitcnt.
1993 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
1994 auto EmitWaitcnt) {
1995 do {
1996 EmitWaitcnt(--Outstanding);
1997 } while (Outstanding > Target);
1998 Modified = true;
1999 };
2000
2001 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
2002 // single instruction while VScnt has its own instruction.
2003 if (Wait.hasWaitExceptStoreCnt()) {
2004 // If profiling expansion is enabled, emit an expanded sequence
2005 if (ExpandWaitcntProfiling) {
2006 // Check if any of the counters to be waited on are out-of-order.
2007 // If so, fall back to normal (non-expanded) behavior since expansion
2008 // would provide misleading profiling information.
2009 bool AnyOutOfOrder = false;
2010 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
2011 unsigned WaitCnt = Wait.get(CT);
2012 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
2013 AnyOutOfOrder = true;
2014 break;
2015 }
2016 }
2017
2018 if (AnyOutOfOrder) {
2019 // Fall back to non-expanded wait
2020 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
2021 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
2022 Modified = true;
2023 } else {
2024 // All counters are in-order, safe to expand
2025 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
2026 unsigned WaitCnt = Wait.get(CT);
2027 if (WaitCnt == ~0u)
2028 continue;
2029
2030 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2031 getWaitCountMax(getLimits(), CT) - 1);
2032 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
2033 AMDGPU::Waitcnt W;
2034 W.set(CT, Count);
2035 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT))
2037 });
2038 }
2039 }
2040 } else {
2041 // Normal behavior: emit single combined waitcnt
2042 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
2043 [[maybe_unused]] auto SWaitInst =
2044 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
2045 Modified = true;
2046
2047 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
2048 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2049 dbgs() << "New Instr: " << *SWaitInst << '\n');
2050 }
2051 }
2052
2053 if (Wait.hasWaitStoreCnt()) {
2054 assert(ST.hasVscnt());
2055
2056 if (ExpandWaitcntProfiling && Wait.get(AMDGPU::STORE_CNT) != ~0u &&
2057 !ScoreBrackets.counterOutOfOrder(AMDGPU::STORE_CNT)) {
2058 // Only expand if counter is not out-of-order
2059 unsigned Outstanding =
2060 std::min(ScoreBrackets.getOutstanding(AMDGPU::STORE_CNT),
2061 getWaitCountMax(getLimits(), AMDGPU::STORE_CNT) - 1);
2062 EmitExpandedWaitcnt(
2063 Outstanding, Wait.get(AMDGPU::STORE_CNT), [&](unsigned Count) {
2064 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
2065 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2066 .addImm(Count);
2067 });
2068 } else {
2069 [[maybe_unused]] auto SWaitInst =
2070 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
2071 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2073 Modified = true;
2074
2075 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
2076 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2077 dbgs() << "New Instr: " << *SWaitInst << '\n');
2078 }
2079 }
2080
2081 return Modified;
2082}
2083
2084AMDGPU::Waitcnt
2085WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
2086 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u);
2087}
2088
2089AMDGPU::Waitcnt
2090WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
2091 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
2092 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
2093 ~0u /* XCNT */, ~0u /* ASYNC_CNT */, ExpertVal,
2094 ExpertVal);
2095}
2096
2097/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
2098/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
2099/// were added by previous passes. Currently this pass conservatively
2100/// assumes that these preexisting waits are required for correctness.
///
/// For the combined and single-counter wait instructions, counts that may be
/// simplified are folded into \p Wait, while the others are accumulated
/// separately and merged back in after simplification. The first instruction
/// of each kind is remembered; later duplicates are erased. Each counter
/// covered by an instruction that is kept is applied to \p ScoreBrackets and
/// cleared from \p Wait.
///
/// \returns true if any instruction was erased, re-encoded or promoted from
/// its soft form.
2101bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
2102 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
2103 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
2104 assert(!isNormalMode(MaxCounter));
2105
2106 bool Modified = false;
  // First instruction of each kind seen in the scanned range.
2107 MachineInstr *CombinedLoadDsCntInstr = nullptr;
2108 MachineInstr *CombinedStoreDsCntInstr = nullptr;
2109 MachineInstr *WaitcntDepctrInstr = nullptr;
  // First single-counter wait seen for each counter, indexed by counter type.
2110 MachineInstr *WaitInstrs[AMDGPU::NUM_EXTENDED_INST_CNTS] = {};
2111
2112 LLVM_DEBUG({
2113 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
2114 if (It.isEnd())
2115 dbgs() << "end of block\n";
2116 else
2117 dbgs() << *It;
2118 });
2119
2120 // Accumulate waits that should not be simplified.
2121 AMDGPU::Waitcnt RequiredWait;
2122
  // Instructions may be erased while iterating, hence the early-inc range.
2123 for (auto &II :
2124 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
2125 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
2126 if (isNonWaitcntMetaInst(II)) {
2127 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
2128 continue;
2129 }
2130
2131 // Update required wait count. If this is a soft waitcnt (= it was added
2132 // by an earlier pass), it may be entirely removed.
2133
  // A differing non-soft opcode means II is a soft wait; simplification is
  // attempted for those unless OptNone is set.
2134 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
2135 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
2136
2137 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
2138 // attempt to do more than that either.
2139 if (Opcode == AMDGPU::S_WAITCNT)
2140 continue;
2141
2142 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
2143 unsigned OldEnc =
2144 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2145 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
2146 if (TrySimplify)
2147 Wait = Wait.combined(OldWait);
2148 else
2149 RequiredWait = RequiredWait.combined(OldWait);
2150 // Keep the first wait_loadcnt, erase the rest.
2151 if (CombinedLoadDsCntInstr == nullptr) {
2152 CombinedLoadDsCntInstr = &II;
2153 } else {
2154 II.eraseFromParent();
2155 Modified = true;
2156 }
2157 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
2158 unsigned OldEnc =
2159 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2160 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
2161 if (TrySimplify)
2162 Wait = Wait.combined(OldWait);
2163 else
2164 RequiredWait = RequiredWait.combined(OldWait);
2165 // Keep the first wait_storecnt, erase the rest.
2166 if (CombinedStoreDsCntInstr == nullptr) {
2167 CombinedStoreDsCntInstr = &II;
2168 } else {
2169 II.eraseFromParent();
2170 Modified = true;
2171 }
2172 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
2173 unsigned OldEnc =
2174 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2175 AMDGPU::Waitcnt OldWait;
  // NOTE(review): the statements that populate OldWait from OldEnc are not
  // visible in this listing -- confirm against the full source.
  // Unlike the branches above, this wait is always merged into Wait and
  // never into RequiredWait.
2178 if (TrySimplify)
2179 ScoreBrackets.simplifyWaitcnt(OldWait);
2180 Wait = Wait.combined(OldWait);
2181 if (WaitcntDepctrInstr == nullptr) {
2182 WaitcntDepctrInstr = &II;
2183 } else {
2184 // S_WAITCNT_DEPCTR requires special care. Don't remove a
2185 // duplicate if it is waiting on things other than VA_VDST or
2186 // VM_VSRC. If that is the case, just make sure the VA_VDST and
2187 // VM_VSRC subfields of the operand are set to the "no wait"
2188 // values.
2189
2190 unsigned Enc =
2191 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2192 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
2193 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
2194
2195 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2196 Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
2197 Modified |= promoteSoftWaitCnt(&II);
2198 } else {
2199 II.eraseFromParent();
2200 Modified = true;
2201 }
2202 }
2203 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
2204 // Architectures higher than GFX10 do not have direct loads to
2205 // LDS, so no work required here yet.
2206 II.eraseFromParent();
2207 Modified = true;
2208 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
2209 // Update the Waitcnt, but don't erase the wait.asyncmark() itself. It
2210 // shows up in the assembly as a comment with the original parameter N.
2211 unsigned N = II.getOperand(0).getImm();
2212 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
2213 Wait = Wait.combined(OldWait);
2214 } else {
  // Anything else must be one of the single-counter S_WAIT_*CNT forms.
2215 std::optional<AMDGPU::InstCounterType> CT = counterTypeForInstr(Opcode);
2216 assert(CT.has_value());
2217 unsigned OldCnt =
2218 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2219 if (TrySimplify)
2220 addWait(Wait, CT.value(), OldCnt);
2221 else
2222 addWait(RequiredWait, CT.value(), OldCnt);
2223 // Keep the first wait of its kind, erase the rest.
2224 if (WaitInstrs[CT.value()] == nullptr) {
2225 WaitInstrs[CT.value()] = &II;
2226 } else {
2227 II.eraseFromParent();
2228 Modified = true;
2229 }
2230 }
2231 }
2232
  // Simplify Wait, passing the combination with RequiredWait as the first
  // argument, then merge RequiredWait back in so it is not dropped.
  // NOTE(review): the exact contract of this two-argument simplifyWaitcnt
  // overload is defined outside this listing -- confirm.
2233 ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
2234 Wait = Wait.combined(RequiredWait);
2235
2236 if (CombinedLoadDsCntInstr) {
2237 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
2238 // to be waited for. Otherwise, let the instruction be deleted so
2239 // the appropriate single counter wait instruction can be inserted
2240 // instead, when new S_WAIT_*CNT instructions are inserted by
2241 // createNewWaitcnt(). As a side effect, resetting the wait counts will
2242 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
2243 // the loop below that deals with single counter instructions.
2244 //
2245 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
2246 // instructions that have decremented LOAD_CNT or DS_CNT on completion
2247 // will have needed to wait for their register sources to be available
2248 // first.
2249 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
2250 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2251 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
2252 AMDGPU::OpName::simm16, NewEnc);
2253 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
2254 ScoreBrackets.applyWaitcnt(AMDGPU::LOAD_CNT, Wait.get(AMDGPU::LOAD_CNT));
2255 ScoreBrackets.applyWaitcnt(AMDGPU::DS_CNT, Wait.get(AMDGPU::DS_CNT));
2256 Wait.set(AMDGPU::LOAD_CNT, ~0u);
2257 Wait.set(AMDGPU::DS_CNT, ~0u);
2258
2259 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2260 << "New Instr at block end: "
2261 << *CombinedLoadDsCntInstr << '\n'
2262 : dbgs() << "applied pre-existing waitcnt\n"
2263 << "Old Instr: " << *It << "New Instr: "
2264 << *CombinedLoadDsCntInstr << '\n');
2265 } else {
2266 CombinedLoadDsCntInstr->eraseFromParent();
2267 Modified = true;
2268 }
2269 }
2270
2271 if (CombinedStoreDsCntInstr) {
2272 // Similarly for S_WAIT_STORECNT_DSCNT.
2273 if (Wait.get(AMDGPU::STORE_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
2274 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2275 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
2276 AMDGPU::OpName::simm16, NewEnc);
2277 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2278 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::STORE_CNT);
2279 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
2280 Wait.set(AMDGPU::STORE_CNT, ~0u);
2281 Wait.set(AMDGPU::DS_CNT, ~0u);
2282
2283 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2284 << "New Instr at block end: "
2285 << *CombinedStoreDsCntInstr << '\n'
2286 : dbgs() << "applied pre-existing waitcnt\n"
2287 << "Old Instr: " << *It << "New Instr: "
2288 << *CombinedStoreDsCntInstr << '\n');
2289 } else {
2290 CombinedStoreDsCntInstr->eraseFromParent();
2291 Modified = true;
2292 }
2293 }
2294
2295 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
2296 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
2297 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
2298 // instructions so that createNewWaitcnt() will create new combined
2299 // instructions to replace them.
2300
2301 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2302 // This is a vector of addresses in WaitInstrs pointing to instructions
2303 // that should be removed if they are present.
2305
2306 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
2307 // both) need to be waited for, ensure that there are no existing
2308 // individual wait count instructions for these.
2309
2310 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2311 WaitsToErase.push_back(&WaitInstrs[AMDGPU::LOAD_CNT]);
2312 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2313 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2314 WaitsToErase.push_back(&WaitInstrs[AMDGPU::STORE_CNT]);
2315 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2316 }
2317
  // Null out each erased slot so the per-counter pass below skips it.
2318 for (MachineInstr **WI : WaitsToErase) {
2319 if (!*WI)
2320 continue;
2321
2322 (*WI)->eraseFromParent();
2323 *WI = nullptr;
2324 Modified = true;
2325 }
2326 }
2327
  // Per-counter pass: re-encode each remaining single-counter wait that is
  // still needed and erase the ones that are not.
  // NOTE(review): the loop header that introduces CT is not visible in this
  // listing -- confirm against the full source.
2329 if (!WaitInstrs[CT])
2330 continue;
2331
2332 unsigned NewCnt = Wait.get(CT);
2333 if (NewCnt != ~0u) {
2334 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
2335 AMDGPU::OpName::simm16, NewCnt);
2336 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2337
2338 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2339 setNoWait(Wait, CT);
2340
2341 LLVM_DEBUG(It.isEnd()
2342 ? dbgs() << "applied pre-existing waitcnt\n"
2343 << "New Instr at block end: " << *WaitInstrs[CT]
2344 << '\n'
2345 : dbgs() << "applied pre-existing waitcnt\n"
2346 << "Old Instr: " << *It
2347 << "New Instr: " << *WaitInstrs[CT] << '\n');
2348 } else {
2349 WaitInstrs[CT]->eraseFromParent();
2350 Modified = true;
2351 }
2352 }
2353
2354 if (WaitcntDepctrInstr) {
2355 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2356 // subfields with the new required values.
2357 unsigned Enc =
2358 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2359 ->getImm();
2362
2363 ScoreBrackets.applyWaitcnt(AMDGPU::VA_VDST, Wait.get(AMDGPU::VA_VDST));
2364 ScoreBrackets.applyWaitcnt(AMDGPU::VM_VSRC, Wait.get(AMDGPU::VM_VSRC));
2365 Wait.set(AMDGPU::VA_VDST, ~0u);
2366 Wait.set(AMDGPU::VM_VSRC, ~0u);
2367
2368 // If that new encoded Depctr immediate would actually still wait
2369 // for anything, update the instruction's operand. Otherwise it can
2370 // just be deleted.
2371 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2372 Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
2373 AMDGPU::OpName::simm16, Enc);
2374 LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2375 << "New Instr at block end: "
2376 << *WaitcntDepctrInstr << '\n'
2377 : dbgs() << "applyPreexistingWaitcnt\n"
2378 << "Old Instr: " << *It << "New Instr: "
2379 << *WaitcntDepctrInstr << '\n');
2380 } else {
2381 WaitcntDepctrInstr->eraseFromParent();
2382 Modified = true;
2383 }
2384 }
2385
2386 return Modified;
2387}
2388
2389/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
///
/// New instructions are inserted into \p Block before \p It. With profiling
/// expansion enabled, each in-order counter is expanded into a descending
/// sequence of waits and the function returns early, skipping the combined
/// instruction and S_WAITCNT_DEPCTR handling further down.
///
/// \returns true if any instruction was inserted.
2390bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2391 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2392 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
2393 assert(!isNormalMode(MaxCounter));
2394
2395 bool Modified = false;
2396 const DebugLoc &DL = Block.findDebugLoc(It);
2397
2398 // Helper to emit expanded waitcnt sequence for profiling.
  // The loop emits (Outstanding-1) down to (Target+1); Target itself is
  // always emitted last. The I != ~0u test stops the loop when Outstanding
  // was 0 and the subtraction wrapped around.
2399 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
2400 auto EmitWaitcnt) {
2401 for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I)
2402 EmitWaitcnt(I);
2403 EmitWaitcnt(Target);
2404 Modified = true;
2405 };
2406
2407 // For GFX12+, we use separate wait instructions, which makes expansion
2408 // simpler
2409 if (ExpandWaitcntProfiling) {
  // NOTE(review): the loop header that introduces CT is not visible in this
  // listing -- confirm against the full source.
2411 unsigned Count = Wait.get(CT);
2412 if (Count == ~0u)
2413 continue;
2414
2415 // Skip expansion for out-of-order counters - emit normal wait instead
2416 if (ScoreBrackets.counterOutOfOrder(CT)) {
2417 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2418 .addImm(Count);
2419 Modified = true;
2420 continue;
2421 }
2422
  // The starting point is capped at one below the counter's maximum wait
  // value.
2423 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2424 getWaitCountMax(getLimits(), CT) - 1);
2425 EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
2426 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2427 .addImm(Val);
2428 });
2429 }
2430 return Modified;
2431 }
2432
2433 // Normal behavior (no expansion)
2434 // Check for opportunities to use combined wait instructions.
2435 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2436 MachineInstr *SWaitInst = nullptr;
2437
  // LOAD_CNT takes priority: when it is set, STORE_CNT is left for the
  // single-counter loop below.
2438 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2439 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2440
2441 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2442 .addImm(Enc);
2443
2444 Wait.set(AMDGPU::LOAD_CNT, ~0u);
2445 Wait.set(AMDGPU::DS_CNT, ~0u);
2446 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2447 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2448
2449 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_STORECNT_DSCNT))
2450 .addImm(Enc);
2451
2452 Wait.set(AMDGPU::STORE_CNT, ~0u);
2453 Wait.set(AMDGPU::DS_CNT, ~0u);
2454 }
2455
2456 if (SWaitInst) {
2457 Modified = true;
2458
2459 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2460 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2461 dbgs() << "New Instr: " << *SWaitInst << '\n');
2462 }
2463 }
2464
2465 // Generate an instruction for any remaining counter that needs
2466 // waiting for.
2467
  // NOTE(review): the loop header that introduces CT is not visible in this
  // listing -- confirm against the full source.
2469 unsigned Count = Wait.get(CT);
2470 if (Count == ~0u)
2471 continue;
2472
2473 [[maybe_unused]] auto SWaitInst =
2474 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2475 .addImm(Count);
2476
2477 Modified = true;
2478
2479 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2480 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2481 dbgs() << "New Instr: " << *SWaitInst << '\n');
2482 }
2483
  // A depctr wait is only expected in expert mode (asserted below).
2484 if (Wait.hasWaitDepctr()) {
2485 assert(IsExpertMode);
2486 unsigned Enc =
  // NOTE(review): the expression that computes Enc is not visible in this
  // listing -- confirm against the full source.
2489
2490 [[maybe_unused]] auto SWaitInst =
2491 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
2492
2493 Modified = true;
2494
2495 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2496 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2497 dbgs() << "New Instr: " << *SWaitInst << '\n');
2498 }
2499
2500 return Modified;
2501}
2502
2503/// Generate s_waitcnt instruction to be placed before cur_Inst.
2504/// Instructions of a given type are returned in order,
2505/// but instructions of different types can complete out of order.
2506/// We rely on this in-order completion
2507/// and simply assign a score to the memory access instructions.
2508/// We keep track of the active "score bracket" to determine
2509/// if an access of a memory read requires an s_waitcnt
2510/// and if so what the value of each counter is.
2511/// The "score bracket" is bound by the lower bound and upper bound
2512/// scores (*_score_LB and *_score_ub respectively).
2513/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2514/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2515/// (GFX12+ only, where DS_CNT is a separate counter).
2516bool SIInsertWaitcnts::generateWaitcntInstBefore(
2517 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2518 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2519 LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
2520 setForceEmitWaitcnt();
2521
2522 assert(!isNonWaitcntMetaInst(MI));
2523
2524 AMDGPU::Waitcnt Wait;
2525 const unsigned Opc = MI.getOpcode();
2526
2527 switch (Opc) {
2528 case AMDGPU::BUFFER_WBINVL1:
2529 case AMDGPU::BUFFER_WBINVL1_SC:
2530 case AMDGPU::BUFFER_WBINVL1_VOL:
2531 case AMDGPU::BUFFER_GL0_INV:
2532 case AMDGPU::BUFFER_GL1_INV: {
2533 // FIXME: This should have already been handled by the memory legalizer.
2534 // Removing this currently doesn't affect any lit tests, but we need to
2535 // verify that nothing was relying on this. The number of buffer invalidates
2536 // being handled here should not be expanded.
2537 Wait.set(AMDGPU::LOAD_CNT, 0);
2538 break;
2539 }
2540 case AMDGPU::SI_RETURN_TO_EPILOG:
2541 case AMDGPU::SI_RETURN:
2542 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2543 case AMDGPU::S_SETPC_B64_return: {
2544 // All waits must be resolved at call return.
2545 // NOTE: this could be improved with knowledge of all call sites or
2546 // with knowledge of the called routines.
2547 ReturnInsts.insert(&MI);
2548 AMDGPU::Waitcnt AllZeroWait =
2549 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2550 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2551 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2552 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2553 // no need to wait for it at function boundaries.
2554 if (ST.hasExtendedWaitCounts() &&
2555 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2556 AllZeroWait.set(AMDGPU::LOAD_CNT, ~0u);
2557 Wait = AllZeroWait;
2558 break;
2559 }
2560 case AMDGPU::S_ENDPGM:
2561 case AMDGPU::S_ENDPGM_SAVED: {
2562 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2563 // Technically the hardware will do this on its own if we don't, but that
2564 // might cost extra cycles compared to doing it explicitly.
2565 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2566 // have to wait for outstanding VMEM stores. In this case it can be useful
2567 // to send a message to explicitly release all VGPRs before the stores have
2568 // completed, but it is only safe to do this if there are no outstanding
2569 // scratch stores.
2570 EndPgmInsts[&MI] = !ScoreBrackets.empty(AMDGPU::STORE_CNT) &&
2571 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
2572 break;
2573 }
2574 case AMDGPU::S_SENDMSG:
2575 case AMDGPU::S_SENDMSGHALT: {
2576 if (ST.hasLegacyGeometry() &&
2577 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2579 // Resolve vm waits before gs-done.
2580 Wait.set(AMDGPU::LOAD_CNT, 0);
2581 break;
2582 }
2583 [[fallthrough]];
2584 }
2585 default: {
2586
2587 // Export & GDS instructions do not read the EXEC mask until after the
2588 // export is granted (which can occur well after the instruction is issued).
2589 // The shader program must flush all EXP operations on the export-count
2590 // before overwriting the EXEC mask.
2591 if (MI.modifiesRegister(AMDGPU::EXEC, &TRI)) {
2592 // Export and GDS are tracked individually, either may trigger a waitcnt
2593 // for EXEC.
2594 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2595 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2596 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2597 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2598 Wait.set(AMDGPU::EXP_CNT, 0);
2599 }
2600 }
2601
2602 // Wait for any pending GDS instruction to complete before any
2603 // "Always GDS" instruction.
2604 if (TII.isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2605 addWait(Wait, AMDGPU::DS_CNT, ScoreBrackets.getPendingGDSWait());
2606
2607 if (MI.isCall()) {
2608 // The function is going to insert a wait on everything in its prolog.
2609 // This still needs to be careful if the call target is a load (e.g. a GOT
2610 // load). We also need to check WAW dependency with saved PC.
2611 CallInsts.insert(&MI);
2612 Wait = AMDGPU::Waitcnt();
2613
2614 const MachineOperand &CallAddrOp = TII.getCalleeOperand(MI);
2615 if (CallAddrOp.isReg()) {
2616 ScoreBrackets.determineWaitForPhysReg(
2617 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait, MI);
2618
2619 if (const auto *RtnAddrOp =
2620 TII.getNamedOperand(MI, AMDGPU::OpName::dst)) {
2621 ScoreBrackets.determineWaitForPhysReg(
2622 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait, MI);
2623 }
2624 }
2625 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2626 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2627 } else {
2628 // FIXME: Should not be relying on memoperands.
2629 // Look at the source operands of every instruction to see if
2630 // any of them results from a previous memory operation that affects
2631 // its current usage. If so, an s_waitcnt instruction needs to be
2632 // emitted.
2633 // If the source operand was defined by a load, add the s_waitcnt
2634 // instruction.
2635 //
2636 // Two cases are handled for destination operands:
2637 // 1) If the destination operand was defined by a load, add the s_waitcnt
2638 // instruction to guarantee the right WAW order.
2639 // 2) If a destination operand that was used by a recent export/store ins,
2640 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2641
2642 for (const MachineMemOperand *Memop : MI.memoperands()) {
2643 const Value *Ptr = Memop->getValue();
2644 if (Memop->isStore()) {
2645 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2646 addWait(Wait, SmemAccessCounter, 0);
2647 if (PDT.dominates(MI.getParent(), It->second))
2648 SLoadAddresses.erase(It);
2649 }
2650 }
2651 unsigned AS = Memop->getAddrSpace();
2653 continue;
2654 // No need to wait before load from VMEM to LDS.
2655 if (TII.mayWriteLDSThroughDMA(MI))
2656 continue;
2657
2658 // LOAD_CNT is only relevant to vgpr or LDS.
2659 unsigned TID = LDSDMA_BEGIN;
2660 if (Ptr && Memop->getAAInfo()) {
2661 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2662 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2663 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2664 if ((I + 1) >= NUM_LDSDMA) {
2665 // We didn't have enough slot to track this LDS DMA store, it
2666 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2667 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID,
2668 Wait);
2669 break;
2670 }
2671
2672 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT,
2673 TID + I + 1, Wait);
2674 }
2675 }
2676 } else {
2677 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID, Wait);
2678 }
2679 if (Memop->isStore()) {
2680 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::EXP_CNT, TID, Wait);
2681 }
2682 }
2683
2684 // Loop over use and def operands.
2685 for (const MachineOperand &Op : MI.operands()) {
2686 if (!Op.isReg())
2687 continue;
2688
2689 // If the instruction does not read tied source, skip the operand.
2690 if (Op.isTied() && Op.isUse() && TII.doesNotReadTiedSource(MI))
2691 continue;
2692
2693 MCPhysReg Reg = Op.getReg().asMCReg();
2694
2695 const bool IsVGPR = TRI.isVectorRegister(MRI, Op.getReg());
2696 if (IsVGPR) {
2697 // Implicit VGPR defs and uses are never a part of the memory
2698 // instructions description and usually present to account for
2699 // super-register liveness.
2700 // TODO: Most of the other instructions also have implicit uses
2701 // for the liveness accounting only.
2702 if (Op.isImplicit() && MI.mayLoadOrStore())
2703 continue;
2704
2705 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VA_VDST, Reg, Wait, MI);
2706 if (Op.isDef())
2707 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VM_VSRC, Reg, Wait,
2708 MI);
2709 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2710 // previous write and this write are the same type of VMEM
2711 // instruction, in which case they are (in some architectures)
2712 // guaranteed to write their results in order anyway.
2713 // Additionally check instructions where Point Sample Acceleration
2714 // might be applied.
2715 if (Op.isUse() || !updateVMCntOnly(MI) ||
2716 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2717 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2718 !ST.hasVmemWriteVgprInOrder()) {
2719 ScoreBrackets.determineWaitForPhysReg(AMDGPU::LOAD_CNT, Reg, Wait,
2720 MI);
2721 ScoreBrackets.determineWaitForPhysReg(AMDGPU::SAMPLE_CNT, Reg, Wait,
2722 MI);
2723 ScoreBrackets.determineWaitForPhysReg(AMDGPU::BVH_CNT, Reg, Wait,
2724 MI);
2725 ScoreBrackets.clearVgprVmemTypes(Reg);
2726 }
2727
2728 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2729 ScoreBrackets.determineWaitForPhysReg(AMDGPU::EXP_CNT, Reg, Wait,
2730 MI);
2731 }
2732 ScoreBrackets.determineWaitForPhysReg(AMDGPU::DS_CNT, Reg, Wait, MI);
2733 } else if (Op.getReg() == AMDGPU::SCC) {
2734 ScoreBrackets.determineWaitForPhysReg(AMDGPU::KM_CNT, Reg, Wait, MI);
2735 } else {
2736 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait,
2737 MI);
2738 }
2739
2740 if (ST.hasWaitXcnt() && Op.isDef())
2741 ScoreBrackets.determineWaitForPhysReg(AMDGPU::X_CNT, Reg, Wait, MI);
2742 }
2743 }
2744 }
2745 }
2746
2747 // Ensure safety against exceptions from outstanding memory operations while
2748 // waiting for a barrier:
2749 //
2750 // * Some subtargets safely handle backing off the barrier in hardware
2751 // when an exception occurs.
2752 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2753 // there can be no outstanding memory operations during the wait.
2754 // * Subtargets with split barriers don't need to back off the barrier; it
2755 // is up to the trap handler to preserve the user barrier state correctly.
2756 //
2757 // In all other cases, ensure safety by ensuring that there are no outstanding
2758 // memory operations.
2759 if (Opc == AMDGPU::S_BARRIER && !ST.hasAutoWaitcntBeforeBarrier() &&
2760 !ST.hasBackOffBarrier()) {
2761 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2762 }
2763
2764 // TODO: Remove this work-around, enable the assert for Bug 457939
2765 // after fixing the scheduler. Also, the Shader Compiler code is
2766 // independent of target.
2767 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST.hasReadVCCZBug() &&
2768 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2769 Wait.set(AMDGPU::DS_CNT, 0);
2770 }
2771
2772 // Verify that the wait is actually needed.
2773 ScoreBrackets.simplifyWaitcnt(Wait);
2774
2775 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2776 // waits on VA_VDST if the instruction it would precede is not a VALU
2777 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2778 // expert scheduling mode.
2779 if (TII.isVALU(MI))
2780 Wait.set(AMDGPU::VA_VDST, ~0u);
2781
2782 // Since the translation for VMEM addresses occur in-order, we can apply the
2783 // XCnt if the current instruction is of VMEM type and has a memory
2784 // dependency with another VMEM instruction in flight.
2785 if (Wait.get(AMDGPU::X_CNT) != ~0u && isVmemAccess(MI)) {
2786 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::X_CNT);
2787 Wait.set(AMDGPU::X_CNT, ~0u);
2788 }
2789
2790 // When forcing emit, we need to skip terminators because that would break the
2791 // terminators of the MBB if we emit a waitcnt between terminators.
2792 if (ForceEmitZeroFlag && !MI.isTerminator())
2793 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2794
2795 // If we force waitcnt then update Wait accordingly.
2797 if (!ForceEmitWaitcnt[T])
2798 continue;
2799 Wait.set(T, 0);
2800 }
2801
2802 if (FlushFlags.FlushVmCnt) {
2805 Wait.set(T, 0);
2806 }
2807
2808 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
2809 Wait.set(AMDGPU::DS_CNT, 0);
2810
2811 if (ForceEmitZeroLoadFlag && Wait.get(AMDGPU::LOAD_CNT) != ~0u)
2812 Wait.set(AMDGPU::LOAD_CNT, 0);
2813
2814 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2815 OldWaitcntInstr);
2816}
2817
2818bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2820 MachineBasicBlock &Block,
2821 WaitcntBrackets &ScoreBrackets,
2822 MachineInstr *OldWaitcntInstr) {
2823 bool Modified = false;
2824
2825 if (OldWaitcntInstr)
2826 // Try to merge the required wait with preexisting waitcnt instructions.
2827 // Also erase redundant waitcnt.
2828 Modified =
2829 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2830
2831 // ExpCnt can be merged into VINTERP.
2832 if (Wait.get(AMDGPU::EXP_CNT) != ~0u && It != Block.instr_end() &&
2834 MachineOperand *WaitExp = TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
2835 if (Wait.get(AMDGPU::EXP_CNT) < WaitExp->getImm()) {
2836 WaitExp->setImm(Wait.get(AMDGPU::EXP_CNT));
2837 Modified = true;
2838 }
2839 // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
2840 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
2841 Wait.set(AMDGPU::EXP_CNT, ~0u);
2842
2843 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2844 << "Update Instr: " << *It);
2845 }
2846
2847 if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
2848 Modified = true;
2849
2850 // Any counts that could have been applied to any existing waitcnt
2851 // instructions will have been done so, now deal with any remaining.
2852 ScoreBrackets.applyWaitcnt(Wait);
2853
2854 return Modified;
2855}
2856
// Map \p Inst to the expert-scheduling wait event it generates, if any.
//
// VALU instructions yield one of the VGPR_*_WRITE events, chosen by the
// predicates tested below. FLAT, DS and VMEM/VIMAGE/VSAMPLE instructions yield
// the corresponding VGPR_*_READ event. Every other instruction yields an empty
// optional.
2857std::optional<WaitEventType>
2858SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
2859 if (TII.isVALU(Inst)) {
2860 // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
2861 // out-of-order with respect to each other, so each of these classes
2862 // has its own event.
2863
2864 if (TII.isXDL(Inst))
2865 return VGPR_XDL_WRITE;
2866
2867 if (TII.isTRANS(Inst))
2868 return VGPR_TRANS_WRITE;
2869
// NOTE(review): this listing jumps from line 2869 to 2871, so the condition
// that guards the DPMACC return below is not visible here. Presumably it is a
// DP-MACC instruction check; restore it from the upstream file, otherwise the
// VGPR_CSMACC_WRITE return is unreachable.
2871 return VGPR_DPMACC_WRITE;
2872
2873 return VGPR_CSMACC_WRITE;
2874 }
2875
2876 // FLAT and LDS instructions may read their VGPR sources out-of-order
2877 // with respect to each other and all other VMEM instructions, so
2878 // each of these also has a separate event.
2879
2880 if (TII.isFLAT(Inst))
2881 return VGPR_FLAT_READ;
2882
2883 if (TII.isDS(Inst))
2884 return VGPR_LDS_READ;
2885
2886 if (TII.isVMEM(Inst) || TII.isVIMAGE(Inst) || TII.isVSAMPLE(Inst))
2887 return VGPR_VMEM_READ;
2888
2889 // Otherwise, no hazard.
2890
2891 return {};
2892}
2893
2894bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2895 return (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI)) ||
2896 (TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2897}
2898
2899// Return true if the next instruction is S_ENDPGM, following fallthrough
2900// blocks if necessary.
2901bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2902 MachineBasicBlock *Block) const {
2903 auto BlockEnd = Block->getParent()->end();
2904 auto BlockIter = Block->getIterator();
2905
2906 while (true) {
2907 if (It.isEnd()) {
2908 if (++BlockIter != BlockEnd) {
2909 It = BlockIter->instr_begin();
2910 continue;
2911 }
2912
2913 return false;
2914 }
2915
2916 if (!It->isMetaInstruction())
2917 break;
2918
2919 It++;
2920 }
2921
2922 assert(!It.isEnd());
2923
2924 return It->getOpcode() == AMDGPU::S_ENDPGM;
2925}
2926
2927// Add a wait after an instruction if architecture requirements mandate one.
//
// Two cases are handled:
//  * Precise memory mode is enabled and \p Inst may load or store: an all-zero
//    waitcnt is requested.
//  * \p Inst is an "always GDS" instruction: DS_CNT is waited to zero, and if
//    a wait was generated and the next instruction is S_ENDPGM, an S_NOP 0 is
//    inserted as well.
// The wait is simplified against \p ScoreBrackets and placed before the
// instruction following \p Inst. Returns the result of generateWaitcnt().
2928bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2929 MachineBasicBlock &Block,
2930 WaitcntBrackets &ScoreBrackets) {
2931 AMDGPU::Waitcnt Wait;
2932 bool NeedsEndPGMCheck = false;
2933
2934 if (ST.isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2935 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
// NOTE(review): this listing jumps from line 2935 to 2937, so the second
// operand of the `&&` and the closing of the call are not visible here.
// Restore them from the upstream file.
2937
2938 if (TII.isAlwaysGDS(Inst.getOpcode())) {
2939 Wait.set(AMDGPU::DS_CNT, 0);
2940 NeedsEndPGMCheck = true;
2941 }
2942
2943 ScoreBrackets.simplifyWaitcnt(Wait);
2944
2945 auto SuccessorIt = std::next(Inst.getIterator());
2946 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2947 /*OldWaitcntInstr=*/nullptr);
2948
// The S_NOP is built at SuccessorIt, i.e. in front of the instruction that
// originally followed Inst.
2949 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2950 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII.get(AMDGPU::S_NOP))
2951 .addImm(0);
2952 }
2953
2954 return Result;
2955}
2956
// Collect the set of wait events that \p Inst generates.
//
// In expert mode the expert-scheduling event (if any) is added first. The
// remaining events are selected by instruction class: DS using LGKM_CNT
// (GDS vs. LDS), FLAT, VMEM, SMRD, LDSDIR, EXP, SCC-writing barrier
// instructions, and finally a handful of individual scalar opcodes. The
// returned set may be empty.
2957WaitEventSet SIInsertWaitcnts::getEventsFor(const MachineInstr &Inst) const {
2958 WaitEventSet Events;
2959 if (IsExpertMode) {
2960 if (const auto ET = getExpertSchedulingEventType(Inst))
2961 Events.insert(*ET);
2962 }
2963
2964 if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
2965 if (TII.isAlwaysGDS(Inst.getOpcode()) ||
2966 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2967 Events.insert(GDS_ACCESS);
2968 Events.insert(GDS_GPR_LOCK);
2969 } else {
2970 Events.insert(LDS_ACCESS);
2971 }
2972 } else if (TII.isFLAT(Inst)) {
// NOTE(review): this listing jumps from line 2972 to 2974, so the `if` whose
// `else` appears below is not visible here. Restore it from the upstream file.
2974 Events.insert(getVmemWaitEventType(Inst));
2975 } else {
2976 assert(Inst.mayLoadOrStore());
2977 if (TII.mayAccessVMEMThroughFlat(Inst)) {
2978 if (ST.hasWaitXcnt())
2979 Events.insert(VMEM_GROUP);
2980 Events.insert(getVmemWaitEventType(Inst));
2981 }
2982 if (TII.mayAccessLDSThroughFlat(Inst))
2983 Events.insert(LDS_ACCESS);
2984 }
2985 } else if (SIInstrInfo::isVMEM(Inst) &&
// NOTE(review): this listing jumps from line 2985 to 2987, so the first half
// of the parenthesized condition is not visible here. Restore it from the
// upstream file.
2987 Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
2988 // BUFFER_WBL2 is included here because, unlike invalidates, it has to be
2989 // followed by "S_WAITCNT vmcnt(0)" to ensure the writeback has
2990 // completed.
2991 if (ST.hasWaitXcnt())
2992 Events.insert(VMEM_GROUP);
2993 Events.insert(getVmemWaitEventType(Inst));
2994 if (ST.vmemWriteNeedsExpWaitcnt() &&
2995 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2996 Events.insert(VMW_GPR_LOCK);
2997 }
2998 } else if (TII.isSMRD(Inst)) {
2999 if (ST.hasWaitXcnt())
3000 Events.insert(SMEM_GROUP);
3001 Events.insert(SMEM_ACCESS);
3002 } else if (SIInstrInfo::isLDSDIR(Inst)) {
3003 Events.insert(EXP_LDS_ACCESS);
3004 } else if (SIInstrInfo::isEXP(Inst)) {
3005 unsigned Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
// NOTE(review): this listing jumps from line 3005 to 3007, so the condition
// selecting EXP_PARAM_ACCESS is not visible here. Presumably it is a range
// check on the export target like the POS check below; confirm against the
// upstream file.
3007 Events.insert(EXP_PARAM_ACCESS);
3008 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
3009 Events.insert(EXP_POS_ACCESS);
3010 else
3011 Events.insert(EXP_GPR_LOCK);
3012 } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
3013 Events.insert(SCC_WRITE);
3014 } else {
3015 switch (Inst.getOpcode()) {
3016 case AMDGPU::S_SENDMSG:
3017 case AMDGPU::S_SENDMSG_RTN_B32:
3018 case AMDGPU::S_SENDMSG_RTN_B64:
3019 case AMDGPU::S_SENDMSGHALT:
3020 Events.insert(SQ_MESSAGE);
3021 break;
3022 case AMDGPU::S_MEMTIME:
3023 case AMDGPU::S_MEMREALTIME:
3024 case AMDGPU::S_GET_BARRIER_STATE_M0:
3025 case AMDGPU::S_GET_BARRIER_STATE_IMM:
3026 Events.insert(SMEM_ACCESS);
3027 break;
3028 }
3029 }
3030 return Events;
3031}
3032
3033void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
3034 WaitcntBrackets *ScoreBrackets) {
3035
3036 WaitEventSet InstEvents = getEventsFor(Inst);
3037 for (WaitEventType E : wait_events()) {
3038 if (InstEvents.contains(E))
3039 ScoreBrackets->updateByEvent(E, Inst);
3040 }
3041
3042 if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
3043 if (TII.isAlwaysGDS(Inst.getOpcode()) ||
3044 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
3045 ScoreBrackets->setPendingGDS();
3046 }
3047 } else if (TII.isFLAT(Inst)) {
3048 if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(Inst) &&
3049 TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst)) {
3050 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
3051 // pointers. They do have two operands that each access global and LDS,
3052 // thus making it appear at this point that they are using a flat pointer.
3053 // Filter them out, and for the rest, generate a dependency on flat
3054 // pointers so that both VM and LGKM counters are flushed.
3055 ScoreBrackets->setPendingFlat();
3056 }
3057 if (SIInstrInfo::usesASYNC_CNT(Inst)) {
3058 ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst);
3059 }
3060 } else if (Inst.isCall()) {
3061 // Act as a wait on everything, but AsyncCnt is never included in such
3062 // blanket waits.
3063 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
3064 ScoreBrackets->setStateOnFunctionEntryOrReturn();
3065 } else if (TII.isVINTERP(Inst)) {
3066 int64_t Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
3067 ScoreBrackets->applyWaitcnt(AMDGPU::EXP_CNT, Imm);
3068 }
3069
3070 // Set XCNT to zero in the bracket for instructions that implicitly drain
3071 // XCNT.
3072 if (ST.hasWaitXcnt() && SIInstrInfo::isXcntDrain(Inst))
3073 ScoreBrackets->applyWaitcnt(AMDGPU::X_CNT, 0);
3074}
3075
3076bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
3077 unsigned OtherScore) {
3078 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
3079 unsigned OtherShifted =
3080 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
3081 Score = std::max(MyShifted, OtherShifted);
3082 return OtherShifted > MyShifted;
3083}
3084
3085bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
3086 ArrayRef<CounterValueArray> OtherMarks) {
3087 bool StrictDom = false;
3088
3089 LLVM_DEBUG(dbgs() << "Merging async marks ...");
3090 // Early exit: nothing to merge when both sides are empty.
3091 if (AsyncMarks.empty() && OtherMarks.empty()) {
3092 LLVM_DEBUG(dbgs() << " nothing to merge\n");
3093 return false;
3094 }
3095 LLVM_DEBUG(dbgs() << '\n');
3096
3097 // Determine maximum length needed after merging
3098 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
3099 MaxSize = std::min(MaxSize, MaxAsyncMarks);
3100
3101 // Keep only the most recent marks within our limit.
3102 if (AsyncMarks.size() > MaxSize)
3103 AsyncMarks.erase(AsyncMarks.begin(),
3104 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
3105
3106 // Pad with zero-filled marks if our list is shorter. Zero represents "no
3107 // pending async operations at this checkpoint" and acts as the identity
3108 // element for max() during merging. We pad at the beginning since the marks
3109 // need to be aligned in most-recent order.
3110 constexpr CounterValueArray ZeroMark{};
3111 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
3112
3113 LLVM_DEBUG({
3114 dbgs() << "Before merge:\n";
3115 for (const auto &Mark : AsyncMarks) {
3116 llvm::interleaveComma(Mark, dbgs());
3117 dbgs() << '\n';
3118 }
3119 dbgs() << "Other marks:\n";
3120 for (const auto &Mark : OtherMarks) {
3121 llvm::interleaveComma(Mark, dbgs());
3122 dbgs() << '\n';
3123 }
3124 });
3125
3126 // Merge element-wise using the existing mergeScore function and the
3127 // appropriate MergeInfo for each counter type. Iterate only while we have
3128 // elements in both vectors.
3129 unsigned OtherSize = OtherMarks.size();
3130 unsigned OurSize = AsyncMarks.size();
3131 unsigned MergeCount = std::min(OtherSize, OurSize);
3132 // OtherMarks is empty -> OtherSize == 0 -> MergeCount == 0.
3133 // Our existing marks are the conservative result; return early to avoid
3134 // passing MergeCount == 0 to seq_inclusive which asserts Begin <= End.
3135 if (MergeCount == 0)
3136 return StrictDom;
3137 for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
3138 for (auto T : inst_counter_types(Context->MaxCounter)) {
3139 StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
3140 OtherMarks[OtherSize - Idx][T]);
3141 }
3142 }
3143
3144 LLVM_DEBUG({
3145 dbgs() << "After merge:\n";
3146 for (const auto &Mark : AsyncMarks) {
3147 llvm::interleaveComma(Mark, dbgs());
3148 dbgs() << '\n';
3149 }
3150 });
3151
3152 return StrictDom;
3153}
3154
3155/// Merge the pending events and associater score brackets of \p Other into
3156/// this brackets status.
3157///
3158/// Returns whether the merge resulted in a change that requires tighter waits
3159/// (i.e. the merged brackets strictly dominate the original brackets).
3160bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
3161 bool StrictDom = false;
3162
3163 // Check if "other" has keys we don't have, and create default entries for
3164 // those. If they remain empty after merging, we will clean it up after.
3165 for (auto K : Other.VMem.keys())
3166 VMem.try_emplace(K);
3167 for (auto K : Other.SGPRs.keys())
3168 SGPRs.try_emplace(K);
3169
3170 // Array to store MergeInfo for each counter type
3171 MergeInfo MergeInfos[AMDGPU::NUM_INST_CNTS];
3172
3173 for (auto T : inst_counter_types(Context->MaxCounter)) {
3174 // Merge event flags for this counter
3175 const WaitEventSet &EventsForT = Context->getWaitEvents(T);
3176 const WaitEventSet OldEvents = PendingEvents & EventsForT;
3177 const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
3178 if (!OldEvents.contains(OtherEvents))
3179 StrictDom = true;
3180 PendingEvents |= OtherEvents;
3181
3182 // Merge scores for this counter
3183 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
3184 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
3185 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
3186 if (NewUB < ScoreLBs[T])
3187 report_fatal_error("waitcnt score overflow");
3188
3189 MergeInfo &M = MergeInfos[T];
3190 M.OldLB = ScoreLBs[T];
3191 M.OtherLB = Other.ScoreLBs[T];
3192 M.MyShift = NewUB - ScoreUBs[T];
3193 M.OtherShift = NewUB - Other.ScoreUBs[T];
3194
3195 ScoreUBs[T] = NewUB;
3196
3197 if (T == AMDGPU::LOAD_CNT)
3198 StrictDom |= mergeScore(M, LastFlatLoadCnt, Other.LastFlatLoadCnt);
3199
3200 if (T == AMDGPU::DS_CNT) {
3201 StrictDom |= mergeScore(M, LastFlatDsCnt, Other.LastFlatDsCnt);
3202 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
3203 }
3204
3205 if (T == AMDGPU::KM_CNT) {
3206 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
3207 if (Other.hasPendingEvent(SCC_WRITE)) {
3208 if (!OldEvents.contains(SCC_WRITE)) {
3209 PendingSCCWrite = Other.PendingSCCWrite;
3210 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
3211 PendingSCCWrite = nullptr;
3212 }
3213 }
3214 }
3215
3216 for (auto &[RegID, Info] : VMem)
3217 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
3218
3219 if (isSmemCounter(T)) {
3220 for (auto &[RegID, Info] : SGPRs) {
3221 auto It = Other.SGPRs.find(RegID);
3222 unsigned OtherScore = (It != Other.SGPRs.end()) ? It->second.get(T) : 0;
3223 StrictDom |= mergeScore(M, Info.get(T), OtherScore);
3224 }
3225 }
3226 }
3227
3228 for (auto &[TID, Info] : VMem) {
3229 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
3230 unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
3231 StrictDom |= NewVmemTypes != Info.VMEMTypes;
3232 Info.VMEMTypes = NewVmemTypes;
3233 }
3234 }
3235
3236 StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks);
3237 for (auto T : inst_counter_types(Context->MaxCounter))
3238 StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]);
3239
3240 purgeEmptyTrackingData();
3241 return StrictDom;
3242}
3243
3244static bool isWaitInstr(MachineInstr &Inst) {
3245 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
3246 return Opcode == AMDGPU::S_WAITCNT ||
3247 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
3248 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
3249 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
3250 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
3251 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
3252 Opcode == AMDGPU::WAIT_ASYNCMARK ||
3253 counterTypeForInstr(Opcode).has_value();
3254}
3255
// Emit an S_SETREG_IMM32_B32 into \p MBB at position I that writes the
// immediate 2 when \p ExpertMode is true and 0 otherwise to the hardware
// register field described by EncodedReg.
//
// NOTE(review): this listing is missing lines 3257 and 3260. The declaration
// of the insertion-point parameter `I` and the arguments of
// HwregEncoding::encode(...) are therefore not visible here; restore them from
// the upstream file.
3256void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
3258 bool ExpertMode) const {
3259 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
3261 BuildMI(MBB, I, DebugLoc(), TII.get(AMDGPU::S_SETREG_IMM32_B32))
3262 .addImm(ExpertMode ? 2 : 0)
3263 .addImm(EncodedReg);
3264}
3265
3266namespace {
3267// TODO: Remove this work-around after fixing the scheduler.
3268// There are two reasons why vccz might be incorrect; see ST.hasReadVCCZBug()
3269// and ST.partialVCCWritesUpdateVCCZ().
3270// i. VCCZBug: There is a hardware bug on CI/SI where SMRD instruction may
3271// corrupt vccz bit, so when we detect that an instruction may read from
3272// a corrupt vccz bit, we need to:
3273// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
3274// operations to complete.
3275// 2. Recompute the correct value of vccz by writing the current value
3276// of vcc back to vcc.
3277// ii. Partial writes to vcc don't update vccz, so we need to recompute the
3278// correct value of vccz by reading vcc and writing it back to vcc.
3279// No waitcnt is needed in this case.
3280class VCCZWorkaround {
3281 const WaitcntBrackets &ScoreBrackets;
3282 const GCNSubtarget &ST;
3283 const SIInstrInfo &TII;
3284 const SIRegisterInfo &TRI;
3285 bool VCCZCorruptionBug = false;
3286 bool VCCZNotUpdatedByPartialWrites = false;
3287 /// vccz could be incorrect at a basic block boundary if a predecessor wrote
3288 /// to vcc and then issued an smem load, so initialize to true.
3289 bool MustRecomputeVCCZ = true;
3290
3291public:
3292 VCCZWorkaround(const WaitcntBrackets &ScoreBrackets, const GCNSubtarget &ST,
3293 const SIInstrInfo &TII, const SIRegisterInfo &TRI)
3294 : ScoreBrackets(ScoreBrackets), ST(ST), TII(TII), TRI(TRI) {
3295 VCCZCorruptionBug = ST.hasReadVCCZBug();
3296 VCCZNotUpdatedByPartialWrites = !ST.partialVCCWritesUpdateVCCZ();
3297 }
3298 /// If \p MI reads vccz and we must recompute it based on MustRecomputeVCCZ,
3299 /// then emit a vccz recompute instruction before \p MI. This needs to be
3300 /// called on every instruction in the basic block because it also tracks the
3301 /// state and updates MustRecomputeVCCZ accordingly. Returns true if it
3302 /// modified the IR.
3303 bool tryRecomputeVCCZ(MachineInstr &MI) {
3304 // No need to run this if neither bug is present.
3305 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
3306 return false;
3307
3308 // If MI is an SMEM and it can corrupt vccz on this target, then we need
3309 // both to emit a waitcnt and to recompute vccz.
3310 // But we don't actually emit a waitcnt here. This is done in
3311 // generateWaitcntInstBefore() because it tracks all the necessary waitcnt
3312 // state, and can either skip emitting a waitcnt if there is already one in
3313 // the IR, or emit an "optimized" combined waitcnt.
3314 // If this is an smem read, it could complete and clobber vccz at any time.
3315 MustRecomputeVCCZ |= VCCZCorruptionBug && TII.isSMRD(MI);
3316
3317 // If the target partial vcc writes don't update vccz, and MI is such an
3318 // instruction then we must recompute vccz.
3319 // Note: We are using PartiallyWritesToVCCOpt optional to avoid calling
3320 // `definesRegister()` more than needed, because it's not very cheap.
3321 std::optional<bool> PartiallyWritesToVCCOpt;
3322 auto PartiallyWritesToVCC = [](MachineInstr &MI) {
3323 return MI.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
3324 MI.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr);
3325 };
3326 if (VCCZNotUpdatedByPartialWrites) {
3327 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
3328 // If this is a partial VCC write but won't update vccz, then we must
3329 // recompute vccz.
3330 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
3331 }
3332
3333 // If MI is a vcc write with no pending smem, or there is a pending smem
3334 // but the target does not suffer from the vccz corruption bug, then we
3335 // don't need to recompute vccz as this write will recompute it anyway.
3336 if (!ScoreBrackets.hasPendingEvent(SMEM_ACCESS) || !VCCZCorruptionBug) {
3337 // Compute PartiallyWritesToVCCOpt if we haven't done so already.
3338 if (!PartiallyWritesToVCCOpt)
3339 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
3340 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
3341 MI.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr);
3342 // If we write to the full vcc or we write partially and the target
3343 // updates vccz on partial writes, then vccz will be updated correctly.
3344 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
3345 *PartiallyWritesToVCCOpt);
3346 if (UpdatesVCCZ)
3347 MustRecomputeVCCZ = false;
3348 }
3349
3350 // If MI is a branch that reads VCCZ then emit a waitcnt and a vccz
3351 // restore instruction if either is needed.
3352 if (SIInstrInfo::isCBranchVCCZRead(MI) && MustRecomputeVCCZ) {
3353 // Recompute the vccz bit. Any time a value is written to vcc, the vccz
3354 // bit is updated, so we can restore the bit by reading the value of vcc
3355 // and then writing it back to the register.
3356 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
3357 TII.get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
3358 TRI.getVCC())
3359 .addReg(TRI.getVCC());
3360 MustRecomputeVCCZ = false;
3361 return true;
3362 }
3363 return false;
3364 }
3365};
3366
3367} // namespace
3368
3369// Generate s_waitcnt instructions where needed.
3370bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
3371 MachineBasicBlock &Block,
3372 WaitcntBrackets &ScoreBrackets) {
3373 bool Modified = false;
3374
3375 LLVM_DEBUG({
3376 dbgs() << "*** Begin Block: ";
3377 Block.printName(dbgs());
3378 ScoreBrackets.dump();
3379 });
3380 VCCZWorkaround VCCZW(ScoreBrackets, ST, TII, TRI);
3381
3382 // Walk over the instructions.
3383 MachineInstr *OldWaitcntInstr = nullptr;
3384
3385 // NOTE: We may append instrs after Inst while iterating.
3386 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
3387 E = Block.instr_end();
3388 Iter != E; ++Iter) {
3389 MachineInstr &Inst = *Iter;
3390 if (isNonWaitcntMetaInst(Inst))
3391 continue;
3392 // Track pre-existing waitcnts that were added in earlier iterations or by
3393 // the memory legalizer.
3394 if (isWaitInstr(Inst) ||
3395 (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3396 if (!OldWaitcntInstr)
3397 OldWaitcntInstr = &Inst;
3398 continue;
3399 }
3400
3401 PreheaderFlushFlags FlushFlags;
3402 if (Block.getFirstTerminator() == Inst)
3403 FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3404
3405 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
3406 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3407 FlushFlags);
3408 OldWaitcntInstr = nullptr;
3409
3410 if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
3411 // Asyncmarks record the current wait state and so should not allow
3412 // waitcnts that occur after them to be merged into waitcnts that occur
3413 // before.
3414 ScoreBrackets.recordAsyncMark(Inst);
3415 continue;
3416 }
3417
3418 if (TII.isSMRD(Inst)) {
3419 for (const MachineMemOperand *Memop : Inst.memoperands()) {
3420 // No need to handle invariant loads when avoiding WAR conflicts, as
3421 // there cannot be a vector store to the same memory location.
3422 if (!Memop->isInvariant()) {
3423 const Value *Ptr = Memop->getValue();
3424 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
3425 }
3426 }
3427 }
3428
3429 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3430
3431 // Note: insertForcedWaitAfter() may add instrs after Iter that need to be
3432 // visited by the loop.
3433 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
3434
3435 LLVM_DEBUG({
3436 Inst.print(dbgs());
3437 ScoreBrackets.dump();
3438 });
3439
3440 // If the target suffers from the vccz bugs, this may emit the necessary
3441 // vccz recompute instruction before \p Inst if needed.
3442 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3443 }
3444
3445 // Flush counters at the end of the block if needed (for preheaders with no
3446 // terminator).
3447 AMDGPU::Waitcnt Wait;
3448 if (Block.getFirstTerminator() == Block.end()) {
3449 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3450 if (FlushFlags.FlushVmCnt) {
3451 if (ScoreBrackets.hasPendingEvent(AMDGPU::LOAD_CNT))
3452 Wait.set(AMDGPU::LOAD_CNT, 0);
3453 if (ScoreBrackets.hasPendingEvent(AMDGPU::SAMPLE_CNT))
3454 Wait.set(AMDGPU::SAMPLE_CNT, 0);
3455 if (ScoreBrackets.hasPendingEvent(AMDGPU::BVH_CNT))
3456 Wait.set(AMDGPU::BVH_CNT, 0);
3457 }
3458 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
3459 Wait.set(AMDGPU::DS_CNT, 0);
3460 }
3461
3462 // Combine or remove any redundant waitcnts at the end of the block.
3463 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
3464 OldWaitcntInstr);
3465
3466 LLVM_DEBUG({
3467 dbgs() << "*** End Block: ";
3468 Block.printName(dbgs());
3469 ScoreBrackets.dump();
3470 });
3471
3472 return Modified;
3473}
3474
3475bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
3476 if (Block.size() <= 1)
3477 return false;
3478 // The Memory Legalizer conservatively inserts a soft xcnt before each
3479 // atomic RMW operation. However, for sequences of back-to-back atomic
3480 // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
3481 // the redundant soft xcnts.
3482 bool Modified = false;
3483 // Remember the last atomic with a soft xcnt right before it.
3484 MachineInstr *LastAtomicWithSoftXcnt = nullptr;
3485
3486 for (MachineInstr &MI : drop_begin(Block)) {
3487 // Ignore last atomic if non-LDS VMEM and SMEM.
3488 bool IsLDS =
3489 TII.isDS(MI) || (TII.isFLAT(MI) && TII.mayAccessLDSThroughFlat(MI));
3490 if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
3491 LastAtomicWithSoftXcnt = nullptr;
3492
3493 bool IsAtomicRMW = (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) &&
3494 MI.mayLoad() && MI.mayStore();
3495 MachineInstr &PrevMI = *MI.getPrevNode();
3496 // This is an atomic with a soft xcnt.
3497 if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3498 // If we have already found an atomic with a soft xcnt, remove this soft
3499 // xcnt as it's redundant.
3500 if (LastAtomicWithSoftXcnt) {
3501 PrevMI.eraseFromParent();
3502 Modified = true;
3503 }
3504 LastAtomicWithSoftXcnt = &MI;
3505 }
3506 }
3507 return Modified;
3508}
3509
3510// Return flags indicating which counters should be flushed in the preheader.
3511PreheaderFlushFlags
3512SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3513 const WaitcntBrackets &ScoreBrackets) {
3514 auto [Iterator, IsInserted] =
3515 PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
3516 if (!IsInserted)
3517 return Iterator->second;
3518
3519 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3520 if (!Succ)
3521 return PreheaderFlushFlags();
3522
3523 MachineLoop *Loop = MLI.getLoopFor(Succ);
3524 if (!Loop)
3525 return PreheaderFlushFlags();
3526
3527 if (Loop->getLoopPreheader() == &MBB) {
3528 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3529 return Iterator->second;
3530 }
3531
3532 return PreheaderFlushFlags();
3533}
3534
3535bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
3537 return TII.mayAccessVMEMThroughFlat(MI);
3538 return SIInstrInfo::isVMEM(MI);
3539}
3540
3541bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3542 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3543}
3544
3545// Check if instruction is a store to LDS that is counted via DSCNT
3546// (where that counter exists).
3547bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3548 return MI.mayStore() && SIInstrInfo::isDS(MI);
3549}
3550
3551// Return flags indicating which counters should be flushed in the preheader of
3552// the given loop. We currently decide to flush in the following situations:
3553// For VMEM (FlushVmCnt):
3554// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3555// vgpr containing a value that is loaded outside of the loop. (Only on
3556// targets with no vscnt counter).
3557// 2. The loop contains vmem load(s), but the loaded values are not used in the
3558// loop, and at least one use of a vgpr containing a value that is loaded
3559// outside of the loop.
3560// For DS (FlushDsCnt, GFX12+ only):
3561// 3. The loop contains no DS reads, and at least one use of a vgpr containing
3562// a value that is DS read outside of the loop.
3563// 4. The loop contains DS read(s), loaded values are not used in the same
3564// iteration but in the next iteration (prefetch pattern), and at least one
3565// use of a vgpr containing a value that is DS read outside of the loop.
3566// Flushing in preheader reduces wait overhead if the wait requirement in
3567// iteration 1 would otherwise be more strict (but unfortunately preheader
3568// flush decision is taken before knowing that).
3569// 5. (Single-block loops only) The loop has DS prefetch reads with flush point
3570// tracking. Some DS reads may be used in the same iteration (creating
3571// "flush points"), but others remain unflushed at the backedge. When a DS
3572// read is consumed in the same iteration, it and all prior reads are
3573// "flushed" (FIFO order). No DS writes are allowed in the loop.
3574// TODO: Find a way to extend to multi-block loops.
//
// Parameters:
//   ML       - the loop whose blocks and instructions are scanned.
//   Brackets - counter state that is only queried (via hasPendingVMEM) to
//              learn whether a used VGPR still has a load pending.
// Returns the flags with FlushVmCnt and/or FlushDsCnt set when the matching
// situation above holds. On the early exits below, Flags is returned exactly
// as it was default-constructed.
3575PreheaderFlushFlags
3576SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
3577 const WaitcntBrackets &Brackets) {
3578 PreheaderFlushFlags Flags;
3579 bool HasVMemLoad = false;
3580 bool HasVMemStore = false;
3581 bool UsesVgprVMEMLoadedOutside = false;
3582 bool UsesVgprDSReadOutside = false;
 // Set once a VGPR is found to be both VMEM-loaded and used inside the loop;
 // this rules out situations 1. and 2.
3583 bool VMemInvalidated = false;
3584 // DS optimization only applies to GFX12+ where DS_CNT is separate.
3585 // Tracking status for "no DS read in loop" or "pure DS prefetch
3586 // (use only in next iteration)".
3587 bool TrackSimpleDSOpt = ST.hasExtendedWaitCounts();
 // Register units seen as a vector-register use, as a VMEM load def, and as
 // a DS read def, respectively, so far in the scan.
3588 DenseSet<MCRegUnit> VgprUse;
3589 DenseSet<MCRegUnit> VgprDefVMEM;
3590 DenseSet<MCRegUnit> VgprDefDS;
3591
3592 // Track DS reads for prefetch pattern with flush points (single-block only).
3593 // Keeps track of the last DS read (position counted from the top of the loop)
3594 // to each VGPR. Read is considered consumed (and thus needs flushing) if
3595 // the dest register has a use or is overwritten (by any later operations).
3596 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3597 unsigned DSReadPosition = 0;
3598 bool IsSingleBlock = ML->getNumBlocks() == 1;
3599 bool TrackDSFlushPoint = ST.hasExtendedWaitCounts() && IsSingleBlock;
 // Highest DS read position known to be consumed inside the loop; reads with
 // a larger position are still outstanding at the end of the scan.
3600 unsigned LastDSFlushPosition = 0;
3601
3602 for (MachineBasicBlock *MBB : ML->blocks()) {
3603 for (MachineInstr &MI : *MBB) {
3604 if (isVMEMOrFlatVMEM(MI)) {
3605 HasVMemLoad |= MI.mayLoad();
3606 HasVMemStore |= MI.mayStore();
3607 }
3608 // TODO: Can we relax DSStore check? There may be cases where
3609 // these DS stores are drained prior to the end of MBB (or loop).
3610 if (mayStoreIncrementingDSCNT(MI)) {
3611 // Early exit if none of the optimizations are feasible.
3612 // Otherwise, set tracking status appropriately and continue.
3613 if (VMemInvalidated)
3614 return Flags;
3615 TrackSimpleDSOpt = false;
3616 TrackDSFlushPoint = false;
3617 }
 // DS reads are numbered from 1 in scan order; the number is recorded per
 // destination register unit further down.
3618 bool IsDSRead = isDSRead(MI);
3619 if (IsDSRead)
3620 ++DSReadPosition;
3621
3622 // Helper: if RU has a pending DS read, update LastDSFlushPosition
3623 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3624 if (!TrackDSFlushPoint)
3625 return;
3626 if (auto It = LastDSReadPositionMap.find(RU);
3627 It != LastDSReadPositionMap.end()) {
3628 // RU defined by DSRead is used or overwritten. Need to complete
3629 // the read, if not already implied by a later DSRead (to any RU)
3630 // needing to complete in FIFO order.
3631 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
3632 }
3633 };
3634
 // Uses are processed before the defs of the same instruction, so a use
 // only matches defs made by earlier instructions in the scan.
3635 for (const MachineOperand &Op : MI.all_uses()) {
3636 if (Op.isDebug() || !TRI.isVectorRegister(MRI, Op.getReg()))
3637 continue;
3638 // Vgpr use
3639 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3640 // If we find a register that is loaded inside the loop, 1. and 2.
3641 // are invalidated.
3642 if (VgprDefVMEM.contains(RU))
3643 VMemInvalidated = true;
3644
3645 // Check for DS reads used inside the loop
3646 if (VgprDefDS.contains(RU))
3647 TrackSimpleDSOpt = false;
3648
3649 // Early exit if all optimizations are invalidated
3650 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3651 return Flags;
3652
3653 // Check for flush points (DS read used in same iteration)
3654 updateDSReadFlushTracking(RU);
3655
3656 VgprUse.insert(RU);
3657 // Check if this register has a pending VMEM load from outside the
3658 // loop (value loaded outside and used inside).
3659 VMEMID ID = toVMEMID(RU);
3660 if (Brackets.hasPendingVMEM(ID, AMDGPU::LOAD_CNT) ||
3661 Brackets.hasPendingVMEM(ID, AMDGPU::SAMPLE_CNT) ||
3662 Brackets.hasPendingVMEM(ID, AMDGPU::BVH_CNT))
3663 UsesVgprVMEMLoadedOutside = true;
3664 // Check if loaded outside the loop via DS (not VMEM/FLAT).
3665 // Only consider it a DS read if there's no pending VMEM load for
3666 // this register, since FLAT can set both counters.
3667 else if (Brackets.hasPendingVMEM(ID, AMDGPU::DS_CNT))
3668 UsesVgprDSReadOutside = true;
3669 }
3670 }
3671
3672 // VMem load vgpr def
3673 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
3674 for (const MachineOperand &Op : MI.all_defs()) {
3675 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3676 // If we find a register that is loaded inside the loop, 1. and 2.
3677 // are invalidated.
3678 if (VgprUse.contains(RU))
3679 VMemInvalidated = true;
3680 VgprDefVMEM.insert(RU);
3681 }
3682 }
3683 // Early exit if all optimizations are invalidated
3684 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3685 return Flags;
3686 }
3687
3688 // DS read vgpr def
3689 // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RU).
3690 // If USE comes before DEF, it's the prefetch pattern (use value from
3691 // previous iteration, read for next iteration). We should still flush
3692 // in preheader so iteration 1 doesn't need to wait inside the loop.
3693 // Only invalidate when DEF comes before USE (same-iteration consumption,
3694 // checked above when processing uses).
 // With flush-point tracking enabled, the defs of every instruction are
 // visited (not just DS reads) so that overwrites are noticed as well.
3695 if (IsDSRead || TrackDSFlushPoint) {
3696 for (const MachineOperand &Op : MI.all_defs()) {
3697 if (!TRI.isVectorRegister(MRI, Op.getReg()))
3698 continue;
3699 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3700 // Check for overwrite of pending DS read (flush point) by any
3701 // instruction
3702 updateDSReadFlushTracking(RU);
3703 if (IsDSRead) {
3704 VgprDefDS.insert(RU);
3705 if (TrackDSFlushPoint)
3706 LastDSReadPositionMap[RU] = DSReadPosition;
3707 }
3708 }
3709 }
3710 }
3711 }
3712 }
3713
3714 // VMEM flush decision
3715 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3716 ((!ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3717 (HasVMemLoad && ST.hasVmemWriteVgprInOrder())))
3718 Flags.FlushVmCnt = true;
3719
3720 // DS flush decision:
3721 // Simple DS Opt: flush if loop uses DS read values from outside
3722 // and either has no DS reads in the loop, or DS reads whose results
3723 // are not used in the loop.
3724 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3725 // Prefetch with flush points: some DS reads used in same iteration,
3726 // but unflushed reads remain at backedge
3727 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3728 bool DSFlushPointPrefetch =
3729 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3730
3731 if (SimpleDSOpt || DSFlushPointPrefetch)
3732 Flags.FlushDsCnt = true;
3733
3734 return Flags;
3735}
3736
3737bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3738 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3739 auto &PDT =
3740 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3741 AliasAnalysis *AA = nullptr;
3742 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3743 AA = &AAR->getAAResults();
3744
3745 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3746}
3747
// Pass entry point that returns PreservedAnalyses. It fetches the machine loop
// and post-dominator tree analyses from MFAM, runs SIInsertWaitcnts on MF and
// reports every analysis as preserved when run() returns false.
// NOTE(review): the signature and several statements (including the
// declaration of AA and the start of the final return) are not visible in
// this excerpt; confirm against the full file before relying on this summary.
3748PreservedAnalyses
 3751 auto &MLI = MFAM.getResult<MachineLoopAnalysis>(MF);
 3752 auto &PDT = MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
 3754 .getManager()
 3755 .getCachedResult<AAManager>(MF.getFunction());
3756
 // Nothing changed: every analysis stays valid.
 3757 if (!SIInsertWaitcnts(MLI, PDT, AA, MF).run())
 3758 return PreservedAnalyses::all();
3759
 3762 .preserve<AAManager>();
3763}
3764
// Main driver of the pass for one machine function. In order, the visible
// code:
//   1. initializes hardware limits and the waitcnt generator (WCG),
//   2. for non-entry, non-naked functions, inserts waits at the top of the
//      entry block and seeds the entry block's incoming state,
//   3. sweeps the blocks in reverse post order until a fix point is reached,
//   4. inserts S_DCACHE_WB before wave termination when scalar stores exist,
//   5. toggles expert scheduling mode around calls and returns,
//   6. inserts VGPR deallocation code before the recorded S_ENDPGM
//      instructions.
// Returns true if the function was modified.
// NOTE(review): several statements appear to be missing from this excerpt
// (e.g. the declarations of IV and I, and the loop header that introduces
// CT); the comments below describe only the visible lines.
3765bool SIInsertWaitcnts::run() {
3767
3769
 3770 // Initialize hardware limits first, as they're needed by the generators.
 3771 Limits = AMDGPU::HardwareLimits(IV);
3772
 3773 if (ST.hasExtendedWaitCounts()) {
 3774 IsExpertMode = ST.hasExpertSchedulingMode() &&
 3775 (ExpertSchedulingModeFlag.getNumOccurrences()
 3777 : MF.getFunction()
 3778 .getFnAttribute("amdgpu-expert-scheduling-mode")
 3779 .getValueAsBool());
 3780 MaxCounter = IsExpertMode ? AMDGPU::NUM_EXPERT_INST_CNTS
 3782 // Initialize WCG per MF. It contains state that depends on MF attributes.
 3783 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
 3784 IsExpertMode);
 3785 } else {
 3786 MaxCounter = AMDGPU::NUM_NORMAL_INST_CNTS;
 3787 // Initialize WCG per MF. It contains state that depends on MF attributes.
 3788 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
 3789 MF, AMDGPU::NUM_NORMAL_INST_CNTS, Limits);
 3790 }
3791
 3792 SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);
3793
 3794 bool Modified = false;
3795
 3796 MachineBasicBlock &EntryBB = MF.front();
3797
 3798 if (!MFI->isEntryFunction() &&
 3799 !MF.getFunction().hasFnAttribute(Attribute::Naked)) {
 3800 // Wait for any outstanding memory operations that the input registers may
 3801 // depend on. We can't track them and it's better to do the wait after the
 3802 // costly call sequence.
3803
 3804 // TODO: Could insert earlier and schedule more liberally with operations
 3805 // that only use caller preserved registers.
 // Skip leading meta instructions so the waits land before the first real
 // instruction of the entry block.
 3807 while (I != EntryBB.end() && I->isMetaInstruction())
 3808 ++I;
3809
 3810 if (ST.hasExtendedWaitCounts()) {
 3811 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
 3812 .addImm(0);
 3814 if (CT == AMDGPU::LOAD_CNT || CT == AMDGPU::DS_CNT ||
 3815 CT == AMDGPU::STORE_CNT || CT == AMDGPU::X_CNT ||
 3816 CT == AMDGPU::ASYNC_CNT)
 3817 continue;
3818
 3819 if (!ST.hasImageInsts() &&
 3820 (CT == AMDGPU::EXP_CNT || CT == AMDGPU::SAMPLE_CNT ||
 3821 CT == AMDGPU::BVH_CNT))
 3822 continue;
3823
 3824 BuildMI(EntryBB, I, DebugLoc(),
 3825 TII.get(instrsForExtendedCounterTypes[CT]))
 3826 .addImm(0);
 3827 }
 3828 if (IsExpertMode) {
 3829 unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, ST);
 3831 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR))
 3832 .addImm(Enc);
 3833 }
 3834 } else {
 3835 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT)).addImm(0);
 3836 }
3837
 // Seed the entry block with the function-entry state.
 3838 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
 3839 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
 3840 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3841
 3842 Modified = true;
 3843 }
3844
 3845 // Keep iterating over the blocks in reverse post order, inserting and
 3846 // updating s_waitcnt where needed, until a fix point is reached.
 3847 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
 3848 BlockInfos.try_emplace(MBB);
3849
 // Working state, reused from block to block to avoid reallocating it.
 3850 std::unique_ptr<WaitcntBrackets> Brackets;
 3851 bool Repeat;
 3852 do {
 3853 Repeat = false;
3854
 3855 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
 3856 ++BII) {
 3857 MachineBasicBlock *MBB = BII->first;
 3858 BlockInfo &BI = BII->second;
 // Blocks that are not marked dirty are skipped in this sweep.
 3859 if (!BI.Dirty)
 3860 continue;
3861
 // Start from the recorded incoming state if there is one, otherwise from
 // a freshly constructed bracket set.
 3862 if (BI.Incoming) {
 3863 if (!Brackets)
 3864 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
 3865 else
 3866 *Brackets = *BI.Incoming;
 3867 } else {
 3868 if (!Brackets) {
 3869 Brackets = std::make_unique<WaitcntBrackets>(this);
 3870 } else {
 3871 // Reinitialize in-place. N.B. do not do this by assigning from a
 3872 // temporary because the WaitcntBrackets class is large and it could
 3873 // cause this function to use an unreasonable amount of stack space.
 3874 Brackets->~WaitcntBrackets();
 3875 new (Brackets.get()) WaitcntBrackets(this);
 3876 }
 3877 }
3878
 3879 if (ST.hasWaitXcnt())
 3880 Modified |= removeRedundantSoftXcnts(*MBB);
 3881 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
 3882 BI.Dirty = false;
3883
 // Propagate the outgoing state to the successors while events are
 // still pending.
 3884 if (Brackets->hasPendingEvent()) {
 3885 BlockInfo *MoveBracketsToSucc = nullptr;
 3886 for (MachineBasicBlock *Succ : MBB->successors()) {
 3887 auto *SuccBII = BlockInfos.find(Succ);
 3888 BlockInfo &SuccBI = SuccBII->second;
 3889 if (!SuccBI.Incoming) {
 3890 SuccBI.Dirty = true;
 // The successor sits at or before the current block in the sweep
 // order, so it can only be revisited by another sweep.
 3891 if (SuccBII <= BII) {
 3892 LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
 3893 Repeat = true;
 3894 }
 3895 if (!MoveBracketsToSucc) {
 3896 MoveBracketsToSucc = &SuccBI;
 3897 } else {
 3898 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
 3899 }
 3900 } else {
 3901 LLVM_DEBUG({
 3902 dbgs() << "Try to merge ";
 3903 MBB->printName(dbgs());
 3904 dbgs() << " into ";
 3905 Succ->printName(dbgs());
 3906 dbgs() << '\n';
 3907 });
 3908 if (SuccBI.Incoming->merge(*Brackets)) {
 3909 SuccBI.Dirty = true;
 3910 if (SuccBII <= BII) {
 3911 LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
 3912 Repeat = true;
 3913 }
 3914 }
 3915 }
 3916 }
 // The first successor that had no incoming state takes ownership of the
 // working state instead of receiving a copy.
 3917 if (MoveBracketsToSucc)
 3918 MoveBracketsToSucc->Incoming = std::move(Brackets);
 3919 }
 3920 }
 3921 } while (Repeat);
3922
 3923 if (ST.hasScalarStores()) {
 3924 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
 3925 bool HaveScalarStores = false;
3926
 3927 for (MachineBasicBlock &MBB : MF) {
 3928 for (MachineInstr &MI : MBB) {
 3929 if (!HaveScalarStores && TII.isScalarStore(MI))
 3930 HaveScalarStores = true;
3931
 3932 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
 3933 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
 3934 EndPgmBlocks.push_back(&MBB);
 3935 }
 3936 }
3937
 3938 if (HaveScalarStores) {
 3939 // If scalar writes are used, the cache must be flushed or else the next
 3940 // wave to reuse the same scratch memory can be clobbered.
 3941 //
 3942 // Insert s_dcache_wb at wave termination points if there were any scalar
 3943 // stores, and only if the cache hasn't already been flushed. This could
 3944 // be improved by looking across blocks for flushes in postdominating
 3945 // blocks from the stores but an explicitly requested flush is probably
 3946 // very rare.
 3947 for (MachineBasicBlock *MBB : EndPgmBlocks) {
 3948 bool SeenDCacheWB = false;
3949
 3950 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
 3951 I != E; ++I) {
 // A scalar store after a write-back cancels it; a new flush is then
 // needed before the wave terminates.
 3952 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
 3953 SeenDCacheWB = true;
 3954 else if (TII.isScalarStore(*I))
 3955 SeenDCacheWB = false;
3956
 3957 // FIXME: It would be better to insert this before a waitcnt if any.
 3958 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
 3959 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
 3960 !SeenDCacheWB) {
 3961 Modified = true;
 3962 BuildMI(*MBB, I, I->getDebugLoc(), TII.get(AMDGPU::S_DCACHE_WB));
 3963 }
 3964 }
 3965 }
 3966 }
 3967 }
3968
 3969 if (IsExpertMode) {
 3970 // Enable expert scheduling on function entry. To satisfy ABI requirements
 3971 // and to allow calls between function with different expert scheduling
 3972 // settings, disable it around calls and before returns.
3973
 3975 while (I != EntryBB.end() && I->isMetaInstruction())
 3976 ++I;
 3977 setSchedulingMode(EntryBB, I, true);
3978
 3979 for (MachineInstr *MI : CallInsts) {
 3980 MachineBasicBlock &MBB = *MI->getParent();
 3981 setSchedulingMode(MBB, MI, false);
 3982 setSchedulingMode(MBB, std::next(MI->getIterator()), true);
 3983 }
3984
 3985 for (MachineInstr *MI : ReturnInsts)
 3986 setSchedulingMode(*MI->getParent(), MI, false);
3987
 3988 Modified = true;
 3989 }
3990
 3991 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
 3992 // This is done in different ways depending on how the VGPRs were allocated
 3993 // (i.e. whether we're in dynamic VGPR mode or not).
 3994 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
 3995 // waveslot limited kernel runs slower with the deallocation.
 3996 if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
 3997 for (auto [MI, _] : EndPgmInsts) {
 3998 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
 3999 TII.get(AMDGPU::S_ALLOC_VGPR))
 4000 .addImm(0);
 4001 Modified = true;
 4002 }
 4003 } else if (!WCG->isOptNone() &&
 4004 ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
 4005 (MF.getFrameInfo().hasCalls() ||
 4006 ST.getOccupancyWithNumVGPRs(
 4007 TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
 4008 /*IsDynamicVGPR=*/false) <
 4010 for (auto [MI, Flag] : EndPgmInsts) {
 // Only S_ENDPGM instructions recorded with a set flag receive the
 // deallocation message.
 4011 if (Flag) {
 4012 if (ST.requiresNopBeforeDeallocVGPRs()) {
 4013 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
 4014 TII.get(AMDGPU::S_NOP))
 4015 .addImm(0);
 4016 }
 4017 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
 4018 TII.get(AMDGPU::S_SENDMSG))
 4020 Modified = true;
 4021 }
 4022 }
 4023 }
4024
 4025 return Modified;
4026}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
#define _
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
#define AMDGPU_EVENT_NAME(Name)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static std::optional< AMDGPU::InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static bool isWaitInstr(MachineInstr &Inst)
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
Represents the counter values to wait for in an s_waitcnt instruction.
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
static bool shouldExecute(CounterInfo &Counter)
static bool isCounterSet(CounterInfo &Info)
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:254
bool erase(const KeyT &Val)
Definition DenseMap.h:332
iterator end()
Definition DenseMap.h:85
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:239
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:728
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator begin()
Definition MapVector.h:67
iterator find(const KeyT &Key)
Definition MapVector.h:156
iterator end()
Definition MapVector.h:69
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition Pass.cpp:140
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:882
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:185
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isDPMACCInstruction(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getHasMatrixScale(unsigned Opc)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
bool empty() const
Definition BasicBlock.h:101
LLVM_ABI std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
Definition Sequence.h:325
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
APInt operator&(APInt a, const APInt &b)
Definition APInt.h:2152
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition Sequence.h:337
@ Wait
Definition Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2142
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2312
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition STLExtras.h:633
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
Definition CodeGen.h:157
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool operator|=(SparseBitVector< ElementSize > &LHS, const SparseBitVector< ElementSize > *RHS)
APInt operator|(APInt a, const APInt &b)
Definition APInt.h:2172
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
#define N
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.