//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
///
/// TODO: This pass currently keeps one timeline per hardware counter. A more
/// finely-grained approach that keeps one timeline per event type could
/// sometimes get away with generating weaker s_waitcnt instructions. For
/// example, when both SMEM and LDS are in flight and we need to wait for
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
/// but the pass will currently generate a conservative lgkmcnt(0) because
/// multiple event types are in flight.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUWaitcntUtils.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/IR/Dominators.h"

using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<bool>
    ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
                      cl::desc("Force all waitcnt instrs to be emitted as "
                               "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
                      cl::init(false), cl::Hidden);

63 "amdgpu-waitcnt-load-forcezero",
64 cl::desc("Force all waitcnt load counters to wait until 0"),
65 cl::init(false), cl::Hidden);
66
68 "amdgpu-expert-scheduling-mode",
69 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
70 cl::init(false), cl::Hidden);
71
namespace {
// Get the maximum wait count value for a given counter type.
static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
                                AMDGPU::InstCounterType T) {
  switch (T) {
  case AMDGPU::LOAD_CNT:
    return Limits.LoadcntMax;
  case AMDGPU::DS_CNT:
    return Limits.DscntMax;
  case AMDGPU::EXP_CNT:
    return Limits.ExpcntMax;
  case AMDGPU::STORE_CNT:
    return Limits.StorecntMax;
  case AMDGPU::SAMPLE_CNT:
    return Limits.SamplecntMax;
  case AMDGPU::BVH_CNT:
    return Limits.BvhcntMax;
  case AMDGPU::KM_CNT:
    return Limits.KmcntMax;
  case AMDGPU::X_CNT:
    return Limits.XcntMax;
  case AMDGPU::VA_VDST:
    return Limits.VaVdstMax;
  case AMDGPU::VM_VSRC:
    return Limits.VmVsrcMax;
  default:
    return 0;
  }
}

/// Integer IDs used to track vector memory locations we may have to wait on.
/// Encoded as u16 chunks:
///
/// [0,            REGUNITS_END ): MCRegUnit
/// [LDSDMA_BEGIN, LDSDMA_END  ) : LDS DMA IDs
///
/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
/// It gives (1 << 16) entries per category, which is more than enough
/// for all register units. MCPhysReg is u16 so we don't even support >u16
/// physical register numbers at this time, let alone >u16 register units.
/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
/// is enough for all register units.
using VMEMID = uint32_t;

enum : VMEMID {
  TRACKINGID_RANGE_LEN = (1 << 16),

  // Important: MCRegUnits must always be tracked starting from 0, as we
  // need to be able to convert between an MCRegUnit and a VMEMID freely.
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,

  // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
  // entry, which is updated for all LDS DMA operations encountered.
  // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};

/// Convert an MCRegUnit to a VMEMID.
static constexpr VMEMID toVMEMID(MCRegUnit RU) {
  return static_cast<unsigned>(RU);
}
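
// For example, with TRACKINGID_RANGE_LEN == 0x10000, register unit 5 is
// tracked as VMEMID 5, the common LDS DMA entry is VMEMID 0x10000
// (LDSDMA_BEGIN), and LDS DMA slot 3 is VMEMID 0x10003, which corresponds to
// index 2 (Slot - 1) in the LDSDMAStores vector defined further down.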

#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
  DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */   \
  DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */        \
  DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */                \
  DECL(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */                      \
  DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */                \
  DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */             \
  DECL(VMEM_GROUP) /* vmem group */                                           \
  DECL(LDS_ACCESS) /* lds read & write */                                     \
  DECL(GDS_ACCESS) /* gds read & write */                                     \
  DECL(SQ_MESSAGE) /* send message */                                         \
  DECL(SCC_WRITE) /* write to SCC from barrier */                             \
  DECL(SMEM_ACCESS) /* scalar-memory read & write */                          \
  DECL(SMEM_GROUP) /* scalar-memory group */                                  \
  DECL(EXP_GPR_LOCK) /* export holding on its data src */                     \
  DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */               \
  DECL(EXP_POS_ACCESS) /* write to export position */                         \
  DECL(EXP_PARAM_ACCESS) /* write to export parameter */                      \
  DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */                 \
  DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */                \
  DECL(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */        \
  DECL(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */                \
  DECL(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */                  \
  DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */                      \
  DECL(VGPR_LDS_READ) /* read VGPR source in LDS */                           \
  DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */                         \
  DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */                   \
  DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */

// clang-format off
#define AMDGPU_EVENT_ENUM(Name) Name,
enum WaitEventType {
  AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
  NUM_WAIT_EVENTS
};
#undef AMDGPU_EVENT_ENUM
} // namespace

namespace llvm {
template <> struct enum_iteration_traits<WaitEventType> {
  static constexpr bool is_iterable = true;
};
} // namespace llvm

namespace {

/// Return an iterator over all events between VMEM_ACCESS (the first event)
/// and \c MaxEvent (exclusive; the default value yields an enumeration over
/// all events).
auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
  return enum_seq(VMEM_ACCESS, MaxEvent);
}

#define AMDGPU_EVENT_NAME(Name) #Name,
static constexpr StringLiteral WaitEventTypeName[] = {
    AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
};
#undef AMDGPU_EVENT_NAME
static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
  return WaitEventTypeName[Event];
}
// clang-format on

// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  NUM_VMEM_TYPES
};
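
// E.g. two BUFFER_LOADs (both VMEM_NOSAMPLER) writing the same vgpr complete
// in issue order and need no wait between them, while a BUFFER_LOAD followed
// by an IMAGE_SAMPLE (VMEM_SAMPLER) writing the same vgpr has no such
// ordering guarantee.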

// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true, and does not cover VA_VDST or VM_VSRC.
static const unsigned
    instrsForExtendedCounterTypes[AMDGPU::NUM_EXTENDED_INST_CNTS] = {
        AMDGPU::S_WAIT_LOADCNT,   AMDGPU::S_WAIT_DSCNT,
        AMDGPU::S_WAIT_EXPCNT,    AMDGPU::S_WAIT_STORECNT,
        AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
        AMDGPU::S_WAIT_KMCNT,     AMDGPU::S_WAIT_XCNT,
        AMDGPU::S_WAIT_ASYNCCNT};

// ASYNCMARK and WAIT_ASYNCMARK are meta instructions that emit no hardware
// code but still need to be processed by this pass for async vmcnt tracking.
static bool isNonWaitcntMetaInst(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::ASYNCMARK:
  case AMDGPU::WAIT_ASYNCMARK:
    return false;
  default:
    return MI.isMetaInstruction();
  }
}

static bool updateVMCntOnly(const MachineInstr &Inst) {
  return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
         SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst);
}

#ifndef NDEBUG
static bool isNormalMode(AMDGPU::InstCounterType MaxCounter) {
  return MaxCounter == AMDGPU::NUM_NORMAL_INST_CNTS;
}
#endif // NDEBUG

VmemType getVmemType(const MachineInstr &Inst) {
  assert(updateVMCntOnly(Inst));
  if (!SIInstrInfo::isImage(Inst))
    return VMEM_NOSAMPLER;
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
      AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);

  if (BaseInfo->BVH)
    return VMEM_BVH;

  // We have to make an additional check for isVSAMPLE here since some
  // instructions don't have a sampler, but are still classified as sampler
  // instructions for the purposes of e.g. waitcnt.
  if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
    return VMEM_SAMPLER;

  return VMEM_NOSAMPLER;
}

void addWait(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T, unsigned Count) {
  Wait.set(T, std::min(Wait.get(T), Count));
}

void setNoWait(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T) {
  Wait.set(T, ~0u);
}

/// A small set of events.
class WaitEventSet {
  unsigned Mask = 0;

public:
  WaitEventSet() = default;
  explicit constexpr WaitEventSet(WaitEventType Event) {
    static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,
                  "Not enough bits in Mask for all the events");
    Mask |= 1 << Event;
  }
  constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
    for (auto &E : Events) {
      Mask |= 1 << E;
    }
  }
  void insert(const WaitEventType &Event) { Mask |= 1 << Event; }
  void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }
  void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
  bool contains(const WaitEventType &Event) const {
    return Mask & (1 << Event);
  }
  /// \Returns true if this set contains all elements of \p Other.
  bool contains(const WaitEventSet &Other) const {
    return (~Mask & Other.Mask) == 0;
  }
  /// \Returns the intersection of this and \p Other.
  WaitEventSet operator&(const WaitEventSet &Other) const {
    auto Copy = *this;
    Copy.Mask &= Other.Mask;
    return Copy;
  }
  /// \Returns the union of this and \p Other.
  WaitEventSet operator|(const WaitEventSet &Other) const {
    auto Copy = *this;
    Copy.Mask |= Other.Mask;
    return Copy;
  }
  /// This set becomes the union of this and \p Other.
  WaitEventSet &operator|=(const WaitEventSet &Other) {
    Mask |= Other.Mask;
    return *this;
  }
  /// This set becomes the intersection of this and \p Other.
  WaitEventSet &operator&=(const WaitEventSet &Other) {
    Mask &= Other.Mask;
    return *this;
  }
  bool operator==(const WaitEventSet &Other) const {
    return Mask == Other.Mask;
  }
  bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }
  bool empty() const { return Mask == 0; }
  /// \Returns true if the set contains more than one element.
  bool twoOrMore() const { return Mask & (Mask - 1); }
  operator bool() const { return !empty(); }
  void print(raw_ostream &OS) const {
    ListSeparator LS(", ");
    for (WaitEventType Event : wait_events()) {
      if (contains(Event))
        OS << LS << getWaitEventTypeName(Event);
    }
  }
  LLVM_DUMP_METHOD void dump() const;
};

void WaitEventSet::dump() const {
  print(dbgs());
  dbgs() << "\n";
}
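
// WaitEventSet stores the set as a bitmask, so e.g.
// WaitEventSet({SMEM_ACCESS, LDS_ACCESS}) has two bits set and twoOrMore()
// returns true: Mask & (Mask - 1) clears the lowest set bit and is nonzero
// exactly when a second bit remains. hasMixedPendingEvents() below relies on
// this to detect when a counter can be decremented out of order.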

class WaitcntBrackets;

// This abstracts the logic for generating and updating S_WAIT* instructions
// away from the analysis that determines where they are needed. This was
// done because the set of counters and instructions for waiting on them
// underwent a major shift with gfx12, sufficiently so that having this
// abstraction allows the main analysis logic to be simpler than it would
// otherwise have had to become.
class WaitcntGenerator {
protected:
  const GCNSubtarget &ST;
  const SIInstrInfo &TII;
  AMDGPU::IsaVersion IV;
  AMDGPU::InstCounterType MaxCounter;
  bool OptNone;
  bool ExpandWaitcntProfiling = false;
  const AMDGPU::HardwareLimits &Limits;

public:
  WaitcntGenerator() = delete;
  WaitcntGenerator(const WaitcntGenerator &) = delete;
  WaitcntGenerator(const MachineFunction &MF,
                   AMDGPU::InstCounterType MaxCounter,
                   const AMDGPU::HardwareLimits &Limits)
      : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
        IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
        OptNone(MF.getFunction().hasOptNone() ||
                MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
        ExpandWaitcntProfiling(
            MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
        Limits(Limits) {}

  // Return true if the current function should be compiled with no
  // optimization.
  bool isOptNone() const { return OptNone; }

  const AMDGPU::HardwareLimits &getLimits() const { return Limits; }

  // Edits an existing sequence of wait count instructions according
  // to an incoming Waitcnt value, which is itself updated to reflect
  // any new wait count instructions which may need to be generated by
  // WaitcntGenerator::createNewWaitcnt(). Returns true if any edits
  // were made.
  //
  // This editing will usually merely update operands, but it may also
  // delete instructions if the incoming Wait value indicates they are not
  // needed. It may also remove existing instructions for which a wait
  // is needed if it can be determined that it is better to generate new
  // instructions later, as can happen on gfx12.
  virtual bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const = 0;

  // Transform a soft waitcnt into a normal one.
  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;

  // Generates new wait count instructions according to the value of
  // Wait, returning true if any new instructions were created.
  // ScoreBrackets is used for profiling expansion.
  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                MachineBasicBlock::instr_iterator It,
                                AMDGPU::Waitcnt Wait,
                                const WaitcntBrackets &ScoreBrackets) = 0;

  // Returns the WaitEventSet that corresponds to counter \p T.
  virtual const WaitEventSet &
  getWaitEvents(AMDGPU::InstCounterType T) const = 0;

  /// \returns the counter that corresponds to event \p E.
  AMDGPU::InstCounterType getCounterFromEvent(WaitEventType E) const {
    for (auto T : AMDGPU::inst_counter_types()) {
      if (getWaitEvents(T).contains(E))
        return T;
    }
    llvm_unreachable("event type has no associated counter");
  }

  // Returns a new waitcnt with all counters except VScnt set to 0. If
  // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
  // AsyncCnt always defaults to ~0u (don't wait for it). It is only updated
  // when a call to @llvm.amdgcn.wait.asyncmark() is processed.
  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;

  virtual ~WaitcntGenerator() = default;
};
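
// Concretely, for the same load dependency the pre-gfx12 generator emits a
// combined instruction such as "s_waitcnt vmcnt(N)", while the gfx12+
// generator emits one instruction per counter, e.g. "s_wait_loadcnt N" (see
// instrsForExtendedCounterTypes above).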

class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
  static constexpr const WaitEventSet
      WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
          WaitEventSet(
              {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
          WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
          WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
                        EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
          WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
          WaitEventSet(),
          WaitEventSet(),
          WaitEventSet(),
          WaitEventSet(),
          WaitEventSet(),
          WaitEventSet()};

public:
  using WaitcntGenerator::WaitcntGenerator;
  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait,
                        const WaitcntBrackets &ScoreBrackets) override;

  const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
    return WaitEventMaskForInstPreGFX12[T];
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};

class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
protected:
  bool IsExpertMode;
  static constexpr const WaitEventSet
      WaitEventMaskForInstGFX12Plus[AMDGPU::NUM_INST_CNTS] = {
          WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
          WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
          WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
                        EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
          WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
          WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
          WaitEventSet({VMEM_BVH_READ_ACCESS}),
          WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
          WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
          WaitEventSet({ASYNC_ACCESS}),
          WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
                        VGPR_XDL_WRITE}),
          WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};

public:
  WaitcntGeneratorGFX12Plus() = delete;
  WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
                            AMDGPU::InstCounterType MaxCounter,
                            const AMDGPU::HardwareLimits &Limits,
                            bool IsExpertMode)
      : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}

  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait,
                        const WaitcntBrackets &ScoreBrackets) override;

  const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
    return WaitEventMaskForInstGFX12Plus[T];
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};

// Flags indicating which counters should be flushed in a loop preheader.
struct PreheaderFlushFlags {
  bool FlushVmCnt = false;
  bool FlushDsCnt = false;
};

class SIInsertWaitcnts {
  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
  DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
  MachineLoopInfo &MLI;
  MachinePostDominatorTree &PDT;
  AliasAnalysis *AA = nullptr;
  MachineFunction &MF;

  struct BlockInfo {
    std::unique_ptr<WaitcntBrackets> Incoming;
    bool Dirty = true;
    BlockInfo() = default;
    BlockInfo(BlockInfo &&) = default;
    BlockInfo &operator=(BlockInfo &&) = default;
    ~BlockInfo();
  };

  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;

  bool ForceEmitWaitcnt[AMDGPU::NUM_INST_CNTS] = {};

  std::unique_ptr<WaitcntGenerator> WCG;

  // Remember call and return instructions in the function.
  DenseSet<MachineInstr *> CallInsts;
  DenseSet<MachineInstr *> ReturnInsts;

  // Remember all S_ENDPGM instructions. The boolean flag is true if there
  // might be outstanding stores but definitely no outstanding scratch stores,
  // to help with insertion of DEALLOC_VGPRS messages.
  DenseMap<MachineInstr *, bool> EndPgmInsts;

  AMDGPU::HardwareLimits Limits;

public:
  const GCNSubtarget &ST;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  const MachineRegisterInfo &MRI;
  AMDGPU::InstCounterType SmemAccessCounter;
  AMDGPU::InstCounterType MaxCounter;
  bool IsExpertMode = false;

  SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
                   AliasAnalysis *AA, MachineFunction &MF)
      : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
        TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
        MRI(MF.getRegInfo()) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  const AMDGPU::HardwareLimits &getLimits() const { return Limits; }

  PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
                                             const WaitcntBrackets &Brackets);
  PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
                                         const WaitcntBrackets &ScoreBrackets);
  bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
  bool isDSRead(const MachineInstr &MI) const;
  bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
  bool run();

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, get the debug counter info and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[AMDGPU::EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[AMDGPU::EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[AMDGPU::DS_CNT] = true;
      ForceEmitWaitcnt[AMDGPU::KM_CNT] = true;
    } else {
      ForceEmitWaitcnt[AMDGPU::DS_CNT] = false;
      ForceEmitWaitcnt[AMDGPU::KM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[AMDGPU::LOAD_CNT] = true;
      ForceEmitWaitcnt[AMDGPU::SAMPLE_CNT] = true;
      ForceEmitWaitcnt[AMDGPU::BVH_CNT] = true;
    } else {
      ForceEmitWaitcnt[AMDGPU::LOAD_CNT] = false;
      ForceEmitWaitcnt[AMDGPU::SAMPLE_CNT] = false;
      ForceEmitWaitcnt[AMDGPU::BVH_CNT] = false;
    }

    ForceEmitWaitcnt[AMDGPU::VA_VDST] = false;
    ForceEmitWaitcnt[AMDGPU::VM_VSRC] = false;
#endif // NDEBUG
  }

  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
  // instruction.
  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
    switch (Inst.getOpcode()) {
    // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
    case AMDGPU::GLOBAL_INV:
      return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
                                // VGPRs
    case AMDGPU::GLOBAL_WB:
    case AMDGPU::GLOBAL_WBINV:
      return VMEM_WRITE_ACCESS; // tracked using storecnt
    default:
      break;
    }

    // Maps VMEM access types to their corresponding WaitEventType.
    static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
        VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};

    // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
    // these should use VM_CNT.
    if (!ST.hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
      return VMEM_ACCESS;
    if (Inst.mayStore() &&
        (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
      if (TII.mayAccessScratch(Inst))
        return SCRATCH_WRITE_ACCESS;
      return VMEM_WRITE_ACCESS;
    }
    if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
      return VMEM_ACCESS;
    return VmemReadMapping[getVmemType(Inst)];
  }

  std::optional<WaitEventType>
  getExpertSchedulingEventType(const MachineInstr &Inst) const;

  bool isAsync(const MachineInstr &MI) const {
    return false;
    return true;
    const MachineOperand *Async =
        TII.getNamedOperand(MI, AMDGPU::OpName::IsAsync);
    return Async && (Async->getImm());
  }

  bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
    return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
  }

  bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
    return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
  }

  bool shouldUpdateAsyncMark(const MachineInstr &MI,
                             AMDGPU::InstCounterType T) const {
    if (!isAsyncLdsDmaWrite(MI))
      return false;
    return T == AMDGPU::ASYNC_CNT;
    return T == AMDGPU::LOAD_CNT;
  }

  bool isVmemAccess(const MachineInstr &MI) const;
  bool generateWaitcntInstBefore(MachineInstr &MI,
                                 WaitcntBrackets &ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr,
                                 PreheaderFlushFlags FlushFlags);
  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
                       MachineBasicBlock::instr_iterator It,
                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
                       MachineInstr *OldWaitcntInstr);
  /// \returns all events that correspond to \p Inst.
  WaitEventSet getEventsFor(const MachineInstr &Inst) const;
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               WaitcntBrackets *ScoreBrackets);
  bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
                    MachineBasicBlock *Block) const;
  bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
  /// Removes redundant soft Xcnt waitcnts in \p Block emitted by the Memory
  /// Legalizer. Returns true if the block was modified.
  bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
  void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         bool ExpertMode) const;
  const WaitEventSet &getWaitEvents(AMDGPU::InstCounterType T) const {
    return WCG->getWaitEvents(T);
  }
  AMDGPU::InstCounterType getCounterFromEvent(WaitEventType E) const {
    return WCG->getCounterFromEvent(E);
  }
};

// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple event types are in flight within the bracket,
// the wait count may be decremented out of order, so we need to put in an
// "s_waitcnt 0" before use.
class WaitcntBrackets {
public:
  WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
    assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
  }

#ifndef NDEBUG
  ~WaitcntBrackets() {
    unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
    for (auto &[ID, Val] : VMem) {
      if (Val.empty())
        ++NumUnusedVmem;
    }
    for (auto &[ID, Val] : SGPRs) {
      if (Val.empty())
        ++NumUnusedSGPRs;
    }

    if (NumUnusedVmem || NumUnusedSGPRs) {
      errs() << "WaitcntBracket had unused entries at destruction time: "
             << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
             << " SGPR unused entries\n";
      std::abort();
    }
  }
#endif

  bool isSmemCounter(AMDGPU::InstCounterType T) const {
    return T == Context->SmemAccessCounter || T == AMDGPU::X_CNT;
  }

  unsigned getOutstanding(AMDGPU::InstCounterType T) const {
    return ScoreUBs[T] - ScoreLBs[T];
  }

  bool hasPendingVMEM(VMEMID ID, AMDGPU::InstCounterType T) const {
    return getVMemScore(ID, T) > getScoreLB(T);
  }

  /// \Return true if we have no score entries for counter \p T.
  bool empty(AMDGPU::InstCounterType T) const { return getScoreRange(T) == 0; }

private:
  unsigned getScoreLB(AMDGPU::InstCounterType T) const {
    assert(T < AMDGPU::NUM_INST_CNTS);
    return ScoreLBs[T];
  }

  unsigned getScoreUB(AMDGPU::InstCounterType T) const {
    assert(T < AMDGPU::NUM_INST_CNTS);
    return ScoreUBs[T];
  }

  unsigned getScoreRange(AMDGPU::InstCounterType T) const {
    return getScoreUB(T) - getScoreLB(T);
  }

  unsigned getSGPRScore(MCRegUnit RU, AMDGPU::InstCounterType T) const {
    auto It = SGPRs.find(RU);
    return It != SGPRs.end() ? It->second.get(T) : 0;
  }

  unsigned getVMemScore(VMEMID TID, AMDGPU::InstCounterType T) const {
    auto It = VMem.find(TID);
    return It != VMem.end() ? It->second.Scores[T] : 0;
  }

public:
  bool merge(const WaitcntBrackets &Other);

  bool counterOutOfOrder(AMDGPU::InstCounterType T) const;
  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
    simplifyWaitcnt(Wait, Wait);
  }
  void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
                       AMDGPU::Waitcnt &UpdateWait) const;
  void simplifyWaitcnt(AMDGPU::InstCounterType T, unsigned &Count) const;
  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T) const;
  void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
                    AMDGPU::Waitcnt &UpdateWait) const;
  void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
                      AMDGPU::Waitcnt &UpdateWait) const;

  void determineWaitForPhysReg(AMDGPU::InstCounterType T, MCPhysReg Reg,
                               AMDGPU::Waitcnt &Wait,
                               const MachineInstr &MI) const;
  MCPhysReg determineVGPR16Dependency(const MachineInstr &MI,
                                      MCPhysReg Reg) const;
  void determineWaitForLDSDMA(AMDGPU::InstCounterType T, VMEMID TID,
                              AMDGPU::Waitcnt &Wait) const;
  AMDGPU::Waitcnt determineAsyncWait(unsigned N);
  void tryClearSCCWriteEvent(MachineInstr *Inst);

  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count);
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T);
  void updateByEvent(WaitEventType E, MachineInstr &MI);
  void recordAsyncMark(MachineInstr &MI);

  bool hasPendingEvent() const { return !PendingEvents.empty(); }
  bool hasPendingEvent(WaitEventType E) const {
    return PendingEvents.contains(E);
  }
  bool hasPendingEvent(AMDGPU::InstCounterType T) const {
    bool HasPending = PendingEvents & Context->getWaitEvents(T);
    assert(HasPending == !empty(T) &&
           "Expected pending events iff scoreboard is not empty");
    return HasPending;
  }

  bool hasMixedPendingEvents(AMDGPU::InstCounterType T) const {
    WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
    // Return true if more than one bit is set in Events.
    return Events.twoOrMore();
  }

  bool hasPendingFlat() const {
    return ((LastFlatDsCnt > ScoreLBs[AMDGPU::DS_CNT] &&
             LastFlatDsCnt <= ScoreUBs[AMDGPU::DS_CNT]) ||
            (LastFlatLoadCnt > ScoreLBs[AMDGPU::LOAD_CNT] &&
             LastFlatLoadCnt <= ScoreUBs[AMDGPU::LOAD_CNT]));
  }

  void setPendingFlat() {
    LastFlatLoadCnt = ScoreUBs[AMDGPU::LOAD_CNT];
    LastFlatDsCnt = ScoreUBs[AMDGPU::DS_CNT];
  }

  bool hasPendingGDS() const {
    return LastGDS > ScoreLBs[AMDGPU::DS_CNT] &&
           LastGDS <= ScoreUBs[AMDGPU::DS_CNT];
  }

  unsigned getPendingGDSWait() const {
    return std::min(getScoreUB(AMDGPU::DS_CNT) - LastGDS,
                    getWaitCountMax(Context->getLimits(), AMDGPU::DS_CNT) - 1);
  }

  void setPendingGDS() { LastGDS = ScoreUBs[AMDGPU::DS_CNT]; }
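
  // E.g. if the DS_CNT upper bound is 12 and the last GDS operation scored 9,
  // three newer DS events have been issued since, so waiting for dscnt(3) is
  // enough to cover the GDS operation (clamped to the counter maximum - 1).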

  // Return true if there might be pending writes to the given register by VMEM
  // instructions with types different from V.
  bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
    for (MCRegUnit RU : regunits(Reg)) {
      auto It = VMem.find(toVMEMID(RU));
      if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
        return true;
    }
    return false;
  }

  void clearVgprVmemTypes(MCPhysReg Reg) {
    for (MCRegUnit RU : regunits(Reg)) {
      if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
        It->second.VMEMTypes = 0;
        if (It->second.empty())
          VMem.erase(It);
      }
    }
  }

  void setStateOnFunctionEntryOrReturn() {
    setScoreUB(AMDGPU::STORE_CNT,
               getScoreUB(AMDGPU::STORE_CNT) +
                   getWaitCountMax(Context->getLimits(), AMDGPU::STORE_CNT));
    PendingEvents |= Context->getWaitEvents(AMDGPU::STORE_CNT);
  }

  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
    return LDSDMAStores;
  }

  bool hasPointSampleAccel(const MachineInstr &MI) const;
  bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
                                      MCPhysReg RU) const;

  void print(raw_ostream &) const;
  void dump() const { print(dbgs()); }

  // Free up memory by removing empty entries from the DenseMaps that track
  // event scores.
  void purgeEmptyTrackingData();

private:
  struct MergeInfo {
    unsigned OldLB;
    unsigned OtherLB;
    unsigned MyShift;
    unsigned OtherShift;
  };

  using CounterValueArray = std::array<unsigned, AMDGPU::NUM_INST_CNTS>;

  void determineWaitForScore(AMDGPU::InstCounterType T, unsigned Score,
                             AMDGPU::Waitcnt &Wait) const;

  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);
  bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
                       ArrayRef<CounterValueArray> OtherMarks);

  iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
    assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
    if (!Context->TRI.isInAllocatableClass(Reg))
      return {{}, {}};
    return Context->TRI.regunits(Reg);
  }

  void setScoreLB(AMDGPU::InstCounterType T, unsigned Val) {
    assert(T < AMDGPU::NUM_INST_CNTS);
    ScoreLBs[T] = Val;
  }

  void setScoreUB(AMDGPU::InstCounterType T, unsigned Val) {
    assert(T < AMDGPU::NUM_INST_CNTS);
    ScoreUBs[T] = Val;

    if (T != AMDGPU::EXP_CNT)
      return;

    if (getScoreRange(AMDGPU::EXP_CNT) >
        getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT))
      ScoreLBs[AMDGPU::EXP_CNT] =
          ScoreUBs[AMDGPU::EXP_CNT] -
          getWaitCountMax(Context->getLimits(), AMDGPU::EXP_CNT);
  }

  void setRegScore(MCPhysReg Reg, AMDGPU::InstCounterType T, unsigned Val) {
    const SIRegisterInfo &TRI = Context->TRI;
    if (Reg == AMDGPU::SCC) {
      SCCScore = Val;
    } else if (TRI.isVectorRegister(Context->MRI, Reg)) {
      for (MCRegUnit RU : regunits(Reg))
        VMem[toVMEMID(RU)].Scores[T] = Val;
    } else if (TRI.isSGPRReg(Context->MRI, Reg)) {
      for (MCRegUnit RU : regunits(Reg))
        SGPRs[RU].get(T) = Val;
    } else {
      llvm_unreachable("Register cannot be tracked/unknown register!");
    }
  }

  void setVMemScore(VMEMID TID, AMDGPU::InstCounterType T, unsigned Val) {
    VMem[TID].Scores[T] = Val;
  }

  void setScoreByOperand(const MachineOperand &Op,
                         AMDGPU::InstCounterType CntTy, unsigned Val);

  const SIInsertWaitcnts *Context;

  unsigned ScoreLBs[AMDGPU::NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[AMDGPU::NUM_INST_CNTS] = {0};
  WaitEventSet PendingEvents;
  // Remember the last flat memory operation.
  unsigned LastFlatDsCnt = 0;
  unsigned LastFlatLoadCnt = 0;
  // Remember the last GDS operation.
  unsigned LastGDS = 0;

  // The score tracking logic is fragmented as follows:
  // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
  // - SGPRs: SGPR RegUnits.
  // - SCC: non-allocatable and not general purpose, so not an SGPR.
  //
  // For the VMem case, if the key is within the range of LDS DMA IDs,
  // then the corresponding index into the `LDSDMAStores` vector below is:
  //   Key - LDSDMA_BEGIN - 1
  // This is because LDSDMA_BEGIN is a generic entry and does not have an
  // associated MachineInstr.
  //
  // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?

  struct VMEMInfo {
    // Scores for all instruction counters. Zero-initialized.
    CounterValueArray Scores{};
    // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
    unsigned VMEMTypes = 0;

    bool empty() const { return all_of(Scores, equal_to(0)) && !VMEMTypes; }
  };

  /// Waitcnt scores for every SGPR; only DS_CNT (corresponding to LGKM_CNT
  /// pre-gfx12), KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
  class SGPRInfo {
    /// Either DS_CNT or KM_CNT score.
    unsigned ScoreDsKmCnt = 0;
    unsigned ScoreXCnt = 0;

  public:
    unsigned get(AMDGPU::InstCounterType T) const {
      assert(
          (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
          "Invalid counter");
      return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
    }
    unsigned &get(AMDGPU::InstCounterType T) {
      assert(
          (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
          "Invalid counter");
      return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
    }

    bool empty() const { return !ScoreDsKmCnt && !ScoreXCnt; }
  };

  DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
  DenseMap<MCRegUnit, SGPRInfo> SGPRs;

  // Reg score for SCC.
  unsigned SCCScore = 0;
  // The unique instruction that has an SCC write pending, if there is one.
  const MachineInstr *PendingSCCWrite = nullptr;

  // Store representative LDS DMA operations. The only useful info here is
  // alias info. One store is kept per unique AAInfo.
  SmallVector<const MachineInstr *> LDSDMAStores;

  // State of all counters at each async mark encountered so far.
  SmallVector<CounterValueArray> AsyncMarks;

  // In rare pathological cases, however, a nest of loops that pushes marks
  // without waiting on any mark can cause AsyncMarks to grow very large. We
  // cap it to a reasonable limit. We can tune this later or potentially
  // introduce a user option to control the value.
  static constexpr unsigned MaxAsyncMarks = 16;

  // Track the upper bound score for async operations that are not part of a
  // mark yet. Initialized to all zeros.
  CounterValueArray AsyncScore{};
};

SIInsertWaitcnts::BlockInfo::~BlockInfo() = default;

class SIInsertWaitcntsLegacy : public MachineFunctionPass {
public:
  static char ID;
  SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addUsedIfAvailable<AAResultsWrapperPass>();
    AU.addPreserved<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
                                        AMDGPU::InstCounterType CntTy,
                                        unsigned Score) {
  setRegScore(Op.getReg().asMCReg(), CntTy, Score);
}

// Return true if the subtarget is one that enables Point Sample Acceleration
// and the MachineInstr passed in is one to which it might be applied (the
// hardware makes this decision based on several factors, but we can't
// determine this at compile time, so we have to assume it might be applied
// if the instruction supports it).
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
  if (!Context->ST.hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
    return false;

  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
      AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
  return BaseInfo->PointSampleAccel;
}

// Return true if the subtarget enables Point Sample Acceleration, the supplied
// MachineInstr is one to which it might be applied, and the supplied register
// has outstanding writes of vmem-types different from VMEM_NOSAMPLER (this is
// the type that a point sample accelerated instruction effectively becomes).
bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
                                                     MCPhysReg Reg) const {
  if (!hasPointSampleAccel(MI))
    return false;

  return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
}

void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
  AMDGPU::InstCounterType T = Context->getCounterFromEvent(E);
  assert(T < Context->MaxCounter);

  unsigned UB = getScoreUB(T);
  unsigned Increment = 1;
    // V_WMMA_SCALE instructions use VOP3PX2 encoding. Hardware treats this as
    // two VOP3P instructions and increments VA_VDST twice.
    Increment = 2;
  }
  unsigned CurrScore = UB + Increment;
  if (CurrScore == 0)
    report_fatal_error("InsertWaitcnt score wraparound");
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not.
  // Examples include vm_cnt for buffer stores or lgkm_cnt for send-message.
  PendingEvents.insert(E);
  setScoreUB(T, CurrScore);

  const SIRegisterInfo &TRI = Context->TRI;
  const MachineRegisterInfo &MRI = Context->MRI;
  const SIInstrInfo &TII = Context->TII;
  if (T == AMDGPU::EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII.isDS(Inst) && Inst.mayLoadOrStore()) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (const auto *AddrOp = TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
        setScoreByOperand(*AddrOp, AMDGPU::EXP_CNT, CurrScore);

      if (Inst.mayStore()) {
        if (const auto *Data0 =
                TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
          setScoreByOperand(*Data0, AMDGPU::EXP_CNT, CurrScore);
        if (const auto *Data1 =
                TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
          setScoreByOperand(*Data1, AMDGPU::EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (const MachineOperand &Op : Inst.all_uses()) {
          if (TRI.isVectorRegister(MRI, Op.getReg()))
            setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
        }
      }
    } else if (TII.isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
                          AMDGPU::EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
                          AMDGPU::EXP_CNT, CurrScore);
      }
    } else if (TII.isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
                          AMDGPU::EXP_CNT, CurrScore);
      }
    } else if (TII.isMTBUF(Inst)) {
      if (Inst.mayStore())
        setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
    } else if (TII.isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
                          AMDGPU::EXP_CNT, CurrScore);
      }
    } else if (TII.isLDSDIR(Inst)) {
      // LDSDIR instructions attach the score to the destination.
      setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
                        AMDGPU::EXP_CNT, CurrScore);
    } else {
      if (TII.isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (MachineOperand &DefMO : Inst.all_defs()) {
          if (TRI.isVGPR(MRI, DefMO.getReg())) {
            setScoreByOperand(DefMO, AMDGPU::EXP_CNT, CurrScore);
          }
        }
      }
      for (const MachineOperand &Op : Inst.all_uses()) {
        if (TRI.isVectorRegister(MRI, Op.getReg()))
          setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
      }
    }
  } else if (T == AMDGPU::X_CNT) {
    WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
    if (PendingEvents.contains(OtherEvent)) {
      // Hardware inserts an implicit xcnt between interleaved
      // SMEM and VMEM operations. So there will never be
      // outstanding address translations for both SMEM and
      // VMEM at the same time.
      setScoreLB(T, getScoreUB(T) - 1);
      PendingEvents.remove(OtherEvent);
    }
    for (const MachineOperand &Op : Inst.all_uses())
      setScoreByOperand(Op, T, CurrScore);
  } else if (T == AMDGPU::VA_VDST || T == AMDGPU::VM_VSRC) {
    // Match the score to the VGPR destination or source registers as
    // appropriate.
    for (const MachineOperand &Op : Inst.operands()) {
      if (!Op.isReg() || (T == AMDGPU::VA_VDST && Op.isUse()) ||
          (T == AMDGPU::VM_VSRC && Op.isDef()))
        continue;
      if (TRI.isVectorRegister(Context->MRI, Op.getReg()))
        setScoreByOperand(Op, T, CurrScore);
    }
  } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
    // Match the score to the destination registers.
    //
    // Check only explicit operands. Stores, especially spill stores, include
    // implicit uses and defs of their super registers which would create an
    // artificial dependency, while these are there only for register liveness
    // accounting purposes.
    //
    // There are special cases where implicit register defs exist, such as M0
    // or VCC, but none occur with memory instructions.
    for (const MachineOperand &Op : Inst.defs()) {
      if (T == AMDGPU::LOAD_CNT || T == AMDGPU::SAMPLE_CNT ||
          T == AMDGPU::BVH_CNT) {
        if (!TRI.isVectorRegister(MRI, Op.getReg())) // TODO: add wrapper
          continue;
        if (updateVMCntOnly(Inst)) {
          // updateVMCntOnly should only leave us with VGPRs.
          // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have
          // VGPR/AGPR defs. That's required for a sane index into `VMEMTypes`
          // below.
          assert(TRI.isVectorRegister(MRI, Op.getReg()));
          VmemType V = getVmemType(Inst);
          unsigned char TypesMask = 1 << V;
          // If the instruction can have Point Sample Accel applied, we have
          // to flag this with another potential dependency.
          if (hasPointSampleAccel(Inst))
            TypesMask |= 1 << VMEM_NOSAMPLER;
          for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
            VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
        }
      }
      setScoreByOperand(Op, T, CurrScore);
    }
    if (Inst.mayStore() &&
        (TII.isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS
      // written can be accessed. A load from LDS to VMEM does not need a wait.
      //
      // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
      // there is a MachineInstr in LDSDMAStores used to track this LDS DMA
      // store. The "Slot" is the index into LDSDMAStores + 1.
      unsigned Slot = 0;
      for (const auto *MemOp : Inst.memoperands()) {
        if (!MemOp->isStore() ||
            MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        // Comparing just AA info does not guarantee memoperands are equal
        // in general, but this is so for LDS DMA in practice.
        auto AAI = MemOp->getAAInfo();
        // Alias scope information gives a way to definitely identify an
        // original memory object and is practically produced in the module
        // LDS lowering pass. If there is no scope available we will not be
        // able to disambiguate LDS aliasing, as after the module lowering all
        // LDS is squashed into a single big object.
        if (!AAI || !AAI.Scope)
          break;
        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
              Slot = I + 1;
              break;
            }
          }
        }
        if (Slot)
          break;
        // The slot may not be valid because it can be >= NUM_LDSDMA, which
        // means the scoreboard cannot track it. We still want to preserve the
        // MI in order to check alias information, though.
        LDSDMAStores.push_back(&Inst);
        Slot = LDSDMAStores.size();
        break;
      }
      setVMemScore(LDSDMA_BEGIN, T, CurrScore);
      if (Slot && Slot < NUM_LDSDMA)
        setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
    }

    if (Context->shouldUpdateAsyncMark(Inst, T)) {
      AsyncScore[T] = CurrScore;
    }

    if (E == SCC_WRITE) {
      setRegScore(AMDGPU::SCC, T, CurrScore);
      PendingSCCWrite = &Inst;
    }
  }
}
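
// For instance, the first LDS DMA store carrying usable alias-scope info is
// remembered in LDSDMAStores[0] and scored under VMEMID LDSDMA_BEGIN + 1,
// while the common entry LDSDMA_BEGIN is bumped for every LDS DMA write; a
// later access that matches the store's AAInfo only needs to wait for that
// one slot rather than for all LDS DMA traffic.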

void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
  // In the absence of loops, AsyncMarks can grow linearly with the program
  // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above
  // a limit every time we push a new mark, but that seems like unnecessary
  // work in practical cases. We do separately truncate the array when
  // processing a loop, which should be sufficient.
  AsyncMarks.push_back(AsyncScore);
  AsyncScore = {};
  LLVM_DEBUG({
    dbgs() << "recordAsyncMark:\n" << Inst;
    for (const auto &Mark : AsyncMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
  });
}

void WaitcntBrackets::print(raw_ostream &OS) const {
  const GCNSubtarget &ST = Context->ST;

  for (auto T : inst_counter_types(Context->MaxCounter)) {
    unsigned SR = getScoreRange(T);
    switch (T) {
    case AMDGPU::LOAD_CNT:
      OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
         << SR << "):";
      break;
    case AMDGPU::DS_CNT:
      OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
         << SR << "):";
      break;
    case AMDGPU::EXP_CNT:
      OS << " EXP_CNT(" << SR << "):";
      break;
    case AMDGPU::STORE_CNT:
      OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
         << SR << "):";
      break;
    case AMDGPU::SAMPLE_CNT:
      OS << " SAMPLE_CNT(" << SR << "):";
      break;
    case AMDGPU::BVH_CNT:
      OS << " BVH_CNT(" << SR << "):";
      break;
    case AMDGPU::KM_CNT:
      OS << " KM_CNT(" << SR << "):";
      break;
    case AMDGPU::X_CNT:
      OS << " X_CNT(" << SR << "):";
      break;
    case AMDGPU::ASYNC_CNT:
      OS << " ASYNC_CNT(" << SR << "):";
      break;
    case AMDGPU::VA_VDST:
      OS << " VA_VDST(" << SR << "): ";
      break;
    case AMDGPU::VM_VSRC:
      OS << " VM_VSRC(" << SR << "): ";
      break;
    default:
      OS << " UNKNOWN(" << SR << "):";
      break;
    }

    if (SR != 0) {
      // Print vgpr scores.
      unsigned LB = getScoreLB(T);

      SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
      sort(SortedVMEMIDs);

      for (auto ID : SortedVMEMIDs) {
        unsigned RegScore = VMem.at(ID).Scores[T];
        if (RegScore <= LB)
          continue;
        unsigned RelScore = RegScore - LB - 1;
        if (ID < REGUNITS_END) {
          OS << ' ' << RelScore << ":vRU" << ID;
        } else {
          assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
                 "Unhandled/unexpected ID value!");
          OS << ' ' << RelScore << ":LDSDMA" << ID;
        }
      }

      // Also need to print sgpr scores for lgkm_cnt or xcnt.
      if (isSmemCounter(T)) {
        SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
        sort(SortedSMEMIDs);
        for (auto ID : SortedSMEMIDs) {
          unsigned RegScore = SGPRs.at(ID).get(T);
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
        }
      }

      if (T == AMDGPU::KM_CNT && SCCScore > 0)
        OS << ' ' << SCCScore << ":scc";
    }
    OS << '\n';
  }

  OS << "Pending Events: ";
  if (hasPendingEvent()) {
    ListSeparator LS;
    for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
      if (hasPendingEvent((WaitEventType)I)) {
        OS << LS << WaitEventTypeName[I];
      }
    }
  } else {
    OS << "none";
  }
  OS << '\n';

  OS << "Async score: ";
  // Note: AsyncScore is a fixed-size std::array, so std::array::empty() would
  // always be false; test for the all-zero state instead.
  if (all_of(AsyncScore, [](unsigned S) { return S == 0; }))
    OS << "none";
  else
    llvm::interleaveComma(AsyncScore, OS);
  OS << '\n';

  OS << "Async marks: " << AsyncMarks.size() << '\n';

  for (const auto &Mark : AsyncMarks) {
    for (auto T : AMDGPU::inst_counter_types()) {
      unsigned MarkedScore = Mark[T];
      switch (T) {
      case AMDGPU::LOAD_CNT:
        OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM")
           << "_CNT: " << MarkedScore;
        break;
      case AMDGPU::DS_CNT:
        OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM")
           << "_CNT: " << MarkedScore;
        break;
      case AMDGPU::EXP_CNT:
        OS << " EXP_CNT: " << MarkedScore;
        break;
      case AMDGPU::STORE_CNT:
        OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS")
           << "_CNT: " << MarkedScore;
        break;
      case AMDGPU::SAMPLE_CNT:
        OS << " SAMPLE_CNT: " << MarkedScore;
        break;
      case AMDGPU::BVH_CNT:
        OS << " BVH_CNT: " << MarkedScore;
        break;
      case AMDGPU::KM_CNT:
        OS << " KM_CNT: " << MarkedScore;
        break;
      case AMDGPU::X_CNT:
        OS << " X_CNT: " << MarkedScore;
        break;
      case AMDGPU::ASYNC_CNT:
        OS << " ASYNC_CNT: " << MarkedScore;
        break;
      default:
        OS << " UNKNOWN: " << MarkedScore;
        break;
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

/// Simplify \p UpdateWait by removing waits that are redundant based on the
/// current WaitcntBrackets and any other waits specified in \p CheckWait.
void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
                                      AMDGPU::Waitcnt &UpdateWait) const {
  simplifyWaitcnt(UpdateWait, AMDGPU::LOAD_CNT);
  simplifyWaitcnt(UpdateWait, AMDGPU::EXP_CNT);
  simplifyWaitcnt(UpdateWait, AMDGPU::DS_CNT);
  simplifyWaitcnt(UpdateWait, AMDGPU::STORE_CNT);
  simplifyWaitcnt(UpdateWait, AMDGPU::SAMPLE_CNT);
  simplifyWaitcnt(UpdateWait, AMDGPU::BVH_CNT);
  simplifyWaitcnt(UpdateWait, AMDGPU::KM_CNT);
  simplifyXcnt(CheckWait, UpdateWait);
  simplifyWaitcnt(UpdateWait, AMDGPU::VA_VDST);
  simplifyVmVsrc(CheckWait, UpdateWait);
  simplifyWaitcnt(UpdateWait, AMDGPU::ASYNC_CNT);
}

void WaitcntBrackets::simplifyWaitcnt(AMDGPU::InstCounterType T,
                                      unsigned &Count) const {
  // The number of outstanding events for this type, T, can be calculated
  // as (UB - LB). If the current Count is greater than or equal to the number
  // of outstanding events, then the wait for this counter is redundant.
  if (Count >= getScoreRange(T))
    Count = ~0u;
}
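
// E.g. with three loads outstanding (getScoreRange(LOAD_CNT) == 3), a request
// to wait until at most loadcnt(3) is already satisfied by construction, so
// the count is set to ~0u ("no wait") and the instruction can be omitted.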

void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait,
                                      AMDGPU::InstCounterType T) const {
  unsigned Cnt = Wait.get(T);
  simplifyWaitcnt(T, Cnt);
  Wait.set(T, Cnt);
}

void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
                                   AMDGPU::Waitcnt &UpdateWait) const {
  // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
  // optimizations. On entry to a block with multiple predecessors, there may
  // be pending SMEM and VMEM events active at the same time.
  // In such cases, only clear one active event at a time.
  // TODO: Revisit xcnt optimizations for gfx1250.
  // A wait on XCNT is redundant if we are already waiting for a load to
  // complete. SMEM can return out of order, so only omit the XCNT wait if we
  // are waiting till zero.
  if (CheckWait.get(AMDGPU::KM_CNT) == 0 && hasPendingEvent(SMEM_GROUP))
    UpdateWait.set(AMDGPU::X_CNT, ~0u);
  // If we have a pending store we cannot optimize XCnt because we do not wait
  // for stores. VMEM loads return in order, so if we only have loads XCnt is
  // decremented to the same number as LOADCnt.
  if (CheckWait.get(AMDGPU::LOAD_CNT) != ~0u && hasPendingEvent(VMEM_GROUP) &&
      !hasPendingEvent(AMDGPU::STORE_CNT) &&
      CheckWait.get(AMDGPU::X_CNT) >= CheckWait.get(AMDGPU::LOAD_CNT))
    UpdateWait.set(AMDGPU::X_CNT, ~0u);
  simplifyWaitcnt(UpdateWait, AMDGPU::X_CNT);
}

void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
                                     AMDGPU::Waitcnt &UpdateWait) const {
  // Waiting for some counters implies waiting for VM_VSRC, since an
  // instruction that decrements a counter on completion would have
  // decremented VM_VSRC once its VGPR operands had been read.
  if (CheckWait.get(AMDGPU::VM_VSRC) >=
      std::min({CheckWait.get(AMDGPU::LOAD_CNT),
                CheckWait.get(AMDGPU::STORE_CNT),
                CheckWait.get(AMDGPU::SAMPLE_CNT),
                CheckWait.get(AMDGPU::BVH_CNT), CheckWait.get(AMDGPU::DS_CNT)}))
    UpdateWait.set(AMDGPU::VM_VSRC, ~0u);
  simplifyWaitcnt(UpdateWait, AMDGPU::VM_VSRC);
}

void WaitcntBrackets::purgeEmptyTrackingData() {
  for (auto &[K, V] : make_early_inc_range(VMem)) {
    if (V.empty())
      VMem.erase(K);
  }
  for (auto &[K, V] : make_early_inc_range(SGPRs)) {
    if (V.empty())
      SGPRs.erase(K);
  }
}
1571
1572void WaitcntBrackets::determineWaitForScore(AMDGPU::InstCounterType T,
1573 unsigned ScoreToWait,
1574 AMDGPU::Waitcnt &Wait) const {
1575 const unsigned LB = getScoreLB(T);
1576 const unsigned UB = getScoreUB(T);
1577
1578 // If the score falls within the bracket, we need a waitcnt.
1579 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1580 if ((T == AMDGPU::LOAD_CNT || T == AMDGPU::DS_CNT) && hasPendingFlat() &&
1581 !Context->ST.hasFlatLgkmVMemCountInOrder()) {
1582 // If there is a pending FLAT operation, and this is a VMem or LGKM
1583 // waitcnt and the target can report early completion, then we need
1584 // to force a waitcnt 0.
1585 addWait(Wait, T, 0);
1586 } else if (counterOutOfOrder(T)) {
1587 // The counter can be decremented out-of-order when there are multiple
1588 // event types in the bracket, so emit an s_wait with a conservative
1589 // value of 0 for the counter.
1590 addWait(Wait, T, 0);
1591 } else {
1592 // If a counter has been maxed out, avoid overflow by waiting for
1593 // MAX(CounterType) - 1 instead.
1594 unsigned NeededWait = std::min(
1595 UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
1596 addWait(Wait, T, NeededWait);
1597 }
1598 }
1599}
1600
1601AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
1602 LLVM_DEBUG({
1603 dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
1604 << ":\n";
1605 for (const auto &Mark : AsyncMarks) {
1606 llvm::interleaveComma(Mark, dbgs());
1607 dbgs() << '\n';
1608 }
1609 });
1610
1611 if (AsyncMarks.size() == MaxAsyncMarks) {
1612 // Enforcing MaxAsyncMarks here is unnecessary work because AsyncMarks
1613 // grows only linearly when traversing straight-line code. But we do
1614 // need to check whether truncation may have occurred at a merge, and
1615 // adjust N to ensure that a wait is generated.
1616 LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
1617 N = std::min(N, (unsigned)MaxAsyncMarks - 1);
1618 }
1619
1620 AMDGPU::Waitcnt Wait;
1621 if (AsyncMarks.size() <= N) {
1622 LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
1623 return Wait;
1624 }
1625
1626 size_t MarkIndex = AsyncMarks.size() - N - 1;
1627 const auto &RequiredMark = AsyncMarks[MarkIndex];
1628 for (auto T : inst_counter_types(Context->MaxCounter))
1629 determineWaitForScore(T, RequiredMark[T], Wait);
1630
1631 // Immediately remove the waited-on mark and all older ones. This happens
1632 // BEFORE the wait is actually inserted, which is fine because we have
1633 // already extracted the wait requirements.
1634 LLVM_DEBUG({
1635 dbgs() << "Removing " << (MarkIndex + 1)
1636 << " async marks after determining wait\n";
1637 });
1638 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1639
1640 LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
1641 return Wait;
1642}
1643
1644 // With D16Writes32BitVgpr, a D16 instruction might be clobbered by events
1645 // running on the other 16-bit half of the VGPR.
1646 //
1647 // Replace the VGPR16 with its VGPR32 for the wait check if:
1648 // 1. MI is a VALU, and there is a wait event on the other half, or
1649 // 2. MI is a LdSt, and there is a wait event on the other half from a
1650 // different order group.
1651 MCPhysReg WaitcntBrackets::determineVGPR16Dependency(const MachineInstr &MI,
1652 AMDGPU::InstCounterType T,
1653 MCPhysReg Reg) const {
1654 const TargetRegisterClass *RC = Context->TRI.getPhysRegBaseClass(Reg);
1655 unsigned Size = Context->TRI.getRegSizeInBits(*RC);
1656
1657 if (Size != 16 || !Context->ST.hasD16Writes32BitVgpr())
1658 return Reg;
1659
1660 // With D16Writes32BitVgpr, a D16 instruction might clobber the whole
1661 // 32-bit VGPR, so check for a dependency on the other half.
1662 Register Reg32 = Context->TRI.get32BitRegister(Reg);
1663 Register OtherHalf = Context->TRI.getSubReg(
1664 Reg32,
1665 AMDGPU::isHi16Reg(Reg, Context->TRI) ? AMDGPU::lo16 : AMDGPU::hi16);
1666
1667 AMDGPU::Waitcnt Wait;
1668 for (MCRegUnit RU : regunits(OtherHalf))
1669 determineWaitForScore(T, getVMemScore(toVMEMID(RU), T), Wait);
1670
1671 // No wait on the other half.
1672 if (!Wait.hasWait())
1673 return Reg;
1674
1675 if (Context->TII.isVALU(MI))
1676 return Reg32;
1677
1678 // LdSt case: widen if hi/lo16 halves have mixed event types.
1679 WaitEventSet MIEvents = Context->getEventsFor(MI);
1680 WaitEventSet OtherHalfEvents = Context->getWaitEvents(T);
1681 WaitEventSet Events = MIEvents & OtherHalfEvents;
1682 if (Events.twoOrMore())
1683 return Reg32;
1684 return Reg;
1685}
1686
1687void WaitcntBrackets::determineWaitForPhysReg(AMDGPU::InstCounterType T,
1688 MCPhysReg Reg,
1689 AMDGPU::Waitcnt &Wait,
1690 const MachineInstr &MI) const {
1691 if (Reg == AMDGPU::SCC) {
1692 determineWaitForScore(T, SCCScore, Wait);
1693 } else {
1694 bool IsVGPR = Context->TRI.isVectorRegister(Context->MRI, Reg);
1695 if (IsVGPR)
1696 Reg = determineVGPR16Dependency(MI, T, Reg);
1697 for (MCRegUnit RU : regunits(Reg))
1698 determineWaitForScore(
1699 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1700 Wait);
1701 }
1702}
1703
1704void WaitcntBrackets::determineWaitForLDSDMA(AMDGPU::InstCounterType T,
1705 VMEMID TID,
1706 AMDGPU::Waitcnt &Wait) const {
1707 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1708 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1709}
1710
1711void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1712 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1713 // SCC has landed
1714 if (PendingSCCWrite &&
1715 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1716 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1717 WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
1718 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1719 if ((PendingEvents & Context->getWaitEvents(AMDGPU::KM_CNT)) ==
1720 SCC_WRITE_PendingEvent) {
1721 setScoreLB(AMDGPU::KM_CNT, getScoreUB(AMDGPU::KM_CNT));
1722 }
1723
1724 PendingEvents.remove(SCC_WRITE_PendingEvent);
1725 PendingSCCWrite = nullptr;
1726 }
1727}
1728
1729 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1730 for (auto T : inst_counter_types(Context->MaxCounter))
1731 applyWaitcnt(Wait, T);
1732}
1733
1734void WaitcntBrackets::applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count) {
1735 const unsigned UB = getScoreUB(T);
1736 if (Count >= UB)
1737 return;
1738 if (Count != 0) {
1739 if (counterOutOfOrder(T))
1740 return;
1741 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1742 } else {
1743 setScoreLB(T, UB);
1744 PendingEvents.remove(Context->getWaitEvents(T));
1745 }
1746
1747 if (T == AMDGPU::KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1748 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1749 applyWaitcnt(AMDGPU::X_CNT, 0);
1750 else
1751 PendingEvents.remove(SMEM_GROUP);
1752 }
1753 if (T == AMDGPU::LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1754 !hasPendingEvent(AMDGPU::STORE_CNT)) {
1755 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1756 applyWaitcnt(AMDGPU::X_CNT, Count);
1757 else if (Count == 0)
1758 PendingEvents.remove(VMEM_GROUP);
1759 }
1760}
1761
1762 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait,
1763 AMDGPU::InstCounterType T) {
1764 unsigned Cnt = Wait.get(T);
1765 applyWaitcnt(T, Cnt);
1766}
1767
1768 // Where there are multiple event types in the bracket of a counter, the
1769 // decrements may arrive out of order.
1770 bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
1771 // A scalar memory read can always complete out of order.
1772 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1773 (T == AMDGPU::X_CNT && hasPendingEvent(SMEM_GROUP)))
1774 return true;
1775
1776 // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
1777 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
1778 // out-of-order completion.
1779 if (T == AMDGPU::LOAD_CNT) {
1780 WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
1781 // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
1782 // events
1783 Events.remove(GLOBAL_INV_ACCESS);
1784 // Return true only if there are still multiple event types after removing
1785 // GLOBAL_INV
1786 return Events.twoOrMore();
1787 }
1788
1789 return hasMixedPendingEvents(T);
1790}
1791
1792 INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1793 false, false)
1794 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1795 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
1796 INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1797 false, false)
1798
1799char SIInsertWaitcntsLegacy::ID = 0;
1800
1801char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1802
1803 FunctionPass *llvm::createSIInsertWaitcntsPass() {
1804 return new SIInsertWaitcntsLegacy();
1805}
1806
1807static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1808 unsigned NewEnc) {
1809 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1810 assert(OpIdx >= 0);
1811
1812 MachineOperand &MO = MI.getOperand(OpIdx);
1813
1814 if (NewEnc == MO.getImm())
1815 return false;
1816
1817 MO.setImm(NewEnc);
1818 return true;
1819}
1820
1821 /// Determine if \p Opcode is a gfx12+ single-counter S_WAIT_*CNT
1822 /// instruction, and if so, which counter it is waiting on.
1823static std::optional<AMDGPU::InstCounterType>
1824counterTypeForInstr(unsigned Opcode) {
1825 switch (Opcode) {
1826 case AMDGPU::S_WAIT_LOADCNT:
1827 return AMDGPU::LOAD_CNT;
1828 case AMDGPU::S_WAIT_EXPCNT:
1829 return AMDGPU::EXP_CNT;
1830 case AMDGPU::S_WAIT_STORECNT:
1831 return AMDGPU::STORE_CNT;
1832 case AMDGPU::S_WAIT_SAMPLECNT:
1833 return AMDGPU::SAMPLE_CNT;
1834 case AMDGPU::S_WAIT_BVHCNT:
1835 return AMDGPU::BVH_CNT;
1836 case AMDGPU::S_WAIT_DSCNT:
1837 return AMDGPU::DS_CNT;
1838 case AMDGPU::S_WAIT_KMCNT:
1839 return AMDGPU::KM_CNT;
1840 case AMDGPU::S_WAIT_XCNT:
1841 return AMDGPU::X_CNT;
1842 case AMDGPU::S_WAIT_ASYNCCNT:
1843 return AMDGPU::ASYNC_CNT;
1844 default:
1845 return {};
1846 }
1847}
1848
1849bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1850 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1851 if (Opcode == Waitcnt->getOpcode())
1852 return false;
1853
1854 Waitcnt->setDesc(TII.get(Opcode));
1855 return true;
1856}
1857
1858/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1859/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1860/// from \p Wait that were added by previous passes. Currently this pass
1861/// conservatively assumes that these preexisting waits are required for
1862/// correctness.
1863bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1864 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1865 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1866 assert(isNormalMode(MaxCounter));
1867
1868 bool Modified = false;
1869 MachineInstr *WaitcntInstr = nullptr;
1870 MachineInstr *WaitcntVsCntInstr = nullptr;
1871
1872 LLVM_DEBUG({
1873 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1874 if (It.isEnd())
1875 dbgs() << "end of block\n";
1876 else
1877 dbgs() << *It;
1878 });
1879
1880 for (auto &II :
1881 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1882 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1883 if (isNonWaitcntMetaInst(II)) {
1884 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1885 continue;
1886 }
1887
1888 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1889 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1890
1891 // Update required wait count. If this is a soft waitcnt (= it was added
1892 // by an earlier pass), it may be entirely removed.
1893 if (Opcode == AMDGPU::S_WAITCNT) {
1894 unsigned IEnc = II.getOperand(0).getImm();
1895 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1896 if (TrySimplify)
1897 ScoreBrackets.simplifyWaitcnt(OldWait);
1898 Wait = Wait.combined(OldWait);
1899
1900 // Merge consecutive waitcnt of the same type by erasing multiples.
1901 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1902 II.eraseFromParent();
1903 Modified = true;
1904 } else
1905 WaitcntInstr = &II;
1906 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1907 assert(ST.hasVMemToLDSLoad());
1908 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1909 << "Before: " << Wait << '\n';);
1910 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, LDSDMA_BEGIN,
1911 Wait);
1912 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1913
1914 // It is possible (but unlikely) that this is the only wait instruction,
1915 // in which case, we exit this loop without a WaitcntInstr to consume
1916 // `Wait`. But that works because `Wait` was passed in by reference, and
1917 // the caller eventually calls createNewWaitcnt on it. We test this
1918 // possibility in an artificial MIR test since such a situation cannot be
1919 // recreated by running the memory legalizer.
1920 II.eraseFromParent();
1921 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1922 unsigned N = II.getOperand(0).getImm();
1923 LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
1924 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1925 Wait = Wait.combined(OldWait);
1926 } else {
1927 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1928 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1929
1930 unsigned OldVSCnt =
1931 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1932 if (TrySimplify)
1933 ScoreBrackets.simplifyWaitcnt(AMDGPU::STORE_CNT, OldVSCnt);
1934 Wait.set(AMDGPU::STORE_CNT,
1935 std::min(Wait.get(AMDGPU::STORE_CNT), OldVSCnt));
1936
1937 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1938 II.eraseFromParent();
1939 Modified = true;
1940 } else
1941 WaitcntVsCntInstr = &II;
1942 }
1943 }
1944
1945 if (WaitcntInstr) {
1946 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1947 AMDGPU::encodeWaitcnt(IV, Wait));
1948 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1949
1950 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::LOAD_CNT);
1951 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
1952 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
1953 Wait.set(AMDGPU::LOAD_CNT, ~0u);
1954 Wait.set(AMDGPU::EXP_CNT, ~0u);
1955 Wait.set(AMDGPU::DS_CNT, ~0u);
1956
1957 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1958 << "New Instr at block end: "
1959 << *WaitcntInstr << '\n'
1960 : dbgs() << "applied pre-existing waitcnt\n"
1961 << "Old Instr: " << *It
1962 << "New Instr: " << *WaitcntInstr << '\n');
1963 }
1964
1965 if (WaitcntVsCntInstr) {
1966 Modified |=
1967 updateOperandIfDifferent(*WaitcntVsCntInstr, AMDGPU::OpName::simm16,
1968 Wait.get(AMDGPU::STORE_CNT));
1969 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1970
1971 ScoreBrackets.applyWaitcnt(AMDGPU::STORE_CNT, Wait.get(AMDGPU::STORE_CNT));
1972 Wait.set(AMDGPU::STORE_CNT, ~0u);
1973
1974 LLVM_DEBUG(It.isEnd()
1975 ? dbgs() << "applied pre-existing waitcnt\n"
1976 << "New Instr at block end: " << *WaitcntVsCntInstr
1977 << '\n'
1978 : dbgs() << "applied pre-existing waitcnt\n"
1979 << "Old Instr: " << *It
1980 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1981 }
1982
1983 return Modified;
1984}
1985
1986/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1987/// required counters in \p Wait
1988bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1989 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1990 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
1991 assert(isNormalMode(MaxCounter));
1992
1993 bool Modified = false;
1994 const DebugLoc &DL = Block.findDebugLoc(It);
1995
1996 // Helper to emit expanded waitcnt sequence for profiling.
1997 // Emits waitcnts from (Outstanding-1) down to Target.
1998 // The EmitWaitcnt callback emits a single waitcnt.
1999 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
2000 auto EmitWaitcnt) {
2001 do {
2002 EmitWaitcnt(--Outstanding);
2003 } while (Outstanding > Target);
2004 Modified = true;
2005 };
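// Expansion example (illustration only): with three outstanding events and
// a target of zero, the helper above emits s_waitcnt <cnt>(2), <cnt>(1),
// <cnt>(0) instead of a single s_waitcnt <cnt>(0), so a profiler can
// attribute stall time to each in-flight event individually.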
2006
2007 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
2008 // single instruction while VScnt has its own instruction.
2009 if (Wait.hasWaitExceptStoreCnt()) {
2010 // If profiling expansion is enabled, emit an expanded sequence
2011 if (ExpandWaitcntProfiling) {
2012 // Check if any of the counters to be waited on are out-of-order.
2013 // If so, fall back to normal (non-expanded) behavior since expansion
2014 // would provide misleading profiling information.
2015 bool AnyOutOfOrder = false;
2016 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
2017 unsigned WaitCnt = Wait.get(CT);
2018 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
2019 AnyOutOfOrder = true;
2020 break;
2021 }
2022 }
2023
2024 if (AnyOutOfOrder) {
2025 // Fall back to non-expanded wait
2026 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
2027 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
2028 Modified = true;
2029 } else {
2030 // All counters are in-order, safe to expand
2031 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
2032 unsigned WaitCnt = Wait.get(CT);
2033 if (WaitCnt == ~0u)
2034 continue;
2035
2036 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2037 getWaitCountMax(getLimits(), CT) - 1);
2038 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
2039 AMDGPU::Waitcnt W;
2040 W.set(CT, Count);
2041 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT))
2042 .addImm(AMDGPU::encodeWaitcnt(IV, W));
2043 });
2044 }
2045 }
2046 } else {
2047 // Normal behavior: emit single combined waitcnt
2048 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
2049 [[maybe_unused]] auto SWaitInst =
2050 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
2051 Modified = true;
2052
2053 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
2054 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2055 dbgs() << "New Instr: " << *SWaitInst << '\n');
2056 }
2057 }
2058
2059 if (Wait.hasWaitStoreCnt()) {
2060 assert(ST.hasVscnt());
2061
2062 if (ExpandWaitcntProfiling && Wait.get(AMDGPU::STORE_CNT) != ~0u &&
2063 !ScoreBrackets.counterOutOfOrder(AMDGPU::STORE_CNT)) {
2064 // Only expand if counter is not out-of-order
2065 unsigned Outstanding =
2066 std::min(ScoreBrackets.getOutstanding(AMDGPU::STORE_CNT),
2067 getWaitCountMax(getLimits(), AMDGPU::STORE_CNT) - 1);
2068 EmitExpandedWaitcnt(
2069 Outstanding, Wait.get(AMDGPU::STORE_CNT), [&](unsigned Count) {
2070 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
2071 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2072 .addImm(Count);
2073 });
2074 } else {
2075 [[maybe_unused]] auto SWaitInst =
2076 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
2077 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2078 .addImm(Wait.get(AMDGPU::STORE_CNT));
2079 Modified = true;
2080
2081 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
2082 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2083 dbgs() << "New Instr: " << *SWaitInst << '\n');
2084 }
2085 }
2086
2087 return Modified;
2088}
2089
2090AMDGPU::Waitcnt
2091WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
2092 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u);
2093}
2094
2095AMDGPU::Waitcnt
2096WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
2097 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
2098 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
2099 ~0u /* XCNT */, ~0u /* ASYNC_CNT */, ExpertVal,
2100 ExpertVal);
2101}
2102
2103/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
2104/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
2105/// were added by previous passes. Currently this pass conservatively
2106/// assumes that these preexisting waits are required for correctness.
2107bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
2108 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
2109 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
2110 assert(!isNormalMode(MaxCounter));
2111
2112 bool Modified = false;
2113 MachineInstr *CombinedLoadDsCntInstr = nullptr;
2114 MachineInstr *CombinedStoreDsCntInstr = nullptr;
2115 MachineInstr *WaitcntDepctrInstr = nullptr;
2116 MachineInstr *WaitInstrs[AMDGPU::NUM_EXTENDED_INST_CNTS] = {};
2117
2118 LLVM_DEBUG({
2119 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
2120 if (It.isEnd())
2121 dbgs() << "end of block\n";
2122 else
2123 dbgs() << *It;
2124 });
2125
2126 // Accumulate waits that should not be simplified.
2127 AMDGPU::Waitcnt RequiredWait;
2128
2129 for (auto &II :
2130 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
2131 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
2132 if (isNonWaitcntMetaInst(II)) {
2133 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
2134 continue;
2135 }
2136
2137 // Update required wait count. If this is a soft waitcnt (= it was added
2138 // by an earlier pass), it may be entirely removed.
2139
2140 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
2141 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
2142
2143 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
2144 // attempt to do more than that either.
2145 if (Opcode == AMDGPU::S_WAITCNT)
2146 continue;
2147
2148 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
2149 unsigned OldEnc =
2150 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2151 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
2152 if (TrySimplify)
2153 Wait = Wait.combined(OldWait);
2154 else
2155 RequiredWait = RequiredWait.combined(OldWait);
2156 // Keep the first wait_loadcnt, erase the rest.
2157 if (CombinedLoadDsCntInstr == nullptr) {
2158 CombinedLoadDsCntInstr = &II;
2159 } else {
2160 II.eraseFromParent();
2161 Modified = true;
2162 }
2163 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
2164 unsigned OldEnc =
2165 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2166 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
2167 if (TrySimplify)
2168 Wait = Wait.combined(OldWait);
2169 else
2170 RequiredWait = RequiredWait.combined(OldWait);
2171 // Keep the first wait_storecnt, erase the rest.
2172 if (CombinedStoreDsCntInstr == nullptr) {
2173 CombinedStoreDsCntInstr = &II;
2174 } else {
2175 II.eraseFromParent();
2176 Modified = true;
2177 }
2178 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
2179 unsigned OldEnc =
2180 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2181 AMDGPU::Waitcnt OldWait;
2182 OldWait.set(AMDGPU::VA_VDST, AMDGPU::DepCtr::decodeFieldVaVdst(OldEnc));
2183 OldWait.set(AMDGPU::VM_VSRC, AMDGPU::DepCtr::decodeFieldVmVsrc(OldEnc));
2184 if (TrySimplify)
2185 ScoreBrackets.simplifyWaitcnt(OldWait);
2186 Wait = Wait.combined(OldWait);
2187 if (WaitcntDepctrInstr == nullptr) {
2188 WaitcntDepctrInstr = &II;
2189 } else {
2190 // S_WAITCNT_DEPCTR requires special care. Don't remove a
2191 // duplicate if it is waiting on things other than VA_VDST or
2192 // VM_VSRC. If that is the case, just make sure the VA_VDST and
2193 // VM_VSRC subfields of the operand are set to the "no wait"
2194 // values.
2195
2196 unsigned Enc =
2197 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2198 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
2199 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
2200
2201 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2202 Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
2203 Modified |= promoteSoftWaitCnt(&II);
2204 } else {
2205 II.eraseFromParent();
2206 Modified = true;
2207 }
2208 }
2209 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
2210 // Architectures higher than GFX10 do not have direct loads to
2211 // LDS, so no work is required here yet.
2212 II.eraseFromParent();
2213 Modified = true;
2214 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
2215 // Update the Waitcnt, but don't erase the wait.asyncmark() itself. It
2216 // shows up in the assembly as a comment with the original parameter N.
2217 unsigned N = II.getOperand(0).getImm();
2218 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
2219 Wait = Wait.combined(OldWait);
2220 } else {
2221 std::optional<AMDGPU::InstCounterType> CT = counterTypeForInstr(Opcode);
2222 assert(CT.has_value());
2223 unsigned OldCnt =
2224 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2225 if (TrySimplify)
2226 addWait(Wait, CT.value(), OldCnt);
2227 else
2228 addWait(RequiredWait, CT.value(), OldCnt);
2229 // Keep the first wait of its kind, erase the rest.
2230 if (WaitInstrs[CT.value()] == nullptr) {
2231 WaitInstrs[CT.value()] = &II;
2232 } else {
2233 II.eraseFromParent();
2234 Modified = true;
2235 }
2236 }
2237 }
2238
2239 ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
2240 Wait = Wait.combined(RequiredWait);
2241
2242 if (CombinedLoadDsCntInstr) {
2243 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
2244 // to be waited for. Otherwise, let the instruction be deleted so
2245 // the appropriate single counter wait instruction can be inserted
2246 // instead, when new S_WAIT_*CNT instructions are inserted by
2247 // createNewWaitcnt(). As a side effect, resetting the wait counts will
2248 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
2249 // the loop below that deals with single counter instructions.
2250 //
2251 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
2252 // instructions that have decremented LOAD_CNT or DS_CNT on completion
2253 // will have needed to wait for their register sources to be available
2254 // first.
2255 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
2256 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2257 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
2258 AMDGPU::OpName::simm16, NewEnc);
2259 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
2260 ScoreBrackets.applyWaitcnt(AMDGPU::LOAD_CNT, Wait.get(AMDGPU::LOAD_CNT));
2261 ScoreBrackets.applyWaitcnt(AMDGPU::DS_CNT, Wait.get(AMDGPU::DS_CNT));
2262 Wait.set(AMDGPU::LOAD_CNT, ~0u);
2263 Wait.set(AMDGPU::DS_CNT, ~0u);
2264
2265 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2266 << "New Instr at block end: "
2267 << *CombinedLoadDsCntInstr << '\n'
2268 : dbgs() << "applied pre-existing waitcnt\n"
2269 << "Old Instr: " << *It << "New Instr: "
2270 << *CombinedLoadDsCntInstr << '\n');
2271 } else {
2272 CombinedLoadDsCntInstr->eraseFromParent();
2273 Modified = true;
2274 }
2275 }
2276
2277 if (CombinedStoreDsCntInstr) {
2278 // Similarly for S_WAIT_STORECNT_DSCNT.
2279 if (Wait.get(AMDGPU::STORE_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
2280 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2281 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
2282 AMDGPU::OpName::simm16, NewEnc);
2283 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2284 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::STORE_CNT);
2285 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
2286 Wait.set(AMDGPU::STORE_CNT, ~0u);
2287 Wait.set(AMDGPU::DS_CNT, ~0u);
2288
2289 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2290 << "New Instr at block end: "
2291 << *CombinedStoreDsCntInstr << '\n'
2292 : dbgs() << "applied pre-existing waitcnt\n"
2293 << "Old Instr: " << *It << "New Instr: "
2294 << *CombinedStoreDsCntInstr << '\n');
2295 } else {
2296 CombinedStoreDsCntInstr->eraseFromParent();
2297 Modified = true;
2298 }
2299 }
2300
2301 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
2302 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
2303 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
2304 // instructions so that createNewWaitcnt() will create new combined
2305 // instructions to replace them.
2306
2307 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2308 // This is a vector of addresses in WaitInstrs pointing to instructions
2309 // that should be removed if they are present.
2310 SmallVector<MachineInstr **, 2> WaitsToErase;
2311
2312 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
2313 // both) need to be waited for, ensure that there are no existing
2314 // individual wait count instructions for these.
2315
2316 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2317 WaitsToErase.push_back(&WaitInstrs[AMDGPU::LOAD_CNT]);
2318 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2319 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2320 WaitsToErase.push_back(&WaitInstrs[AMDGPU::STORE_CNT]);
2321 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2322 }
2323
2324 for (MachineInstr **WI : WaitsToErase) {
2325 if (!*WI)
2326 continue;
2327
2328 (*WI)->eraseFromParent();
2329 *WI = nullptr;
2330 Modified = true;
2331 }
2332 }
2333
2334 for (auto CT : inst_counter_types(AMDGPU::NUM_EXTENDED_INST_CNTS)) {
2335 if (!WaitInstrs[CT])
2336 continue;
2337
2338 unsigned NewCnt = Wait.get(CT);
2339 if (NewCnt != ~0u) {
2340 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
2341 AMDGPU::OpName::simm16, NewCnt);
2342 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2343
2344 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2345 setNoWait(Wait, CT);
2346
2347 LLVM_DEBUG(It.isEnd()
2348 ? dbgs() << "applied pre-existing waitcnt\n"
2349 << "New Instr at block end: " << *WaitInstrs[CT]
2350 << '\n'
2351 : dbgs() << "applied pre-existing waitcnt\n"
2352 << "Old Instr: " << *It
2353 << "New Instr: " << *WaitInstrs[CT] << '\n');
2354 } else {
2355 WaitInstrs[CT]->eraseFromParent();
2356 Modified = true;
2357 }
2358 }
2359
2360 if (WaitcntDepctrInstr) {
2361 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2362 // subfields with the new required values.
2363 unsigned Enc =
2364 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2365 ->getImm();
2366 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.get(AMDGPU::VA_VDST));
2367 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, Wait.get(AMDGPU::VM_VSRC));
2368
2369 ScoreBrackets.applyWaitcnt(AMDGPU::VA_VDST, Wait.get(AMDGPU::VA_VDST));
2370 ScoreBrackets.applyWaitcnt(AMDGPU::VM_VSRC, Wait.get(AMDGPU::VM_VSRC));
2371 Wait.set(AMDGPU::VA_VDST, ~0u);
2372 Wait.set(AMDGPU::VM_VSRC, ~0u);
2373
2374 // If that new encoded Depctr immediate would actually still wait
2375 // for anything, update the instruction's operand. Otherwise it can
2376 // just be deleted.
2377 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2378 Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
2379 AMDGPU::OpName::simm16, Enc);
2380 LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2381 << "New Instr at block end: "
2382 << *WaitcntDepctrInstr << '\n'
2383 : dbgs() << "applyPreexistingWaitcnt\n"
2384 << "Old Instr: " << *It << "New Instr: "
2385 << *WaitcntDepctrInstr << '\n');
2386 } else {
2387 WaitcntDepctrInstr->eraseFromParent();
2388 Modified = true;
2389 }
2390 }
2391
2392 return Modified;
2393}
2394
2395/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
2396bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2397 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2398 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
2399 assert(!isNormalMode(MaxCounter));
2400
2401 bool Modified = false;
2402 const DebugLoc &DL = Block.findDebugLoc(It);
2403
2404 // Helper to emit expanded waitcnt sequence for profiling.
2405 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
2406 auto EmitWaitcnt) {
2407 for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I)
2408 EmitWaitcnt(I);
2409 EmitWaitcnt(Target);
2410 Modified = true;
2411 };
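// Unlike the pre-GFX12 do/while helper, this variant always finishes by
// emitting the target wait itself: Outstanding = 3, Target = 1 produces
// waits of 2 and then 1 (illustration of the loop above).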
2412
2413 // For GFX12+, we use separate wait instructions, which makes expansion
2414 // simpler.
2415 if (ExpandWaitcntProfiling) {
2416 for (auto CT : inst_counter_types(AMDGPU::NUM_EXTENDED_INST_CNTS)) {
2417 unsigned Count = Wait.get(CT);
2418 if (Count == ~0u)
2419 continue;
2420
2421 // Skip expansion for out-of-order counters - emit normal wait instead
2422 if (ScoreBrackets.counterOutOfOrder(CT)) {
2423 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2424 .addImm(Count);
2425 Modified = true;
2426 continue;
2427 }
2428
2429 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2430 getWaitCountMax(getLimits(), CT) - 1);
2431 EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
2432 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2433 .addImm(Val);
2434 });
2435 }
2436 return Modified;
2437 }
2438
2439 // Normal behavior (no expansion)
2440 // Check for opportunities to use combined wait instructions.
2441 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2442 MachineInstr *SWaitInst = nullptr;
2443
2444 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2445 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2446
2447 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2448 .addImm(Enc);
2449
2450 Wait.set(AMDGPU::LOAD_CNT, ~0u);
2451 Wait.set(AMDGPU::DS_CNT, ~0u);
2452 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2453 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2454
2455 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_STORECNT_DSCNT))
2456 .addImm(Enc);
2457
2458 Wait.set(AMDGPU::STORE_CNT, ~0u);
2459 Wait.set(AMDGPU::DS_CNT, ~0u);
2460 }
2461
2462 if (SWaitInst) {
2463 Modified = true;
2464
2465 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2466 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2467 dbgs() << "New Instr: " << *SWaitInst << '\n');
2468 }
2469 }
2470
2471 // Generate an instruction for any remaining counter that needs
2472 // waiting for.
2473
2474 for (auto CT : inst_counter_types(AMDGPU::NUM_EXTENDED_INST_CNTS)) {
2475 unsigned Count = Wait.get(CT);
2476 if (Count == ~0u)
2477 continue;
2478
2479 [[maybe_unused]] auto SWaitInst =
2480 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2481 .addImm(Count);
2482
2483 Modified = true;
2484
2485 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2486 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2487 dbgs() << "New Instr: " << *SWaitInst << '\n');
2488 }
2489
2490 if (Wait.hasWaitDepctr()) {
2491 assert(IsExpertMode);
2492 unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(
2493 AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.get(AMDGPU::VM_VSRC)),
2494 Wait.get(AMDGPU::VA_VDST));
2495
2496 [[maybe_unused]] auto SWaitInst =
2497 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
2498
2499 Modified = true;
2500
2501 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2502 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2503 dbgs() << "New Instr: " << *SWaitInst << '\n');
2504 }
2505
2506 return Modified;
2507}
2508
2509 /// Generate an s_waitcnt instruction to be placed before \p MI.
2510/// Instructions of a given type are returned in order,
2511/// but instructions of different types can complete out of order.
2512/// We rely on this in-order completion
2513/// and simply assign a score to the memory access instructions.
2514/// We keep track of the active "score bracket" to determine
2515 /// if an access to the result of a memory read requires an s_waitcnt
2516 /// and, if so, what the value of each counter is.
2517 /// The "score bracket" is bounded by the lower-bound and upper-bound
2518 /// scores (*_score_LB and *_score_ub respectively).
2519/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2520/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2521/// (GFX12+ only, where DS_CNT is a separate counter).
2522bool SIInsertWaitcnts::generateWaitcntInstBefore(
2523 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2524 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2525 LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
2526 setForceEmitWaitcnt();
2527
2528 assert(!isNonWaitcntMetaInst(MI));
2529
2530 AMDGPU::Waitcnt Wait;
2531 const unsigned Opc = MI.getOpcode();
2532
2533 switch (Opc) {
2534 case AMDGPU::BUFFER_WBINVL1:
2535 case AMDGPU::BUFFER_WBINVL1_SC:
2536 case AMDGPU::BUFFER_WBINVL1_VOL:
2537 case AMDGPU::BUFFER_GL0_INV:
2538 case AMDGPU::BUFFER_GL1_INV: {
2539 // FIXME: This should have already been handled by the memory legalizer.
2540 // Removing this currently doesn't affect any lit tests, but we need to
2541 // verify that nothing was relying on this. The number of buffer invalidates
2542 // being handled here should not be expanded.
2543 Wait.set(AMDGPU::LOAD_CNT, 0);
2544 break;
2545 }
2546 case AMDGPU::SI_RETURN_TO_EPILOG:
2547 case AMDGPU::SI_RETURN:
2548 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2549 case AMDGPU::S_SETPC_B64_return: {
2550 // All waits must be resolved at call return.
2551 // NOTE: this could be improved with knowledge of all call sites or
2552 // with knowledge of the called routines.
2553 ReturnInsts.insert(&MI);
2554 AMDGPU::Waitcnt AllZeroWait =
2555 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2556 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2557 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2558 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2559 // no need to wait for it at function boundaries.
2560 if (ST.hasExtendedWaitCounts() &&
2561 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2562 AllZeroWait.set(AMDGPU::LOAD_CNT, ~0u);
2563 Wait = AllZeroWait;
2564 break;
2565 }
2566 case AMDGPU::S_ENDPGM:
2567 case AMDGPU::S_ENDPGM_SAVED: {
2568 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2569 // Technically the hardware will do this on its own if we don't, but that
2570 // might cost extra cycles compared to doing it explicitly.
2571 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2572 // have to wait for outstanding VMEM stores. In this case it can be useful
2573 // to send a message to explicitly release all VGPRs before the stores have
2574 // completed, but it is only safe to do this if there are no outstanding
2575 // scratch stores.
2576 EndPgmInsts[&MI] = !ScoreBrackets.empty(AMDGPU::STORE_CNT) &&
2577 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
2578 break;
2579 }
2580 case AMDGPU::S_SENDMSG:
2581 case AMDGPU::S_SENDMSGHALT: {
2582 if (ST.hasLegacyGeometry() &&
2583 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2584 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
2585 // Resolve vm waits before gs-done.
2586 Wait.set(AMDGPU::LOAD_CNT, 0);
2587 break;
2588 }
2589 [[fallthrough]];
2590 }
2591 default: {
2592
2593 // Export & GDS instructions do not read the EXEC mask until after the
2594 // export is granted (which can occur well after the instruction is issued).
2595 // The shader program must flush all EXP operations on the export-count
2596 // before overwriting the EXEC mask.
2597 if (MI.modifiesRegister(AMDGPU::EXEC, &TRI)) {
2598 // Export and GDS are tracked individually, either may trigger a waitcnt
2599 // for EXEC.
2600 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2601 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2602 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2603 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2604 Wait.set(AMDGPU::EXP_CNT, 0);
2605 }
2606 }
2607
2608 // Wait for any pending GDS instruction to complete before any
2609 // "Always GDS" instruction.
2610 if (TII.isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2611 addWait(Wait, AMDGPU::DS_CNT, ScoreBrackets.getPendingGDSWait());
2612
2613 if (MI.isCall()) {
2614 // The function is going to insert a wait on everything in its prolog.
2615 // This still needs to be careful if the call target is a load (e.g. a GOT
2616 // load). We also need to check WAW dependency with saved PC.
2617 CallInsts.insert(&MI);
2618 Wait = AMDGPU::Waitcnt();
2619
2620 const MachineOperand &CallAddrOp = TII.getCalleeOperand(MI);
2621 if (CallAddrOp.isReg()) {
2622 ScoreBrackets.determineWaitForPhysReg(
2623 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait, MI);
2624
2625 if (const auto *RtnAddrOp =
2626 TII.getNamedOperand(MI, AMDGPU::OpName::dst)) {
2627 ScoreBrackets.determineWaitForPhysReg(
2628 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait, MI);
2629 }
2630 }
2631 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2632 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2633 } else {
2634 // FIXME: Should not be relying on memoperands.
2635 // Look at the source operands of every instruction to see if
2636 // any of them results from a previous memory operation that affects
2637 // its current usage. If so, an s_waitcnt instruction needs to be
2638 // emitted.
2639 // If the source operand was defined by a load, add the s_waitcnt
2640 // instruction.
2641 //
2642 // Two cases are handled for destination operands:
2643 // 1) If the destination operand was defined by a load, add the s_waitcnt
2644 // instruction to guarantee the right WAW order.
2645 // 2) If a destination operand was used by a recent export/store
2646 // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
2647
2648 for (const MachineMemOperand *Memop : MI.memoperands()) {
2649 const Value *Ptr = Memop->getValue();
2650 if (Memop->isStore()) {
2651 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2652 addWait(Wait, SmemAccessCounter, 0);
2653 if (PDT.dominates(MI.getParent(), It->second))
2654 SLoadAddresses.erase(It);
2655 }
2656 }
2657 unsigned AS = Memop->getAddrSpace();
2658 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
2659 continue;
2660 // No need to wait before load from VMEM to LDS.
2661 if (TII.mayWriteLDSThroughDMA(MI))
2662 continue;
2663
2664 // LOAD_CNT is only relevant to vgpr or LDS.
2665 unsigned TID = LDSDMA_BEGIN;
2666 if (Ptr && Memop->getAAInfo()) {
2667 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2668 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2669 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2670 if ((I + 1) >= NUM_LDSDMA) {
2671 // We didn't have enough slots to track this LDS DMA store; it
2672 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2673 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID,
2674 Wait);
2675 break;
2676 }
2677
2678 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT,
2679 TID + I + 1, Wait);
2680 }
2681 }
2682 } else {
2683 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID, Wait);
2684 }
2685 if (Memop->isStore()) {
2686 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::EXP_CNT, TID, Wait);
2687 }
2688 }
2689
2690 // Loop over use and def operands.
2691 for (const MachineOperand &Op : MI.operands()) {
2692 if (!Op.isReg())
2693 continue;
2694
2695 // If the instruction does not read tied source, skip the operand.
2696 if (Op.isTied() && Op.isUse() && TII.doesNotReadTiedSource(MI))
2697 continue;
2698
2699 MCPhysReg Reg = Op.getReg().asMCReg();
2700
2701 const bool IsVGPR = TRI.isVectorRegister(MRI, Op.getReg());
2702 if (IsVGPR) {
2703 // Implicit VGPR defs and uses are never a part of the memory
2704 // instructions description and usually present to account for
2705 // super-register liveness.
2706 // TODO: Most of the other instructions also have implicit uses
2707 // for the liveness accounting only.
2708 if (Op.isImplicit() && MI.mayLoadOrStore())
2709 continue;
2710
2711 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VA_VDST, Reg, Wait, MI);
2712 if (Op.isDef())
2713 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VM_VSRC, Reg, Wait,
2714 MI);
2715 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2716 // previous write and this write are the same type of VMEM
2717 // instruction, in which case they are (in some architectures)
2718 // guaranteed to write their results in order anyway.
2719 // Additionally check instructions where Point Sample Acceleration
2720 // might be applied.
2721 if (Op.isUse() || !updateVMCntOnly(MI) ||
2722 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2723 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2724 !ST.hasVmemWriteVgprInOrder()) {
2725 ScoreBrackets.determineWaitForPhysReg(AMDGPU::LOAD_CNT, Reg, Wait,
2726 MI);
2727 ScoreBrackets.determineWaitForPhysReg(AMDGPU::SAMPLE_CNT, Reg, Wait,
2728 MI);
2729 ScoreBrackets.determineWaitForPhysReg(AMDGPU::BVH_CNT, Reg, Wait,
2730 MI);
2731 ScoreBrackets.clearVgprVmemTypes(Reg);
2732 }
2733
2734 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2735 ScoreBrackets.determineWaitForPhysReg(AMDGPU::EXP_CNT, Reg, Wait,
2736 MI);
2737 }
2738 ScoreBrackets.determineWaitForPhysReg(AMDGPU::DS_CNT, Reg, Wait, MI);
2739 } else if (Op.getReg() == AMDGPU::SCC) {
2740 ScoreBrackets.determineWaitForPhysReg(AMDGPU::KM_CNT, Reg, Wait, MI);
2741 } else {
2742 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait,
2743 MI);
2744 }
2745
2746 if (ST.hasWaitXcnt() && Op.isDef())
2747 ScoreBrackets.determineWaitForPhysReg(AMDGPU::X_CNT, Reg, Wait, MI);
2748 }
2749 }
2750 }
2751 }
2752
2753 // Ensure safety against exceptions from outstanding memory operations while
2754 // waiting for a barrier:
2755 //
2756 // * Some subtargets safely handle backing off the barrier in hardware
2757 // when an exception occurs.
2758 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2759 // there can be no outstanding memory operations during the wait.
2760 // * Subtargets with split barriers don't need to back off the barrier; it
2761 // is up to the trap handler to preserve the user barrier state correctly.
2762 //
2763 // In all other cases, ensure safety by ensuring that there are no outstanding
2764 // memory operations.
2765 if (Opc == AMDGPU::S_BARRIER && !ST.hasAutoWaitcntBeforeBarrier() &&
2766 !ST.hasBackOffBarrier()) {
2767 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2768 }
2769
2770 // TODO: Remove this work-around, enable the assert for Bug 457939
2771 // after fixing the scheduler. Also, the Shader Compiler code is
2772 // independent of target.
2773 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST.hasReadVCCZBug() &&
2774 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2775 Wait.set(AMDGPU::DS_CNT, 0);
2776 }
2777
2778 // Verify that the wait is actually needed.
2779 ScoreBrackets.simplifyWaitcnt(Wait);
2780
2781 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2782 // waits on VA_VDST if the instruction it would precede is not a VALU
2783 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2784 // expert scheduling mode.
2785 if (TII.isVALU(MI))
2786 Wait.set(AMDGPU::VA_VDST, ~0u);
2787
2788 // Since the translation of VMEM addresses occurs in-order, we can apply
2789 // the XCnt if the current instruction is of VMEM type and has a memory
2790 // dependency with another VMEM instruction in flight.
2791 if (Wait.get(AMDGPU::X_CNT) != ~0u && isVmemAccess(MI)) {
2792 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::X_CNT);
2793 Wait.set(AMDGPU::X_CNT, ~0u);
2794 }
2795
2796 // When forcing emit, we need to skip terminators because emitting a
2797 // waitcnt between terminators would break the MBB's terminator sequence.
2798 if (ForceEmitZeroFlag && !MI.isTerminator())
2799 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2800
2801 // If we force waitcnt then update Wait accordingly.
2802 for (auto T : inst_counter_types(MaxCounter)) {
2803 if (!ForceEmitWaitcnt[T])
2804 continue;
2805 Wait.set(T, 0);
2806 }
2807
2808 if (FlushFlags.FlushVmCnt) {
2809 for (auto T : {AMDGPU::LOAD_CNT, AMDGPU::SAMPLE_CNT, AMDGPU::BVH_CNT})
2810 if (ScoreBrackets.hasPendingEvent(T))
2811 Wait.set(T, 0);
2812 }
2813
2814 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
2815 Wait.set(AMDGPU::DS_CNT, 0);
2816
2817 if (ForceEmitZeroLoadFlag && Wait.get(AMDGPU::LOAD_CNT) != ~0u)
2818 Wait.set(AMDGPU::LOAD_CNT, 0);
2819
2820 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2821 OldWaitcntInstr);
2822}
2823
2824 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2825 MachineBasicBlock::instr_iterator It,
2826 MachineBasicBlock &Block,
2827 WaitcntBrackets &ScoreBrackets,
2828 MachineInstr *OldWaitcntInstr) {
2829 bool Modified = false;
2830
2831 if (OldWaitcntInstr)
2832 // Try to merge the required wait with preexisting waitcnt instructions.
2833 // Also erase redundant waitcnt.
2834 Modified =
2835 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2836
2837 // ExpCnt can be merged into VINTERP.
2838 if (Wait.get(AMDGPU::EXP_CNT) != ~0u && It != Block.instr_end() &&
2839 TII.isVINTERP(*It)) {
2840 MachineOperand *WaitExp = TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
2841 if (Wait.get(AMDGPU::EXP_CNT) < WaitExp->getImm()) {
2842 WaitExp->setImm(Wait.get(AMDGPU::EXP_CNT));
2843 Modified = true;
2844 }
2845 // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
2846 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
2847 Wait.set(AMDGPU::EXP_CNT, ~0u);
2848
2849 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2850 << "Update Instr: " << *It);
2851 }
2852
2853 if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
2854 Modified = true;
2855
2856 // Any counts that could have been applied to any existing waitcnt
2857 // instructions will have been done so, now deal with any remaining.
2858 ScoreBrackets.applyWaitcnt(Wait);
2859
2860 return Modified;
2861}
2862
2863std::optional<WaitEventType>
2864SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
2865 if (TII.isVALU(Inst)) {
2866 // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
2867 // out-of-order with respect to each other, so each of these classes
2868 // has its own event.
2869
2870 if (TII.isXDL(Inst))
2871 return VGPR_XDL_WRITE;
2872
2873 if (TII.isTRANS(Inst))
2874 return VGPR_TRANS_WRITE;
2875
2877 return VGPR_DPMACC_WRITE;
2878
2879 return VGPR_CSMACC_WRITE;
2880 }
2881
2882 // FLAT and LDS instructions may read their VGPR sources out-of-order
2883 // with respect to each other and all other VMEM instructions, so
2884 // each of these also has a separate event.
2885
2886 if (TII.isFLAT(Inst))
2887 return VGPR_FLAT_READ;
2888
2889 if (TII.isDS(Inst))
2890 return VGPR_LDS_READ;
2891
2892 if (TII.isVMEM(Inst) || TII.isVIMAGE(Inst) || TII.isVSAMPLE(Inst))
2893 return VGPR_VMEM_READ;
2894
2895 // Otherwise, no hazard.
2896
2897 return {};
2898}
2899
2900bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2901 return (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI)) ||
2902 (TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2903}
2904
2905// Return true if the next instruction is S_ENDPGM, following fallthrough
2906// blocks if necessary.
2907bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2908 MachineBasicBlock *Block) const {
2909 auto BlockEnd = Block->getParent()->end();
2910 auto BlockIter = Block->getIterator();
2911
2912 while (true) {
2913 if (It.isEnd()) {
2914 if (++BlockIter != BlockEnd) {
2915 It = BlockIter->instr_begin();
2916 continue;
2917 }
2918
2919 return false;
2920 }
2921
2922 if (!It->isMetaInstruction())
2923 break;
2924
2925 It++;
2926 }
2927
2928 assert(!It.isEnd());
2929
2930 return It->getOpcode() == AMDGPU::S_ENDPGM;
2931}
2932
2933// Add a wait after an instruction if architecture requirements mandate one.
2934bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2935 MachineBasicBlock &Block,
2936 WaitcntBrackets &ScoreBrackets) {
2937 AMDGPU::Waitcnt Wait;
2938 bool NeedsEndPGMCheck = false;
2939
2940 if (ST.isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2941 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2942 !SIInstrInfo::isAtomicRet(Inst));
2943
2944 if (TII.isAlwaysGDS(Inst.getOpcode())) {
2945 Wait.set(AMDGPU::DS_CNT, 0);
2946 NeedsEndPGMCheck = true;
2947 }
2948
2949 ScoreBrackets.simplifyWaitcnt(Wait);
2950
2951 auto SuccessorIt = std::next(Inst.getIterator());
2952 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2953 /*OldWaitcntInstr=*/nullptr);
2954
2955 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2956 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII.get(AMDGPU::S_NOP))
2957 .addImm(0);
2958 }
2959
2960 return Result;
2961}
2962
2963WaitEventSet SIInsertWaitcnts::getEventsFor(const MachineInstr &Inst) const {
2964 WaitEventSet Events;
2965 if (IsExpertMode) {
2966 if (const auto ET = getExpertSchedulingEventType(Inst))
2967 Events.insert(*ET);
2968 }
2969
2970 if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
2971 if (TII.isAlwaysGDS(Inst.getOpcode()) ||
2972 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2973 Events.insert(GDS_ACCESS);
2974 Events.insert(GDS_GPR_LOCK);
2975 } else {
2976 Events.insert(LDS_ACCESS);
2977 }
2978 } else if (TII.isFLAT(Inst)) {
2979 if (SIInstrInfo::isLDSDMA(Inst)) {
2980 Events.insert(getVmemWaitEventType(Inst));
2981 } else {
2982 assert(Inst.mayLoadOrStore());
2983 if (TII.mayAccessVMEMThroughFlat(Inst)) {
2984 if (ST.hasWaitXcnt())
2985 Events.insert(VMEM_GROUP);
2986 Events.insert(getVmemWaitEventType(Inst));
2987 }
2988 if (TII.mayAccessLDSThroughFlat(Inst))
2989 Events.insert(LDS_ACCESS);
2990 }
2991 } else if (SIInstrInfo::isVMEM(Inst) &&
2992 (!AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode()) ||
2993 Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
2994 // BUFFER_WBL2 is included here because, unlike the invalidates, it has to
2995 // be followed by an S_WAITCNT vmcnt(0) to ensure the writeback has
2996 // completed.
2997 if (ST.hasWaitXcnt())
2998 Events.insert(VMEM_GROUP);
2999 Events.insert(getVmemWaitEventType(Inst));
3000 if (ST.vmemWriteNeedsExpWaitcnt() &&
3001 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
3002 Events.insert(VMW_GPR_LOCK);
3003 }
3004 } else if (TII.isSMRD(Inst)) {
3005 if (ST.hasWaitXcnt())
3006 Events.insert(SMEM_GROUP);
3007 Events.insert(SMEM_ACCESS);
3008 } else if (SIInstrInfo::isLDSDIR(Inst)) {
3009 Events.insert(EXP_LDS_ACCESS);
3010 } else if (SIInstrInfo::isEXP(Inst)) {
3011 unsigned Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
3012 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
3013 Events.insert(EXP_PARAM_ACCESS);
3014 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
3015 Events.insert(EXP_POS_ACCESS);
3016 else
3017 Events.insert(EXP_GPR_LOCK);
3018 } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
3019 Events.insert(SCC_WRITE);
3020 } else {
3021 switch (Inst.getOpcode()) {
3022 case AMDGPU::S_SENDMSG:
3023 case AMDGPU::S_SENDMSG_RTN_B32:
3024 case AMDGPU::S_SENDMSG_RTN_B64:
3025 case AMDGPU::S_SENDMSGHALT:
3026 Events.insert(SQ_MESSAGE);
3027 break;
3028 case AMDGPU::S_MEMTIME:
3029 case AMDGPU::S_MEMREALTIME:
3030 case AMDGPU::S_GET_BARRIER_STATE_M0:
3031 case AMDGPU::S_GET_BARRIER_STATE_IMM:
3032 Events.insert(SMEM_ACCESS);
3033 break;
3034 }
3035 }
3036 return Events;
3037}
3038
3039void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
3040 WaitcntBrackets *ScoreBrackets) {
3041
3042 WaitEventSet InstEvents = getEventsFor(Inst);
3043 for (WaitEventType E : wait_events()) {
3044 if (InstEvents.contains(E))
3045 ScoreBrackets->updateByEvent(E, Inst);
3046 }
3047
3048 if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
3049 if (TII.isAlwaysGDS(Inst.getOpcode()) ||
3050 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
3051 ScoreBrackets->setPendingGDS();
3052 }
3053 } else if (TII.isFLAT(Inst)) {
3054 if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(Inst) &&
3055 TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst)) {
3056 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
3057 // pointers. They do have two operands that each access global and LDS,
3058 // thus making it appear at this point that they are using a flat pointer.
3059 // Filter them out, and for the rest, generate a dependency on flat
3060 // pointers so that both VM and LGKM counters are flushed.
3061 ScoreBrackets->setPendingFlat();
3062 }
3063 if (SIInstrInfo::usesASYNC_CNT(Inst)) {
3064 ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst);
3065 }
3066 } else if (Inst.isCall()) {
3067 // Act as a wait on everything, but AsyncCnt is never included in such
3068 // blanket waits.
3069 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
3070 ScoreBrackets->setStateOnFunctionEntryOrReturn();
3071 } else if (TII.isVINTERP(Inst)) {
3072 int64_t Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
3073 ScoreBrackets->applyWaitcnt(AMDGPU::EXP_CNT, Imm);
3074 }
3075}
3076
3077bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
3078 unsigned OtherScore) {
3079 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
3080 unsigned OtherShifted =
3081 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
3082 Score = std::max(MyShifted, OtherShifted);
3083 return OtherShifted > MyShifted;
3084}
3085
3086bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
3087 ArrayRef<CounterValueArray> OtherMarks) {
3088 bool StrictDom = false;
3089
3090 LLVM_DEBUG(dbgs() << "Merging async marks ...");
3091 // Early exit: nothing to merge when both sides are empty.
3092 if (AsyncMarks.empty() && OtherMarks.empty()) {
3093 LLVM_DEBUG(dbgs() << " nothing to merge\n");
3094 return false;
3095 }
3096 LLVM_DEBUG(dbgs() << '\n');
3097
3098 // Determine maximum length needed after merging
3099 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
3100 MaxSize = std::min(MaxSize, MaxAsyncMarks);
3101
3102 // Keep only the most recent marks within our limit.
3103 if (AsyncMarks.size() > MaxSize)
3104 AsyncMarks.erase(AsyncMarks.begin(),
3105 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
3106
3107 // Pad with zero-filled marks if our list is shorter. Zero represents "no
3108 // pending async operations at this checkpoint" and acts as the identity
3109 // element for max() during merging. We pad at the beginning since the marks
3110 // need to be aligned in most-recent order.
3111 constexpr CounterValueArray ZeroMark{};
3112 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
3113
3114 LLVM_DEBUG({
3115 dbgs() << "Before merge:\n";
3116 for (const auto &Mark : AsyncMarks) {
3117 llvm::interleaveComma(Mark, dbgs());
3118 dbgs() << '\n';
3119 }
3120 dbgs() << "Other marks:\n";
3121 for (const auto &Mark : OtherMarks) {
3122 llvm::interleaveComma(Mark, dbgs());
3123 dbgs() << '\n';
3124 }
3125 });
3126
3127 // Merge element-wise using the existing mergeScore function and the
3128 // appropriate MergeInfo for each counter type. Iterate only while we have
3129 // elements in both vectors.
3130 unsigned OtherSize = OtherMarks.size();
3131 unsigned OurSize = AsyncMarks.size();
3132 unsigned MergeCount = std::min(OtherSize, OurSize);
3133 // OtherMarks is empty -> OtherSize == 0 -> MergeCount == 0.
3134 // Our existing marks are the conservative result; return early to avoid
3135 // passing MergeCount == 0 to seq_inclusive which asserts Begin <= End.
3136 if (MergeCount == 0)
3137 return StrictDom;
3138 for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
3139 for (auto T : inst_counter_types(Context->MaxCounter)) {
3140 StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
3141 OtherMarks[OtherSize - Idx][T]);
3142 }
3143 }
3144
3145 LLVM_DEBUG({
3146 dbgs() << "After merge:\n";
3147 for (const auto &Mark : AsyncMarks) {
3148 llvm::interleaveComma(Mark, dbgs());
3149 dbgs() << '\n';
3150 }
3151 });
3152
3153 return StrictDom;
3154}
3155
3156 /// Merge the pending events and associated score brackets of \p Other into
3157 /// this bracket's status.
3158///
3159/// Returns whether the merge resulted in a change that requires tighter waits
3160/// (i.e. the merged brackets strictly dominate the original brackets).
3161bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
3162 bool StrictDom = false;
3163
3164 // Check if "other" has keys we don't have, and create default entries for
3165 // those. If they remain empty after merging, we will clean it up after.
3166 for (auto K : Other.VMem.keys())
3167 VMem.try_emplace(K);
3168 for (auto K : Other.SGPRs.keys())
3169 SGPRs.try_emplace(K);
3170
3171 // Array to store MergeInfo for each counter type
3172 MergeInfo MergeInfos[AMDGPU::NUM_INST_CNTS];
3173
3174 for (auto T : inst_counter_types(Context->MaxCounter)) {
3175 // Merge event flags for this counter
3176 const WaitEventSet &EventsForT = Context->getWaitEvents(T);
3177 const WaitEventSet OldEvents = PendingEvents & EventsForT;
3178 const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
3179 if (!OldEvents.contains(OtherEvents))
3180 StrictDom = true;
3181 PendingEvents |= OtherEvents;
3182
3183 // Merge scores for this counter
3184 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
3185 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
3186 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
3187 if (NewUB < ScoreLBs[T])
3188 report_fatal_error("waitcnt score overflow");
3189
3190 MergeInfo &M = MergeInfos[T];
3191 M.OldLB = ScoreLBs[T];
3192 M.OtherLB = Other.ScoreLBs[T];
3193 M.MyShift = NewUB - ScoreUBs[T];
3194 M.OtherShift = NewUB - Other.ScoreUBs[T];
3195
3196 ScoreUBs[T] = NewUB;
3197
3198 if (T == AMDGPU::LOAD_CNT)
3199 StrictDom |= mergeScore(M, LastFlatLoadCnt, Other.LastFlatLoadCnt);
3200
3201 if (T == AMDGPU::DS_CNT) {
3202 StrictDom |= mergeScore(M, LastFlatDsCnt, Other.LastFlatDsCnt);
3203 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
3204 }
3205
3206 if (T == AMDGPU::KM_CNT) {
3207 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
3208 if (Other.hasPendingEvent(SCC_WRITE)) {
3209 if (!OldEvents.contains(SCC_WRITE)) {
3210 PendingSCCWrite = Other.PendingSCCWrite;
3211 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
3212 PendingSCCWrite = nullptr;
3213 }
3214 }
3215 }
3216
3217 for (auto &[RegID, Info] : VMem)
3218 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
3219
3220 if (isSmemCounter(T)) {
3221 for (auto &[RegID, Info] : SGPRs) {
3222 auto It = Other.SGPRs.find(RegID);
3223 unsigned OtherScore = (It != Other.SGPRs.end()) ? It->second.get(T) : 0;
3224 StrictDom |= mergeScore(M, Info.get(T), OtherScore);
3225 }
3226 }
3227 }
3228
3229 for (auto &[TID, Info] : VMem) {
3230 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
3231 unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
3232 StrictDom |= NewVmemTypes != Info.VMEMTypes;
3233 Info.VMEMTypes = NewVmemTypes;
3234 }
3235 }
3236
3237 StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks);
3238 for (auto T : inst_counter_types(Context->MaxCounter))
3239 StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]);
3240
3241 purgeEmptyTrackingData();
3242 return StrictDom;
3243}
3244
3245static bool isWaitInstr(MachineInstr &Inst) {
3246 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
3247 return Opcode == AMDGPU::S_WAITCNT ||
3248 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
3249 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
3250 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
3251 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
3252 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
3253 Opcode == AMDGPU::WAIT_ASYNCMARK ||
3254 counterTypeForInstr(Opcode).has_value();
3255}
3256
3257 void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
3258 MachineBasicBlock::iterator I,
3259 bool ExpertMode) const {
3260 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
3262 BuildMI(MBB, I, DebugLoc(), TII.get(AMDGPU::S_SETREG_IMM32_B32))
3263 .addImm(ExpertMode ? 2 : 0)
3264 .addImm(EncodedReg);
3265}
3266
3267namespace {
3268// TODO: Remove this work-around after fixing the scheduler.
3269// There are two reasons why vccz might be incorrect; see ST.hasReadVCCZBug()
3270// and ST.partialVCCWritesUpdateVCCZ().
3271 // i. VCCZBug: There is a hardware bug on CI/SI where an SMRD instruction may
3272 // corrupt the vccz bit, so when we detect that an instruction may read from
3273 // a corrupt vccz bit, we need to:
3274 // 1. Insert s_waitcnt lgkmcnt(0) to wait for all outstanding SMRD
3275// operations to complete.
3276// 2. Recompute the correct value of vccz by writing the current value
3277// of vcc back to vcc.
3278// ii. Partial writes to vcc don't update vccz, so we need to recompute the
3279// correct value of vccz by reading vcc and writing it back to vcc.
3280// No waitcnt is needed in this case.
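// Recovery sketch (illustrative): on an affected target, a branch reading vccz
// after a pending SMRD load may end up preceded by
//
//   s_waitcnt lgkmcnt(0)       ; emitted by generateWaitcntInstBefore()
//   s_mov_b64 vcc, vcc         ; rewriting vcc recomputes vccz (b32 on wave32)
//   s_cbranch_vccz <target>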
3281class VCCZWorkaround {
3282 const WaitcntBrackets &ScoreBrackets;
3283 const GCNSubtarget &ST;
3284 const SIInstrInfo &TII;
3285 const SIRegisterInfo &TRI;
3286 bool VCCZCorruptionBug = false;
3287 bool VCCZNotUpdatedByPartialWrites = false;
3288 /// vccz could be incorrect at a basic block boundary if a predecessor wrote
3289 /// to vcc and then issued an smem load, so initialize to true.
3290 bool MustRecomputeVCCZ = true;
3291
3292public:
3293 VCCZWorkaround(const WaitcntBrackets &ScoreBrackets, const GCNSubtarget &ST,
3294 const SIInstrInfo &TII, const SIRegisterInfo &TRI)
3295 : ScoreBrackets(ScoreBrackets), ST(ST), TII(TII), TRI(TRI) {
3296 VCCZCorruptionBug = ST.hasReadVCCZBug();
3297 VCCZNotUpdatedByPartialWrites = !ST.partialVCCWritesUpdateVCCZ();
3298 }
3299 /// If \p MI reads vccz and we must recompute it based on MustRecomputeVCCZ,
3300 /// then emit a vccz recompute instruction before \p MI. This needs to be
3301 /// called on every instruction in the basic block because it also tracks the
3302 /// state and updates MustRecomputeVCCZ accordingly. Returns true if it
3303 /// modified the IR.
3304 bool tryRecomputeVCCZ(MachineInstr &MI) {
3305 // No need to run this if neither bug is present.
3306 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
3307 return false;
3308
3309 // If MI is an SMEM and it can corrupt vccz on this target, then we need
3310 // both to emit a waitcnt and to recompute vccz.
3311 // But we don't actually emit a waitcnt here. This is done in
3312 // generateWaitcntInstBefore() because it tracks all the necessary waitcnt
3313 // state, and can either skip emitting a waitcnt if there is already one in
3314 // the IR, or emit an "optimized" combined waitcnt.
3315 // If this is an smem read, it could complete and clobber vccz at any time.
3316 MustRecomputeVCCZ |= VCCZCorruptionBug && TII.isSMRD(MI);
3317
3318 // If the target partial vcc writes don't update vccz, and MI is such an
3319 // instruction then we must recompute vccz.
3320 // Note: We are using PartiallyWritesToVCCOpt optional to avoid calling
3321 // `definesRegister()` more than needed, because it's not very cheap.
3322 std::optional<bool> PartiallyWritesToVCCOpt;
3323 auto PartiallyWritesToVCC = [](MachineInstr &MI) {
3324 return MI.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
3325 MI.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr);
3326 };
3327 if (VCCZNotUpdatedByPartialWrites) {
3328 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
3329 // If this is a partial VCC write but won't update vccz, then we must
3330 // recompute vccz.
3331 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
3332 }
3333
3334 // If MI is a vcc write with no pending smem, or there is a pending smem
3335 // but the target does not suffer from the vccz corruption bug, then we
3336 // don't need to recompute vccz as this write will recompute it anyway.
3337 if (!ScoreBrackets.hasPendingEvent(SMEM_ACCESS) || !VCCZCorruptionBug) {
3338 // Compute PartiallyWritesToVCCOpt if we haven't done so already.
3339 if (!PartiallyWritesToVCCOpt)
3340 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
3341 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
3342 MI.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr);
3343 // If we write to the full vcc or we write partially and the target
3344 // updates vccz on partial writes, then vccz will be updated correctly.
3345 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
3346 *PartiallyWritesToVCCOpt);
3347 if (UpdatesVCCZ)
3348 MustRecomputeVCCZ = false;
3349 }
3350
3351 // If MI is a branch that reads VCCZ then emit a waitcnt and a vccz
3352 // restore instruction if either is needed.
3353 if (SIInstrInfo::isCBranchVCCZRead(MI) && MustRecomputeVCCZ) {
3354 // Recompute the vccz bit. Any time a value is written to vcc, the vccz
3355 // bit is updated, so we can restore the bit by reading the value of vcc
3356 // and then writing it back to the register.
3357 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
3358 TII.get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
3359 TRI.getVCC())
3360 .addReg(TRI.getVCC());
3361 MustRecomputeVCCZ = false;
3362 return true;
3363 }
3364 return false;
3365 }
3366};
3367
3368} // namespace
3369
3370// Generate s_waitcnt instructions where needed.
3371bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
3372 MachineBasicBlock &Block,
3373 WaitcntBrackets &ScoreBrackets) {
3374 bool Modified = false;
3375
3376 LLVM_DEBUG({
3377 dbgs() << "*** Begin Block: ";
3378 Block.printName(dbgs());
3379 ScoreBrackets.dump();
3380 });
3381 VCCZWorkaround VCCZW(ScoreBrackets, ST, TII, TRI);
3382
3383 // Walk over the instructions.
3384 MachineInstr *OldWaitcntInstr = nullptr;
3385
3386 // NOTE: We may append instrs after Inst while iterating.
3387 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
3388 E = Block.instr_end();
3389 Iter != E; ++Iter) {
3390 MachineInstr &Inst = *Iter;
3391 if (isNonWaitcntMetaInst(Inst))
3392 continue;
3393 // Track pre-existing waitcnts that were added in earlier iterations or by
3394 // the memory legalizer.
3395 if (isWaitInstr(Inst) ||
3396 (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3397 if (!OldWaitcntInstr)
3398 OldWaitcntInstr = &Inst;
3399 continue;
3400 }
3401
3402 PreheaderFlushFlags FlushFlags;
3403 if (Block.getFirstTerminator() == Inst)
3404 FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3405
3406 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
3407 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3408 FlushFlags);
3409 OldWaitcntInstr = nullptr;
3410
3411 if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
3412 // Asyncmarks record the current wait state and so should not allow
3413 // waitcnts that occur after them to be merged into waitcnts that occur
3414 // before.
3415 ScoreBrackets.recordAsyncMark(Inst);
3416 continue;
3417 }
3418
3419 if (TII.isSMRD(Inst)) {
3420 for (const MachineMemOperand *Memop : Inst.memoperands()) {
3421 // No need to handle invariant loads when avoiding WAR conflicts, as
3422 // there cannot be a vector store to the same memory location.
3423 if (!Memop->isInvariant()) {
3424 const Value *Ptr = Memop->getValue();
3425 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
3426 }
3427 }
3428 }
3429
3430 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3431
3432 // Note: insertForcedWaitAfter() may add instrs after Iter that need to be
3433 // visited by the loop.
3434 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
3435
3436 LLVM_DEBUG({
3437 Inst.print(dbgs());
3438 ScoreBrackets.dump();
3439 });
3440
3441 // If the target suffers from the vccz bugs, this emits the necessary
3442 // vccz recompute instruction before Inst where needed.
3443 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3444 }
3445
3446 // Flush counters at the end of the block if needed (for preheaders with no
3447 // terminator).
3448 AMDGPU::Waitcnt Wait;
3449 if (Block.getFirstTerminator() == Block.end()) {
3450 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3451 if (FlushFlags.FlushVmCnt) {
3452 if (ScoreBrackets.hasPendingEvent(AMDGPU::LOAD_CNT))
3453 Wait.set(AMDGPU::LOAD_CNT, 0);
3454 if (ScoreBrackets.hasPendingEvent(AMDGPU::SAMPLE_CNT))
3455 Wait.set(AMDGPU::SAMPLE_CNT, 0);
3456 if (ScoreBrackets.hasPendingEvent(AMDGPU::BVH_CNT))
3457 Wait.set(AMDGPU::BVH_CNT, 0);
3458 }
3459 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
3460 Wait.set(AMDGPU::DS_CNT, 0);
3461 }
3462
3463 // Combine or remove any redundant waitcnts at the end of the block.
3464 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
3465 OldWaitcntInstr);
3466
3467 LLVM_DEBUG({
3468 dbgs() << "*** End Block: ";
3469 Block.printName(dbgs());
3470 ScoreBrackets.dump();
3471 });
3472
3473 return Modified;
3474}
3475
3476bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
3477 if (Block.size() <= 1)
3478 return false;
3479 // The Memory Legalizer conservatively inserts a soft xcnt before each
3480 // atomic RMW operation. However, for sequences of back-to-back atomic
3481 // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
3482 // the redundant soft xcnts.
3483 bool Modified = false;
3484 // Remember the last atomic with a soft xcnt right before it.
3485 MachineInstr *LastAtomicWithSoftXcnt = nullptr;
3486
3487 for (MachineInstr &MI : drop_begin(Block)) {
3488 // Forget the last atomic if we see a non-LDS (VMEM or SMEM) pure load or store.
3489 bool IsLDS =
3490 TII.isDS(MI) || (TII.isFLAT(MI) && TII.mayAccessLDSThroughFlat(MI));
3491 if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
3492 LastAtomicWithSoftXcnt = nullptr;
3493
3494 bool IsAtomicRMW = (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) &&
3495 MI.mayLoad() && MI.mayStore();
3496 MachineInstr &PrevMI = *MI.getPrevNode();
3497 // This is an atomic with a soft xcnt.
3498 if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3499 // If we have already found an atomic with a soft xcnt, remove this soft
3500 // xcnt as it's redundant.
3501 if (LastAtomicWithSoftXcnt) {
3502 PrevMI.eraseFromParent();
3503 Modified = true;
3504 }
3505 LastAtomicWithSoftXcnt = &MI;
3506 }
3507 }
3508 return Modified;
3509}
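// Before/after sketch (illustrative): given the sequence
//   s_wait_xcnt 0x0 (soft); atomic RMW; s_wait_xcnt 0x0 (soft); atomic RMW
// with no intervening non-LDS pure load or store, the second soft xcnt is
// erased and only the first wait survives.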
3510
3511// Return flags indicating which counters should be flushed in the preheader.
3512PreheaderFlushFlags
3513SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3514 const WaitcntBrackets &ScoreBrackets) {
3515 auto [Iterator, IsInserted] =
3516 PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
3517 if (!IsInserted)
3518 return Iterator->second;
3519
3520 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3521 if (!Succ)
3522 return PreheaderFlushFlags();
3523
3524 MachineLoop *Loop = MLI.getLoopFor(Succ);
3525 if (!Loop)
3526 return PreheaderFlushFlags();
3527
3528 if (Loop->getLoopPreheader() == &MBB) {
3529 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3530 return Iterator->second;
3531 }
3532
3533 return PreheaderFlushFlags();
3534}
3535
3536 bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
3537 if (SIInstrInfo::isFLAT(MI))
3538 return TII.mayAccessVMEMThroughFlat(MI);
3539 return SIInstrInfo::isVMEM(MI);
3540}
3541
3542bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3543 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3544}
3545
3546// Check if instruction is a store to LDS that is counted via DSCNT
3547// (where that counter exists).
3548bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3549 return MI.mayStore() && SIInstrInfo::isDS(MI);
3550}
3551
3552// Return flags indicating which counters should be flushed in the preheader of
3553// the given loop. We currently decide to flush in the following situations:
3554// For VMEM (FlushVmCnt):
3555// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3556// vgpr containing a value that is loaded outside of the loop. (Only on
3557// targets with no vscnt counter).
3558// 2. The loop contains vmem load(s), but the loaded values are not used in the
3559// loop, and at least one use of a vgpr containing a value that is loaded
3560// outside of the loop.
3561// For DS (FlushDsCnt, GFX12+ only):
3562// 3. The loop contains no DS reads, and at least one use of a vgpr containing
3563// a value that is DS read outside of the loop.
3564// 4. The loop contains DS read(s), loaded values are not used in the same
3565// iteration but in the next iteration (prefetch pattern), and at least one
3566// use of a vgpr containing a value that is DS read outside of the loop.
3567// Flushing in preheader reduces wait overhead if the wait requirement in
3568// iteration 1 would otherwise be more strict (but unfortunately preheader
3569// flush decision is taken before knowing that).
3570// 5. (Single-block loops only) The loop has DS prefetch reads with flush point
3571// tracking. Some DS reads may be used in the same iteration (creating
3572// "flush points"), but others remain unflushed at the backedge. When a DS
3573// read is consumed in the same iteration, it and all prior reads are
3574// "flushed" (FIFO order). No DS writes are allowed in the loop.
3575// TODO: Find a way to extend to multi-block loops.
3576PreheaderFlushFlags
3577SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
3578 const WaitcntBrackets &Brackets) {
3579 PreheaderFlushFlags Flags;
3580 bool HasVMemLoad = false;
3581 bool HasVMemStore = false;
3582 bool UsesVgprVMEMLoadedOutside = false;
3583 bool UsesVgprDSReadOutside = false;
3584 bool VMemInvalidated = false;
3585 // DS optimization only applies to GFX12+ where DS_CNT is separate.
3586 // Tracking status for "no DS read in loop" or "pure DS prefetch
3587 // (use only in next iteration)".
3588 bool TrackSimpleDSOpt = ST.hasExtendedWaitCounts();
3589 DenseSet<MCRegUnit> VgprUse;
3590 DenseSet<MCRegUnit> VgprDefVMEM;
3591 DenseSet<MCRegUnit> VgprDefDS;
3592
3593 // Track DS reads for prefetch pattern with flush points (single-block only).
3594 // Keeps track of the last DS read into each VGPR (position counted from the
3595 // top of the loop). A read is considered consumed (and thus needs flushing)
3596 // if the dest register has a use or is overwritten (by any later operations).
3597 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3598 unsigned DSReadPosition = 0;
3599 bool IsSingleBlock = ML->getNumBlocks() == 1;
3600 bool TrackDSFlushPoint = ST.hasExtendedWaitCounts() && IsSingleBlock;
3601 unsigned LastDSFlushPosition = 0;
3602
3603 for (MachineBasicBlock *MBB : ML->blocks()) {
3604 for (MachineInstr &MI : *MBB) {
3605 if (isVMEMOrFlatVMEM(MI)) {
3606 HasVMemLoad |= MI.mayLoad();
3607 HasVMemStore |= MI.mayStore();
3608 }
3609 // TODO: Can we relax DSStore check? There may be cases where
3610 // these DS stores are drained prior to the end of MBB (or loop).
3611 if (mayStoreIncrementingDSCNT(MI)) {
3612 // Early exit if none of the optimizations are feasible.
3613 // Otherwise, set tracking status appropriately and continue.
3614 if (VMemInvalidated)
3615 return Flags;
3616 TrackSimpleDSOpt = false;
3617 TrackDSFlushPoint = false;
3618 }
3619 bool IsDSRead = isDSRead(MI);
3620 if (IsDSRead)
3621 ++DSReadPosition;
3622
3623 // Helper: if RU has a pending DS read, update LastDSFlushPosition
3624 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3625 if (!TrackDSFlushPoint)
3626 return;
3627 if (auto It = LastDSReadPositionMap.find(RU);
3628 It != LastDSReadPositionMap.end()) {
3629 // RU defined by DSRead is used or overwritten. Need to complete
3630 // the read, if not already implied by a later DSRead (to any RU)
3631 // needing to complete in FIFO order.
3632 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
3633 }
3634 };
3635
3636 for (const MachineOperand &Op : MI.all_uses()) {
3637 if (Op.isDebug() || !TRI.isVectorRegister(MRI, Op.getReg()))
3638 continue;
3639 // Vgpr use
3640 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3641 // If we find a register that is loaded inside the loop, 1. and 2.
3642 // are invalidated.
3643 if (VgprDefVMEM.contains(RU))
3644 VMemInvalidated = true;
3645
3646 // Check for DS reads used inside the loop
3647 if (VgprDefDS.contains(RU))
3648 TrackSimpleDSOpt = false;
3649
3650 // Early exit if all optimizations are invalidated
3651 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3652 return Flags;
3653
3654 // Check for flush points (DS read used in same iteration)
3655 updateDSReadFlushTracking(RU);
3656
3657 VgprUse.insert(RU);
3658 // Check if this register has a pending VMEM load from outside the
3659 // loop (value loaded outside and used inside).
3660 VMEMID ID = toVMEMID(RU);
3661 if (Brackets.hasPendingVMEM(ID, AMDGPU::LOAD_CNT) ||
3662 Brackets.hasPendingVMEM(ID, AMDGPU::SAMPLE_CNT) ||
3663 Brackets.hasPendingVMEM(ID, AMDGPU::BVH_CNT))
3664 UsesVgprVMEMLoadedOutside = true;
3665 // Check if loaded outside the loop via DS (not VMEM/FLAT).
3666 // Only consider it a DS read if there's no pending VMEM load for
3667 // this register, since FLAT can set both counters.
3668 else if (Brackets.hasPendingVMEM(ID, AMDGPU::DS_CNT))
3669 UsesVgprDSReadOutside = true;
3670 }
3671 }
3672
3673 // VMem load vgpr def
3674 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
3675 for (const MachineOperand &Op : MI.all_defs()) {
3676 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3677 // If we find a register that is loaded inside the loop, 1. and 2.
3678 // are invalidated.
3679 if (VgprUse.contains(RU))
3680 VMemInvalidated = true;
3681 VgprDefVMEM.insert(RU);
3682 }
3683 }
3684 // Early exit if all optimizations are invalidated
3685 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3686 return Flags;
3687 }
3688
3689 // DS read vgpr def
3690 // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RU).
3691 // If USE comes before DEF, it's the prefetch pattern (use value from
3692 // previous iteration, read for next iteration). We should still flush
3693 // in preheader so iteration 1 doesn't need to wait inside the loop.
3694 // Only invalidate when DEF comes before USE (same-iteration consumption,
3695 // checked above when processing uses).
3696 if (IsDSRead || TrackDSFlushPoint) {
3697 for (const MachineOperand &Op : MI.all_defs()) {
3698 if (!TRI.isVectorRegister(MRI, Op.getReg()))
3699 continue;
3700 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3701 // Check for overwrite of pending DS read (flush point) by any
3702 // instruction
3703 updateDSReadFlushTracking(RU);
3704 if (IsDSRead) {
3705 VgprDefDS.insert(RU);
3706 if (TrackDSFlushPoint)
3707 LastDSReadPositionMap[RU] = DSReadPosition;
3708 }
3709 }
3710 }
3711 }
3712 }
3713 }
3714
3715 // VMEM flush decision
3716 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3717 ((!ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3718 (HasVMemLoad && ST.hasVmemWriteVgprInOrder())))
3719 Flags.FlushVmCnt = true;
3720
3721 // DS flush decision:
3722 // Simple DS Opt: flush if loop uses DS read values from outside
3723 // and either has no DS reads in the loop, or DS reads whose results
3724 // are not used in the loop.
3725 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3726 // Prefetch with flush points: some DS reads used in same iteration,
3727 // but unflushed reads remain at backedge
3728 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3729 bool DSFlushPointPrefetch =
3730 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3731
3732 if (SimpleDSOpt || DSFlushPointPrefetch)
3733 Flags.FlushDsCnt = true;
3734
3735 return Flags;
3736}
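// Loop-shape example for case 4 above (illustrative):
//   preheader: ds_read_b32 v0, ...       ; value for iteration 1
//   loop:      use v0                    ; consume the previous read
//              ds_read_b32 v0, ...       ; prefetch for the next iteration
// Flushing DS_CNT in the preheader means iteration 1 enters the loop with v0
// already available instead of paying a stricter in-loop wait.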
3737
3738bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3739 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3740 auto &PDT =
3741 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3742 AliasAnalysis *AA = nullptr;
3743 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3744 AA = &AAR->getAAResults();
3745
3746 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3747}
3748
3749 PreservedAnalyses
3750 SIInsertWaitcntsPass::run(MachineFunction &MF,
3751 MachineFunctionAnalysisManager &MFAM) {
3752 auto &MLI = MFAM.getResult<MachineLoopAnalysis>(MF);
3753 auto &PDT = MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
3754 AAResults *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
3755 .getManager()
3756 .getCachedResult<AAManager>(MF.getFunction());
3757
3758 if (!SIInsertWaitcnts(MLI, PDT, AA, MF).run())
3759 return PreservedAnalyses::all();
3760
3761 return getMachineFunctionPassPreservedAnalyses()
3762 .preserveSet<CFGAnalyses>()
3763 .preserve<AAManager>();
3764}
3765
3766 bool SIInsertWaitcnts::run() {
3767 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
3768
3770
3771 // Initialize hardware limits first, as they're needed by the generators.
3772 Limits = AMDGPU::HardwareLimits(IV);
3773
3774 if (ST.hasExtendedWaitCounts()) {
3775 IsExpertMode = ST.hasExpertSchedulingMode() &&
3776 (ExpertSchedulingModeFlag.getNumOccurrences()
3777 ? ExpertSchedulingModeFlag
3778 : MF.getFunction()
3779 .getFnAttribute("amdgpu-expert-scheduling-mode")
3780 .getValueAsBool());
3781 MaxCounter = IsExpertMode ? AMDGPU::NUM_EXPERT_INST_CNTS
3782 : AMDGPU::NUM_EXTENDED_INST_CNTS;
3783 // Initialize WCG per MF. It contains state that depends on MF attributes.
3784 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
3785 IsExpertMode);
3786 } else {
3787 MaxCounter = AMDGPU::NUM_NORMAL_INST_CNTS;
3788 // Initialize WCG per MF. It contains state that depends on MF attributes.
3789 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
3790 MF, AMDGPU::NUM_NORMAL_INST_CNTS, Limits);
3791 }
3792
3793 SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);
3794
3795 bool Modified = false;
3796
3797 MachineBasicBlock &EntryBB = MF.front();
3798
3799 if (!MFI->isEntryFunction() &&
3800 !MF.getFunction().hasFnAttribute(Attribute::Naked)) {
3801 // Wait for any outstanding memory operations that the input registers may
3802 // depend on. We can't track them and it's better to do the wait after the
3803 // costly call sequence.
3804
3805 // TODO: Could insert earlier and schedule more liberally with operations
3806 // that only use caller preserved registers.
3807 MachineBasicBlock::iterator I = EntryBB.begin();
3808 while (I != EntryBB.end() && I->isMetaInstruction())
3809 ++I;
3810
3811 if (ST.hasExtendedWaitCounts()) {
3812 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
3813 .addImm(0);
3814 for (auto CT : inst_counter_types(AMDGPU::NUM_EXTENDED_INST_CNTS)) {
3815 if (CT == AMDGPU::LOAD_CNT || CT == AMDGPU::DS_CNT ||
3816 CT == AMDGPU::STORE_CNT || CT == AMDGPU::X_CNT ||
3817 CT == AMDGPU::ASYNC_CNT)
3818 continue;
3819
3820 if (!ST.hasImageInsts() &&
3821 (CT == AMDGPU::EXP_CNT || CT == AMDGPU::SAMPLE_CNT ||
3822 CT == AMDGPU::BVH_CNT))
3823 continue;
3824
3825 BuildMI(EntryBB, I, DebugLoc(),
3826 TII.get(instrsForExtendedCounterTypes[CT]))
3827 .addImm(0);
3828 }
3829 if (IsExpertMode) {
3830 unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, ST);
3831 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, 0);
3832 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3833 .addImm(Enc);
3834 }
3835 } else {
3836 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT)).addImm(0);
3837 }
3838
3839 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
3840 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3841 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3842
3843 Modified = true;
3844 }
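// Prologue sketch (illustrative): for a non-kernel, non-naked function on a
// GFX12+ target this emits waits such as
//   s_wait_loadcnt_dscnt 0x0
//   s_wait_expcnt 0x0
//   s_wait_samplecnt 0x0, s_wait_bvhcnt 0x0, s_wait_kmcnt 0x0
// (image-related waits are skipped when the target lacks image instructions),
// while a pre-GFX12 target gets a single s_waitcnt 0.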
3845
3846 // Keep iterating over the blocks in reverse post order, inserting and
3847 // updating s_waitcnt where needed, until a fix point is reached.
3848 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3849 BlockInfos.try_emplace(MBB);
3850
3851 std::unique_ptr<WaitcntBrackets> Brackets;
3852 bool Repeat;
3853 do {
3854 Repeat = false;
3855
3856 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3857 ++BII) {
3858 MachineBasicBlock *MBB = BII->first;
3859 BlockInfo &BI = BII->second;
3860 if (!BI.Dirty)
3861 continue;
3862
3863 if (BI.Incoming) {
3864 if (!Brackets)
3865 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3866 else
3867 *Brackets = *BI.Incoming;
3868 } else {
3869 if (!Brackets) {
3870 Brackets = std::make_unique<WaitcntBrackets>(this);
3871 } else {
3872 // Reinitialize in-place. N.B. do not do this by assigning from a
3873 // temporary because the WaitcntBrackets class is large and it could
3874 // cause this function to use an unreasonable amount of stack space.
3875 Brackets->~WaitcntBrackets();
3876 new (Brackets.get()) WaitcntBrackets(this);
3877 }
3878 }
3879
3880 if (ST.hasWaitXcnt())
3881 Modified |= removeRedundantSoftXcnts(*MBB);
3882 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
3883 BI.Dirty = false;
3884
3885 if (Brackets->hasPendingEvent()) {
3886 BlockInfo *MoveBracketsToSucc = nullptr;
3887 for (MachineBasicBlock *Succ : MBB->successors()) {
3888 auto *SuccBII = BlockInfos.find(Succ);
3889 BlockInfo &SuccBI = SuccBII->second;
3890 if (!SuccBI.Incoming) {
3891 SuccBI.Dirty = true;
3892 if (SuccBII <= BII) {
3893 LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
3894 Repeat = true;
3895 }
3896 if (!MoveBracketsToSucc) {
3897 MoveBracketsToSucc = &SuccBI;
3898 } else {
3899 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3900 }
3901 } else {
3902 LLVM_DEBUG({
3903 dbgs() << "Try to merge ";
3904 MBB->printName(dbgs());
3905 dbgs() << " into ";
3906 Succ->printName(dbgs());
3907 dbgs() << '\n';
3908 });
3909 if (SuccBI.Incoming->merge(*Brackets)) {
3910 SuccBI.Dirty = true;
3911 if (SuccBII <= BII) {
3912 LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
3913 Repeat = true;
3914 }
3915 }
3916 }
3917 }
3918 if (MoveBracketsToSucc)
3919 MoveBracketsToSucc->Incoming = std::move(Brackets);
3920 }
3921 }
3922 } while (Repeat);
3923
3924 if (ST.hasScalarStores()) {
3925 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3926 bool HaveScalarStores = false;
3927
3928 for (MachineBasicBlock &MBB : MF) {
3929 for (MachineInstr &MI : MBB) {
3930 if (!HaveScalarStores && TII.isScalarStore(MI))
3931 HaveScalarStores = true;
3932
3933 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
3934 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3935 EndPgmBlocks.push_back(&MBB);
3936 }
3937 }
3938
3939 if (HaveScalarStores) {
3940 // If scalar writes are used, the cache must be flushed or else the next
3941 // wave to reuse the same scratch memory can be clobbered.
3942 //
3943 // Insert s_dcache_wb at wave termination points if there were any scalar
3944 // stores, and only if the cache hasn't already been flushed. This could
3945 // be improved by looking across blocks for flushes in postdominating
3946 // blocks from the stores but an explicitly requested flush is probably
3947 // very rare.
3948 for (MachineBasicBlock *MBB : EndPgmBlocks) {
3949 bool SeenDCacheWB = false;
3950
3951 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
3952 I != E; ++I) {
3953 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
3954 SeenDCacheWB = true;
3955 else if (TII.isScalarStore(*I))
3956 SeenDCacheWB = false;
3957
3958 // FIXME: It would be better to insert this before a waitcnt if any.
3959 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
3960 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3961 !SeenDCacheWB) {
3962 Modified = true;
3963 BuildMI(*MBB, I, I->getDebugLoc(), TII.get(AMDGPU::S_DCACHE_WB));
3964 }
3965 }
3966 }
3967 }
3968 }
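// End-of-wave sketch (illustrative): a block that issues a scalar store and
// then ends the program gets a writeback before termination,
//   s_store_dword s0, s[4:5], 0x0
//   ...
//   s_dcache_wb
//   s_endpgm
// unless an s_dcache_wb already appears after the last scalar store.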
3969
3970 if (IsExpertMode) {
3971 // Enable expert scheduling on function entry. To satisfy ABI requirements
3972 // and to allow calls between functions with different expert scheduling
3973 // settings, disable it around calls and before returns.
3974
3975 MachineBasicBlock::iterator I = EntryBB.begin();
3976 while (I != EntryBB.end() && I->isMetaInstruction())
3977 ++I;
3978 setSchedulingMode(EntryBB, I, true);
3979
3980 for (MachineInstr *MI : CallInsts) {
3981 MachineBasicBlock &MBB = *MI->getParent();
3982 setSchedulingMode(MBB, MI, false);
3983 setSchedulingMode(MBB, std::next(MI->getIterator()), true);
3984 }
3985
3986 for (MachineInstr *MI : ReturnInsts)
3987 setSchedulingMode(*MI->getParent(), MI, false);
3988
3989 Modified = true;
3990 }
3991
3992 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
3993 // This is done in different ways depending on how the VGPRs were allocated
3994 // (i.e. whether we're in dynamic VGPR mode or not).
3995 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
3996 // waveslot limited kernel runs slower with the deallocation.
3997 if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
3998 for (auto [MI, _] : EndPgmInsts) {
3999 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
4000 TII.get(AMDGPU::S_ALLOC_VGPR))
4001 .addImm(0);
4002 Modified = true;
4003 }
4004 } else if (!WCG->isOptNone() &&
4005 ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
4006 (MF.getFrameInfo().hasCalls() ||
4007 ST.getOccupancyWithNumVGPRs(
4008 TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
4009 /*IsDynamicVGPR=*/false) <
4010 AMDGPU::IsaInfo::getMaxWavesPerEU(&ST))) {
4011 for (auto [MI, Flag] : EndPgmInsts) {
4012 if (Flag) {
4013 if (ST.requiresNopBeforeDeallocVGPRs()) {
4014 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
4015 TII.get(AMDGPU::S_NOP))
4016 .addImm(0);
4017 }
4018 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
4019 TII.get(AMDGPU::S_SENDMSG))
4020 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
4021 Modified = true;
4022 }
4023 }
4024 }
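// Dealloc sketch (illustrative): on a GFX11+ target without dynamic VGPRs this
// appends, before each flagged S_ENDPGM,
//   s_nop 0                                ; only if the target requires it
//   s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
// whereas dynamic-VGPR mode instead emits s_alloc_vgpr 0.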
4025
4026 return Modified;
4027}