LLVM 23.0.0git
SIInsertWaitcnts.cpp
Go to the documentation of this file.
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39#include "llvm/IR/Dominators.h"
43
44using namespace llvm;
45using namespace llvm::AMDGPU;
46
47#define DEBUG_TYPE "si-insert-waitcnts"
48
49DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
50 "Force emit s_waitcnt expcnt(0) instrs");
51DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
52 "Force emit s_waitcnt lgkmcnt(0) instrs");
53DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
54 "Force emit s_waitcnt vmcnt(0) instrs");
55
56static cl::opt<bool>
57 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
58 cl::desc("Force all waitcnt instrs to be emitted as "
59 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
60 cl::init(false), cl::Hidden);
61
63 "amdgpu-waitcnt-load-forcezero",
64 cl::desc("Force all waitcnt load counters to wait until 0"),
65 cl::init(false), cl::Hidden);
66
68 "amdgpu-expert-scheduling-mode",
69 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
70 cl::init(false), cl::Hidden);
71
72namespace {
73// Get the maximum wait count value for a given counter type.
74static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
76 switch (T) {
77 case LOAD_CNT:
78 return Limits.LoadcntMax;
79 case DS_CNT:
80 return Limits.DscntMax;
81 case EXP_CNT:
82 return Limits.ExpcntMax;
83 case STORE_CNT:
84 return Limits.StorecntMax;
85 case SAMPLE_CNT:
86 return Limits.SamplecntMax;
87 case BVH_CNT:
88 return Limits.BvhcntMax;
89 case KM_CNT:
90 return Limits.KmcntMax;
91 case X_CNT:
92 return Limits.XcntMax;
93 case VA_VDST:
94 return Limits.VaVdstMax;
95 case VM_VSRC:
96 return Limits.VmVsrcMax;
97 default:
98 return 0;
99 }
100}
101
/// Integer IDs used to track vector memory locations we may have to wait on.
/// Encoded as u16 chunks:
///
/// [0,            REGUNITS_END ): MCRegUnit
/// [LDSDMA_BEGIN, LDSDMA_END  ) : LDS DMA IDs
///
/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
/// It gives 2^16 entries per category which is more than enough
/// for all register units. MCPhysReg is u16 so we don't even support >u16
/// physical register numbers at this time, let alone >u16 register units.
/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
/// is enough for all register units.
using VMEMID = uint32_t;

enum : VMEMID {
  // Number of IDs available in each u16 chunk.
  TRACKINGID_RANGE_LEN = (1 << 16),

  // Important: MCRegUnits must always be tracked starting from 0, as we
  // need to be able to convert between a MCRegUnit and a VMEMID freely.
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,

  // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
  // entry, which is updated for all LDS DMA operations encountered.
  // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};
131
132/// Convert a MCRegUnit to a VMEMID.
133static constexpr VMEMID toVMEMID(MCRegUnit RU) {
134 return static_cast<unsigned>(RU);
135}
136
137#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
138 DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \
139 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
140 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
141 DECL(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */ \
142 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
143 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
144 DECL(VMEM_GROUP) /* vmem group */ \
145 DECL(LDS_ACCESS) /* lds read & write */ \
146 DECL(GDS_ACCESS) /* gds read & write */ \
147 DECL(SQ_MESSAGE) /* send message */ \
148 DECL(SCC_WRITE) /* write to SCC from barrier */ \
149 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
150 DECL(SMEM_GROUP) /* scalar-memory group */ \
151 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
152 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
153 DECL(EXP_POS_ACCESS) /* write to export position */ \
154 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
155 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
156 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */ \
157 DECL(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */ \
158 DECL(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */ \
159 DECL(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */ \
160 DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \
161 DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \
162 DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \
163 DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */ \
164 DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */
165
166// clang-format off
167#define AMDGPU_EVENT_ENUM(Name) Name,
168enum WaitEventType {
170 NUM_WAIT_EVENTS
171};
172#undef AMDGPU_EVENT_ENUM
173} // namespace
174
namespace llvm {
/// Allow WaitEventType to be iterated with llvm::enum_seq (see
/// wait_events() below).
template <> struct enum_iteration_traits<WaitEventType> {
  static constexpr bool is_iterable = true;
};
} // namespace llvm
180
181namespace {
182
/// Return an iterator over all events between VMEM_ACCESS (the first event)
/// and \c MaxEvent (exclusive, default value yields an enumeration over
/// all counters).
auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
  return enum_seq(VMEM_ACCESS, MaxEvent);
}
189
190#define AMDGPU_EVENT_NAME(Name) #Name,
191static constexpr StringLiteral WaitEventTypeName[] = {
193};
194#undef AMDGPU_EVENT_NAME
195static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
196 return WaitEventTypeName[Event];
197}
198// clang-format on
199
// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  // Number of VMEM types (not a real type; used to size lookup tables).
  NUM_VMEM_TYPES
};
214
// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true, and does not cover VA_VDST or VM_VSRC.
// The initializer order must match the InstCounterType enumeration order.
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
    AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
    AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT, AMDGPU::S_WAIT_ASYNCCNT};
222
223static bool updateVMCntOnly(const MachineInstr &Inst) {
224 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
226}
227
#ifndef NDEBUG
// Debug-only helper: true when the pass is running with the pre-gfx12
// ("normal") counter set, i.e. MaxCounter covers only NUM_NORMAL_INST_CNTS.
static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;
}
#endif // NDEBUG
233
234VmemType getVmemType(const MachineInstr &Inst) {
235 assert(updateVMCntOnly(Inst));
236 if (!SIInstrInfo::isImage(Inst))
237 return VMEM_NOSAMPLER;
238 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
239 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
240 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
241
242 if (BaseInfo->BVH)
243 return VMEM_BVH;
244
245 // We have to make an additional check for isVSAMPLE here since some
246 // instructions don't have a sampler, but are still classified as sampler
247 // instructions for the purposes of e.g. waitcnt.
248 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
249 return VMEM_SAMPLER;
250
251 return VMEM_NOSAMPLER;
252}
253
// Tighten the pending wait on counter \p T: a smaller count is a stricter
// wait, so keep the minimum of the existing and requested counts.
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  Wait.set(T, std::min(Wait.get(T), Count));
}
257
// ~0u is the sentinel for "no wait required" on counter \p T.
void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { Wait.set(T, ~0u); }
259
260/// A small set of events.
261class WaitEventSet {
262 unsigned Mask = 0;
263
264public:
265 WaitEventSet() = default;
266 explicit constexpr WaitEventSet(WaitEventType Event) {
267 static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,
268 "Not enough bits in Mask for all the events");
269 Mask |= 1 << Event;
270 }
271 constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
272 for (auto &E : Events) {
273 Mask |= 1 << E;
274 }
275 }
276 void insert(const WaitEventType &Event) { Mask |= 1 << Event; }
277 void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }
278 void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
279 bool contains(const WaitEventType &Event) const {
280 return Mask & (1 << Event);
281 }
282 /// \Returns true if this set contains all elements of \p Other.
283 bool contains(const WaitEventSet &Other) const {
284 return (~Mask & Other.Mask) == 0;
285 }
286 /// \Returns the intersection of this and \p Other.
287 WaitEventSet operator&(const WaitEventSet &Other) const {
288 auto Copy = *this;
289 Copy.Mask &= Other.Mask;
290 return Copy;
291 }
292 /// \Returns the union of this and \p Other.
293 WaitEventSet operator|(const WaitEventSet &Other) const {
294 auto Copy = *this;
295 Copy.Mask |= Other.Mask;
296 return Copy;
297 }
298 /// This set becomes the union of this and \p Other.
299 WaitEventSet &operator|=(const WaitEventSet &Other) {
300 Mask |= Other.Mask;
301 return *this;
302 }
303 /// This set becomes the intersection of this and \p Other.
304 WaitEventSet &operator&=(const WaitEventSet &Other) {
305 Mask &= Other.Mask;
306 return *this;
307 }
308 bool operator==(const WaitEventSet &Other) const {
309 return Mask == Other.Mask;
310 }
311 bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }
312 bool empty() const { return Mask == 0; }
313 /// \Returns true if the set contains more than one element.
314 bool twoOrMore() const { return Mask & (Mask - 1); }
315 operator bool() const { return !empty(); }
316 void print(raw_ostream &OS) const {
317 ListSeparator LS(", ");
318 for (WaitEventType Event : wait_events()) {
319 if (contains(Event))
320 OS << LS << getWaitEventTypeName(Event);
321 }
322 }
323 LLVM_DUMP_METHOD void dump() const;
324};
325
326void WaitEventSet::dump() const {
327 print(dbgs());
328 dbgs() << "\n";
329}
330
331class WaitcntBrackets;
332
333// This abstracts the logic for generating and updating S_WAIT* instructions
334// away from the analysis that determines where they are needed. This was
335// done because the set of counters and instructions for waiting on them
336// underwent a major shift with gfx12, sufficiently so that having this
337// abstraction allows the main analysis logic to be simpler than it would
338// otherwise have had to become.
339class WaitcntGenerator {
340protected:
341 const GCNSubtarget &ST;
342 const SIInstrInfo &TII;
343 AMDGPU::IsaVersion IV;
344 InstCounterType MaxCounter;
345 bool OptNone;
346 bool ExpandWaitcntProfiling = false;
347 const AMDGPU::HardwareLimits &Limits;
348
349public:
350 WaitcntGenerator() = delete;
351 WaitcntGenerator(const WaitcntGenerator &) = delete;
352 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
353 const AMDGPU::HardwareLimits &Limits)
354 : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
355 IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
356 OptNone(MF.getFunction().hasOptNone() ||
357 MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
358 ExpandWaitcntProfiling(
359 MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
360 Limits(Limits) {}
361
362 // Return true if the current function should be compiled with no
363 // optimization.
364 bool isOptNone() const { return OptNone; }
365
366 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
367
368 // Edits an existing sequence of wait count instructions according
369 // to an incoming Waitcnt value, which is itself updated to reflect
370 // any new wait count instructions which may need to be generated by
371 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
372 // were made.
373 //
374 // This editing will usually be merely updated operands, but it may also
375 // delete instructions if the incoming Wait value indicates they are not
376 // needed. It may also remove existing instructions for which a wait
377 // is needed if it can be determined that it is better to generate new
378 // instructions later, as can happen on gfx12.
379 virtual bool
380 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
381 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
383
384 // Transform a soft waitcnt into a normal one.
385 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
386
387 // Generates new wait count instructions according to the value of
388 // Wait, returning true if any new instructions were created.
389 // ScoreBrackets is used for profiling expansion.
390 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
392 AMDGPU::Waitcnt Wait,
393 const WaitcntBrackets &ScoreBrackets) = 0;
394
395 // Returns the WaitEventSet that corresponds to counter \p T.
396 virtual const WaitEventSet &getWaitEvents(InstCounterType T) const = 0;
397
398 /// \returns the counter that corresponds to event \p E.
399 InstCounterType getCounterFromEvent(WaitEventType E) const {
400 for (auto T : inst_counter_types()) {
401 if (getWaitEvents(T).contains(E))
402 return T;
403 }
404 llvm_unreachable("event type has no associated counter");
405 }
406
407 // Returns a new waitcnt with all counters except VScnt set to 0. If
408 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
409 // AsyncCnt always defaults to ~0u (don't wait for it). It is only updated
410 // when a call to @llvm.amdgcn.wait.asyncmark() is processed.
411 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
412
413 virtual ~WaitcntGenerator() = default;
414};
415
416class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
417 static constexpr const WaitEventSet
418 WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
419 WaitEventSet(
420 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
421 WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
422 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
423 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
424 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
425 WaitEventSet(),
426 WaitEventSet(),
427 WaitEventSet(),
428 WaitEventSet(),
429 WaitEventSet(),
430 WaitEventSet()};
431
432public:
433 using WaitcntGenerator::WaitcntGenerator;
434 bool
435 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
436 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
437 MachineBasicBlock::instr_iterator It) const override;
438
439 bool createNewWaitcnt(MachineBasicBlock &Block,
441 AMDGPU::Waitcnt Wait,
442 const WaitcntBrackets &ScoreBrackets) override;
443
444 const WaitEventSet &getWaitEvents(InstCounterType T) const override {
445 return WaitEventMaskForInstPreGFX12[T];
446 }
447
448 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
449};
450
451class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
452protected:
453 bool IsExpertMode;
454 static constexpr const WaitEventSet
455 WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
456 WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
457 WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
458 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
459 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
460 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
461 WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
462 WaitEventSet({VMEM_BVH_READ_ACCESS}),
463 WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
464 WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
465 WaitEventSet({ASYNC_ACCESS}),
466 WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
467 VGPR_XDL_WRITE}),
468 WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
469
470public:
471 WaitcntGeneratorGFX12Plus() = delete;
472 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
473 InstCounterType MaxCounter,
474 const AMDGPU::HardwareLimits &Limits,
475 bool IsExpertMode)
476 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
477
478 bool
479 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
480 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
481 MachineBasicBlock::instr_iterator It) const override;
482
483 bool createNewWaitcnt(MachineBasicBlock &Block,
485 AMDGPU::Waitcnt Wait,
486 const WaitcntBrackets &ScoreBrackets) override;
487
488 const WaitEventSet &getWaitEvents(InstCounterType T) const override {
489 return WaitEventMaskForInstGFX12Plus[T];
490 }
491
492 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
493};
494
// Flags indicating which counters should be flushed in a loop preheader.
struct PreheaderFlushFlags {
  bool FlushVmCnt = false; // Flush the VM (load) counter before the loop.
  bool FlushDsCnt = false; // Flush the DS counter before the loop.
};
500
501class SIInsertWaitcnts {
502 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
503 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
504 MachineLoopInfo &MLI;
505 MachinePostDominatorTree &PDT;
506 AliasAnalysis *AA = nullptr;
507 MachineFunction &MF;
508
509 struct BlockInfo {
510 std::unique_ptr<WaitcntBrackets> Incoming;
511 bool Dirty = true;
512 };
513
514 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
515
516 bool ForceEmitWaitcnt[NUM_INST_CNTS] = {};
517
518 std::unique_ptr<WaitcntGenerator> WCG;
519
520 // Remember call and return instructions in the function.
521 DenseSet<MachineInstr *> CallInsts;
522 DenseSet<MachineInstr *> ReturnInsts;
523
524 // Remember all S_ENDPGM instructions. The boolean flag is true if there might
525 // be outstanding stores but definitely no outstanding scratch stores, to help
526 // with insertion of DEALLOC_VGPRS messages.
527 DenseMap<MachineInstr *, bool> EndPgmInsts;
528
529 AMDGPU::HardwareLimits Limits;
530
531public:
532 const GCNSubtarget &ST;
533 const SIInstrInfo &TII;
534 const SIRegisterInfo &TRI;
535 const MachineRegisterInfo &MRI;
536 InstCounterType SmemAccessCounter;
537 InstCounterType MaxCounter;
538 bool IsExpertMode = false;
539
540 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
541 AliasAnalysis *AA, MachineFunction &MF)
542 : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
543 TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
544 MRI(MF.getRegInfo()) {
545 (void)ForceExpCounter;
546 (void)ForceLgkmCounter;
547 (void)ForceVMCounter;
548 }
549
550 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
551
552 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
553 const WaitcntBrackets &Brackets);
554 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
555 const WaitcntBrackets &ScoreBrackets);
556 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
557 bool isDSRead(const MachineInstr &MI) const;
558 bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
559 bool run();
560
561 void setForceEmitWaitcnt() {
562// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
563// For debug builds, get the debug counter info and adjust if need be
564#ifndef NDEBUG
565 if (DebugCounter::isCounterSet(ForceExpCounter) &&
566 DebugCounter::shouldExecute(ForceExpCounter)) {
567 ForceEmitWaitcnt[EXP_CNT] = true;
568 } else {
569 ForceEmitWaitcnt[EXP_CNT] = false;
570 }
571
572 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
573 DebugCounter::shouldExecute(ForceLgkmCounter)) {
574 ForceEmitWaitcnt[DS_CNT] = true;
575 ForceEmitWaitcnt[KM_CNT] = true;
576 } else {
577 ForceEmitWaitcnt[DS_CNT] = false;
578 ForceEmitWaitcnt[KM_CNT] = false;
579 }
580
581 if (DebugCounter::isCounterSet(ForceVMCounter) &&
582 DebugCounter::shouldExecute(ForceVMCounter)) {
583 ForceEmitWaitcnt[LOAD_CNT] = true;
584 ForceEmitWaitcnt[SAMPLE_CNT] = true;
585 ForceEmitWaitcnt[BVH_CNT] = true;
586 } else {
587 ForceEmitWaitcnt[LOAD_CNT] = false;
588 ForceEmitWaitcnt[SAMPLE_CNT] = false;
589 ForceEmitWaitcnt[BVH_CNT] = false;
590 }
591
592 ForceEmitWaitcnt[VA_VDST] = false;
593 ForceEmitWaitcnt[VM_VSRC] = false;
594#endif // NDEBUG
595 }
596
597 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
598 // instruction.
599 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
600 switch (Inst.getOpcode()) {
601 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
602 case AMDGPU::GLOBAL_INV:
603 return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
604 // VGPRs
605 case AMDGPU::GLOBAL_WB:
606 case AMDGPU::GLOBAL_WBINV:
607 return VMEM_WRITE_ACCESS; // tracked using storecnt
608 default:
609 break;
610 }
611
612 // Maps VMEM access types to their corresponding WaitEventType.
613 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
614 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
615
617 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
618 // these should use VM_CNT.
619 if (!ST.hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
620 return VMEM_ACCESS;
621 if (Inst.mayStore() &&
622 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
623 if (TII.mayAccessScratch(Inst))
624 return SCRATCH_WRITE_ACCESS;
625 return VMEM_WRITE_ACCESS;
626 }
627 if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
628 return VMEM_ACCESS;
629 return VmemReadMapping[getVmemType(Inst)];
630 }
631
632 std::optional<WaitEventType>
633 getExpertSchedulingEventType(const MachineInstr &Inst) const;
634
635 bool isAsync(const MachineInstr &MI) const {
637 return false;
639 return true;
640 const MachineOperand *Async =
641 TII.getNamedOperand(MI, AMDGPU::OpName::IsAsync);
642 return Async && (Async->getImm());
643 }
644
645 bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
646 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
647 }
648
649 bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
650 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
651 }
652
653 bool isVmemAccess(const MachineInstr &MI) const;
654 bool generateWaitcntInstBefore(MachineInstr &MI,
655 WaitcntBrackets &ScoreBrackets,
656 MachineInstr *OldWaitcntInstr,
657 PreheaderFlushFlags FlushFlags);
658 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
660 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
661 MachineInstr *OldWaitcntInstr);
662 /// \returns all events that correspond to \p Inst.
663 WaitEventSet getEventsFor(const MachineInstr &Inst) const;
664 void updateEventWaitcntAfter(MachineInstr &Inst,
665 WaitcntBrackets *ScoreBrackets);
666 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
667 MachineBasicBlock *Block) const;
668 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
669 WaitcntBrackets &ScoreBrackets);
670 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
671 WaitcntBrackets &ScoreBrackets);
672 /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
673 /// Legalizer. Returns true if block was modified.
674 bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
675 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
676 bool ExpertMode) const;
677 const WaitEventSet &getWaitEvents(InstCounterType T) const {
678 return WCG->getWaitEvents(T);
679 }
680 InstCounterType getCounterFromEvent(WaitEventType E) const {
681 return WCG->getCounterFromEvent(E);
682 }
683};
684
685// This objects maintains the current score brackets of each wait counter, and
686// a per-register scoreboard for each wait counter.
687//
688// We also maintain the latest score for every event type that can change the
689// waitcnt in order to know if there are multiple types of events within
690// the brackets. When multiple types of event happen in the bracket,
691// wait count may get decreased out of order, therefore we need to put in
692// "s_waitcnt 0" before use.
693class WaitcntBrackets {
694public:
695 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
696 assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
697 }
698
699#ifndef NDEBUG
700 ~WaitcntBrackets() {
701 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
702 for (auto &[ID, Val] : VMem) {
703 if (Val.empty())
704 ++NumUnusedVmem;
705 }
706 for (auto &[ID, Val] : SGPRs) {
707 if (Val.empty())
708 ++NumUnusedSGPRs;
709 }
710
711 if (NumUnusedVmem || NumUnusedSGPRs) {
712 errs() << "WaitcntBracket had unused entries at destruction time: "
713 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
714 << " SGPR unused entries\n";
715 std::abort();
716 }
717 }
718#endif
719
720 bool isSmemCounter(InstCounterType T) const {
721 return T == Context->SmemAccessCounter || T == X_CNT;
722 }
723
724 unsigned getOutstanding(InstCounterType T) const {
725 return ScoreUBs[T] - ScoreLBs[T];
726 }
727
728 bool hasPendingVMEM(VMEMID ID, InstCounterType T) const {
729 return getVMemScore(ID, T) > getScoreLB(T);
730 }
731
732 /// \Return true if we have no score entries for counter \p T.
733 bool empty(InstCounterType T) const { return getScoreRange(T) == 0; }
734
735private:
736 unsigned getScoreLB(InstCounterType T) const {
738 return ScoreLBs[T];
739 }
740
741 unsigned getScoreUB(InstCounterType T) const {
743 return ScoreUBs[T];
744 }
745
746 unsigned getScoreRange(InstCounterType T) const {
747 return getScoreUB(T) - getScoreLB(T);
748 }
749
750 unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
751 auto It = SGPRs.find(RU);
752 return It != SGPRs.end() ? It->second.get(T) : 0;
753 }
754
755 unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
756 auto It = VMem.find(TID);
757 return It != VMem.end() ? It->second.Scores[T] : 0;
758 }
759
760public:
761 bool merge(const WaitcntBrackets &Other);
762
763 bool counterOutOfOrder(InstCounterType T) const;
764 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
765 simplifyWaitcnt(Wait, Wait);
766 }
767 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
768 AMDGPU::Waitcnt &UpdateWait) const;
769 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
770 void simplifyWaitcnt(Waitcnt &Wait, InstCounterType T) const;
771 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
772 AMDGPU::Waitcnt &UpdateWait) const;
773 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
774 AMDGPU::Waitcnt &UpdateWait) const;
775
776 void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
777 AMDGPU::Waitcnt &Wait) const;
778 void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
779 AMDGPU::Waitcnt &Wait) const;
780 AMDGPU::Waitcnt determineAsyncWait(unsigned N);
781 void tryClearSCCWriteEvent(MachineInstr *Inst);
782
783 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
784 void applyWaitcnt(InstCounterType T, unsigned Count);
785 void applyWaitcnt(const AMDGPU::Waitcnt &Wait, InstCounterType T);
786 void updateByEvent(WaitEventType E, MachineInstr &MI);
787 void recordAsyncMark(MachineInstr &MI);
788
789 bool hasPendingEvent() const { return !PendingEvents.empty(); }
790 bool hasPendingEvent(WaitEventType E) const {
791 return PendingEvents.contains(E);
792 }
793 bool hasPendingEvent(InstCounterType T) const {
794 bool HasPending = PendingEvents & Context->getWaitEvents(T);
795 assert(HasPending == !empty(T) &&
796 "Expected pending events iff scoreboard is not empty");
797 return HasPending;
798 }
799
800 bool hasMixedPendingEvents(InstCounterType T) const {
801 WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
802 // Return true if more than one bit is set in Events.
803 return Events.twoOrMore();
804 }
805
806 bool hasPendingFlat() const {
807 return ((LastFlatDsCnt > ScoreLBs[DS_CNT] &&
808 LastFlatDsCnt <= ScoreUBs[DS_CNT]) ||
809 (LastFlatLoadCnt > ScoreLBs[LOAD_CNT] &&
810 LastFlatLoadCnt <= ScoreUBs[LOAD_CNT]));
811 }
812
813 void setPendingFlat() {
814 LastFlatLoadCnt = ScoreUBs[LOAD_CNT];
815 LastFlatDsCnt = ScoreUBs[DS_CNT];
816 }
817
818 bool hasPendingGDS() const {
819 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
820 }
821
822 unsigned getPendingGDSWait() const {
823 return std::min(getScoreUB(DS_CNT) - LastGDS,
824 getWaitCountMax(Context->getLimits(), DS_CNT) - 1);
825 }
826
827 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
828
829 // Return true if there might be pending writes to the vgpr-interval by VMEM
830 // instructions with types different from V.
831 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
832 for (MCRegUnit RU : regunits(Reg)) {
833 auto It = VMem.find(toVMEMID(RU));
834 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
835 return true;
836 }
837 return false;
838 }
839
840 void clearVgprVmemTypes(MCPhysReg Reg) {
841 for (MCRegUnit RU : regunits(Reg)) {
842 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
843 It->second.VMEMTypes = 0;
844 if (It->second.empty())
845 VMem.erase(It);
846 }
847 }
848 }
849
850 void setStateOnFunctionEntryOrReturn() {
851 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
852 getWaitCountMax(Context->getLimits(), STORE_CNT));
853 PendingEvents |= Context->getWaitEvents(STORE_CNT);
854 }
855
856 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
857 return LDSDMAStores;
858 }
859
860 bool hasPointSampleAccel(const MachineInstr &MI) const;
861 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
862 MCPhysReg RU) const;
863
864 void print(raw_ostream &) const;
865 void dump() const { print(dbgs()); }
866
867 // Free up memory by removing empty entries from the DenseMap that track event
868 // scores.
869 void purgeEmptyTrackingData();
870
871private:
872 struct MergeInfo {
873 unsigned OldLB;
874 unsigned OtherLB;
875 unsigned MyShift;
876 unsigned OtherShift;
877 };
878
879 using CounterValueArray = std::array<unsigned, NUM_INST_CNTS>;
880
881 void determineWaitForScore(InstCounterType T, unsigned Score,
882 AMDGPU::Waitcnt &Wait) const;
883
884 static bool mergeScore(const MergeInfo &M, unsigned &Score,
885 unsigned OtherScore);
886 bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
887 ArrayRef<CounterValueArray> OtherMarks);
888
890 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
891 if (!Context->TRI.isInAllocatableClass(Reg))
892 return {{}, {}};
893 const TargetRegisterClass *RC = Context->TRI.getPhysRegBaseClass(Reg);
894 unsigned Size = Context->TRI.getRegSizeInBits(*RC);
895 if (Size == 16 && Context->ST.hasD16Writes32BitVgpr())
896 Reg = Context->TRI.get32BitRegister(Reg);
897 return Context->TRI.regunits(Reg);
898 }
899
900 void setScoreLB(InstCounterType T, unsigned Val) {
902 ScoreLBs[T] = Val;
903 }
904
905 void setScoreUB(InstCounterType T, unsigned Val) {
907 ScoreUBs[T] = Val;
908
909 if (T != EXP_CNT)
910 return;
911
912 if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))
913 ScoreLBs[EXP_CNT] =
914 ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);
915 }
916
  /// Record score \p Val for counter \p T on every tracked unit of \p Reg.
  /// SCC has its own scalar slot; vector registers go into the VMem map and
  /// SGPRs into the SGPRs map. Any other register is a tracking bug.
  void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
    const SIRegisterInfo &TRI = Context->TRI;
    if (Reg == AMDGPU::SCC) {
      SCCScore = Val;
    } else if (TRI.isVectorRegister(Context->MRI, Reg)) {
      for (MCRegUnit RU : regunits(Reg))
        VMem[toVMEMID(RU)].Scores[T] = Val;
    } else if (TRI.isSGPRReg(Context->MRI, Reg)) {
      for (MCRegUnit RU : regunits(Reg))
        SGPRs[RU].get(T) = Val;
    } else {
      llvm_unreachable("Register cannot be tracked/unknown register!");
    }
  }
931
  /// Record score \p Val for counter \p T on a single VMem tracking slot
  /// (a VGPR register unit or an LDS DMA ID).
  void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
    VMem[TID].Scores[T] = Val;
  }

  void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
                         unsigned Val);
938
  /// Back-pointer to the pass, providing target info, limits and event maps.
  const SIInsertWaitcnts *Context;

  // Per-counter score brackets: (ScoreLB, ScoreUB] bounds the scores of all
  // still-outstanding events for each counter.
  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  WaitEventSet PendingEvents;
  // Remember the last flat memory operation.
  unsigned LastFlatDsCnt = 0;
  unsigned LastFlatLoadCnt = 0;
  // Remember the last GDS operation.
  unsigned LastGDS = 0;

  // The score tracking logic is fragmented as follows:
  // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
  // - SGPRs: SGPR RegUnits
  // - SCC: Non-allocatable and not general purpose: not a SGPR.
  //
  // For the VMem case, if the key is within the range of LDS DMA IDs,
  // then the corresponding index into the `LDSDMAStores` vector below is:
  //    Key - LDSDMA_BEGIN - 1
  // This is because LDSDMA_BEGIN is a generic entry and does not have an
  // associated MachineInstr.
  //
  // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
  /// Tracking data for one VMem slot (a VGPR register unit or an LDS DMA ID).
  struct VMEMInfo {
    // Scores for all instruction counters. Zero-initialized.
    CounterValueArray Scores{};
    // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
    unsigned VMEMTypes = 0;

    // True when no counter score and no vmem-type is recorded; such entries
    // can be purged from the map.
    bool empty() const { return all_of(Scores, equal_to(0)) && !VMEMTypes; }
  };
971
  /// Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
  /// pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
  class SGPRInfo {
    /// Either DS_CNT or KM_CNT score; both counters map onto this one slot.
    unsigned ScoreDsKmCnt = 0;
    unsigned ScoreXCnt = 0;

  public:
    /// Score for counter \p T. X_CNT has its own slot; DS_CNT and KM_CNT
    /// share one.
    unsigned get(InstCounterType T) const {
      assert((T == DS_CNT || T == KM_CNT || T == X_CNT) && "Invalid counter");
      return T == X_CNT ? ScoreXCnt : ScoreDsKmCnt;
    }
    /// Mutable access to the score for counter \p T.
    unsigned &get(InstCounterType T) {
      assert((T == DS_CNT || T == KM_CNT || T == X_CNT) && "Invalid counter");
      return T == X_CNT ? ScoreXCnt : ScoreDsKmCnt;
    }

    /// True when both tracked scores are zero (entry can be purged).
    bool empty() const { return !ScoreDsKmCnt && !ScoreXCnt; }
  };
991
992 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
993 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
994
995 // Reg score for SCC.
996 unsigned SCCScore = 0;
997 // The unique instruction that has an SCC write pending, if there is one.
998 const MachineInstr *PendingSCCWrite = nullptr;
999
1000 // Store representative LDS DMA operations. The only useful info here is
1001 // alias info. One store is kept per unique AAInfo.
1002 SmallVector<const MachineInstr *> LDSDMAStores;
1003
1004 // State of all counters at each async mark encountered so far.
1006
1007 // But in the rare pathological case, a nest of loops that pushes marks
1008 // without waiting on any mark can cause AsyncMarks to grow very large. We cap
1009 // it to a reasonable limit. We can tune this later or potentially introduce a
1010 // user option to control the value.
1011 static constexpr unsigned MaxAsyncMarks = 16;
1012
1013 // Track the upper bound score for async operations that are not part of a
1014 // mark yet. Initialized to all zeros.
1015 CounterValueArray AsyncScore{};
1016};
1017
1018class SIInsertWaitcntsLegacy : public MachineFunctionPass {
1019public:
1020 static char ID;
1021 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
1022
1023 bool runOnMachineFunction(MachineFunction &MF) override;
1024
1025 StringRef getPassName() const override {
1026 return "SI insert wait instructions";
1027 }
1028
1029 void getAnalysisUsage(AnalysisUsage &AU) const override {
1030 AU.setPreservesCFG();
1031 AU.addRequired<MachineLoopInfoWrapperPass>();
1032 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
1033 AU.addUsedIfAvailable<AAResultsWrapperPass>();
1034 AU.addPreserved<AAResultsWrapperPass>();
1036 }
1037};
1038
1039} // end anonymous namespace
1040
/// Record score \p Score for counter \p CntTy on the register referenced by
/// operand \p Op (delegates to setRegScore on the MC register).
void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
                                        InstCounterType CntTy, unsigned Score) {
  setRegScore(Op.getReg().asMCReg(), CntTy, Score);
}
1045
1046// Return true if the subtarget is one that enables Point Sample Acceleration
1047// and the MachineInstr passed in is one to which it might be applied (the
1048// hardware makes this decision based on several factors, but we can't determine
1049// this at compile time, so we have to assume it might be applied if the
1050// instruction supports it).
1051bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
1052 if (!Context->ST.hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
1053 return false;
1054
1055 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
1056 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1058 return BaseInfo->PointSampleAccel;
1059}
1060
1061// Return true if the subtarget enables Point Sample Acceleration, the supplied
1062// MachineInstr is one to which it might be applied and the supplied interval is
1063// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
1064// (this is the type that a point sample accelerated instruction effectively
1065// becomes)
1066bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
1067 MCPhysReg Reg) const {
1068 if (!hasPointSampleAccel(MI))
1069 return false;
1070
1071 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
1072}
1073
/// Advance the score bracket for the counter associated with event \p E and
/// attach the new score to the registers that \p Inst makes busy (sources for
/// EXP_CNT, address operands for X_CNT, destinations for the load counters).
void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = Context->getCounterFromEvent(E);
  assert(T < Context->MaxCounter);

  unsigned UB = getScoreUB(T);
  unsigned CurrScore = UB + 1;
  if (CurrScore == 0)
    report_fatal_error("InsertWaitcnt score wraparound");
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not.
  // Examples include vm_cnt when buffer-store or lgkm_cnt when send-message.
  PendingEvents.insert(E);
  setScoreUB(T, CurrScore);

  const SIRegisterInfo &TRI = Context->TRI;
  const MachineRegisterInfo &MRI = Context->MRI;
  const SIInstrInfo &TII = Context->TII;

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII.isDS(Inst) && Inst.mayLoadOrStore()) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (const auto *AddrOp = TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
        setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);

      if (Inst.mayStore()) {
        if (const auto *Data0 =
                TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
          setScoreByOperand(*Data0, EXP_CNT, CurrScore);
        if (const auto *Data1 =
                TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
          setScoreByOperand(*Data1, EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        // Returning atomics: every vector-register use must be protected.
        for (const MachineOperand &Op : Inst.all_uses()) {
          if (TRI.isVectorRegister(MRI, Op.getReg()))
            setScoreByOperand(Op, EXP_CNT, CurrScore);
        }
      }
    } else if (TII.isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      }
    } else if (TII.isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      }
    } else if (TII.isMTBUF(Inst)) {
      if (Inst.mayStore())
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
    } else if (TII.isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      }
    } else if (TII.isLDSDIR(Inst)) {
      // LDSDIR instructions attach the score to the destination.
      setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
                        EXP_CNT, CurrScore);
    } else {
      if (TII.isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (MachineOperand &DefMO : Inst.all_defs()) {
          if (TRI.isVGPR(MRI, DefMO.getReg())) {
            setScoreByOperand(DefMO, EXP_CNT, CurrScore);
          }
        }
      }
      for (const MachineOperand &Op : Inst.all_uses()) {
        if (TRI.isVectorRegister(MRI, Op.getReg()))
          setScoreByOperand(Op, EXP_CNT, CurrScore);
      }
    }
  } else if (T == X_CNT) {
    WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
    if (PendingEvents.contains(OtherEvent)) {
      // Hardware inserts an implicit xcnt between interleaved
      // SMEM and VMEM operations. So there will never be
      // outstanding address translations for both SMEM and
      // VMEM at the same time.
      setScoreLB(T, getScoreUB(T) - 1);
      PendingEvents.remove(OtherEvent);
    }
    for (const MachineOperand &Op : Inst.all_uses())
      setScoreByOperand(Op, T, CurrScore);
  } else if (T == VA_VDST || T == VM_VSRC) {
    // Match the score to the VGPR destination or source registers as
    // appropriate
    for (const MachineOperand &Op : Inst.operands()) {
      if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
          (T == VM_VSRC && Op.isDef()))
        continue;
      if (TRI.isVectorRegister(Context->MRI, Op.getReg()))
        setScoreByOperand(Op, T, CurrScore);
    }
  } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
    // Match the score to the destination registers.
    //
    // Check only explicit operands. Stores, especially spill stores, include
    // implicit uses and defs of their super registers which would create an
    // artificial dependency, while these are there only for register liveness
    // accounting purposes.
    //
    // Special cases where implicit register defs exists, such as M0 or VCC,
    // but none with memory instructions.
    for (const MachineOperand &Op : Inst.defs()) {
      if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
        if (!TRI.isVectorRegister(MRI, Op.getReg())) // TODO: add wrapper
          continue;
        if (updateVMCntOnly(Inst)) {
          // updateVMCntOnly should only leave us with VGPRs
          // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
          // defs. That's required for a sane index into `VgprMemTypes` below
          assert(TRI.isVectorRegister(MRI, Op.getReg()));
          VmemType V = getVmemType(Inst);
          unsigned char TypesMask = 1 << V;
          // If instruction can have Point Sample Accel applied, we have to flag
          // this with another potential dependency
          if (hasPointSampleAccel(Inst))
            TypesMask |= 1 << VMEM_NOSAMPLER;
          for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
            VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
        }
      }
      setScoreByOperand(Op, T, CurrScore);
    }
    if (Inst.mayStore() &&
        (TII.isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
      // written can be accessed. A load from LDS to VMEM does not need a wait.
      //
      // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
      // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
      // store. The "Slot" is the index into LDSDMAStores + 1.
      unsigned Slot = 0;
      for (const auto *MemOp : Inst.memoperands()) {
        if (!MemOp->isStore() ||
            MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        // Comparing just AA info does not guarantee memoperands are equal
        // in general, but this is so for LDS DMA in practice.
        auto AAI = MemOp->getAAInfo();
        // Alias scope information gives a way to definitely identify an
        // original memory object and practically produced in the module LDS
        // lowering pass. If there is no scope available we will not be able
        // to disambiguate LDS aliasing as after the module lowering all LDS
        // is squashed into a single big object.
        if (!AAI || !AAI.Scope)
          break;
        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
              Slot = I + 1;
              break;
            }
          }
        }
        if (Slot)
          break;
        // The slot may not be valid because it can be >= NUM_LDSDMA which
        // means the scoreboard cannot track it. We still want to preserve the
        // MI in order to check alias information, though.
        LDSDMAStores.push_back(&Inst);
        Slot = LDSDMAStores.size();
        break;
      }
      // Always score the generic LDS DMA slot; score the specific slot too
      // when it fits in the tracked range.
      setVMemScore(LDSDMA_BEGIN, T, CurrScore);
      if (Slot && Slot < NUM_LDSDMA)
        setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
    }

    // FIXME: Not supported on GFX12 yet. Newer async operations use other
    // counters too, so will need a map from instruction or event types to
    // counter types.
    if (Context->isAsyncLdsDmaWrite(Inst) && T == LOAD_CNT) {
      // NOTE(review): the opening line of an assert was lost in extraction
      // here (the string below is its message) — restore from upstream.
             "unexpected GFX1250 instruction");
      AsyncScore[T] = CurrScore;
    }

    // NOTE(review): a guard line was lost in extraction here (presumably a
    // check that Inst defines SCC) — restore from upstream.
      setRegScore(AMDGPU::SCC, T, CurrScore);
      PendingSCCWrite = &Inst;
    }
  }
}
1276
/// Push the accumulated async score as a new mark and reset the accumulator;
/// called when \p Inst is an async-mark instruction.
void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
  // In the absence of loops, AsyncMarks can grow linearly with the program
  // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a
  // limit every time we push a new mark, but that seems like unnecessary work
  // in practical cases. We do separately truncate the array when processing a
  // loop, which should be sufficient.
  AsyncMarks.push_back(AsyncScore);
  AsyncScore = {};
  LLVM_DEBUG({
    dbgs() << "recordAsyncMark:\n" << Inst;
    for (const auto &Mark : AsyncMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
  });
}
1293
/// Dump the full bracket state (per-counter score ranges, per-reg-unit scores,
/// pending events, async score and marks) to \p OS for debugging.
void WaitcntBrackets::print(raw_ostream &OS) const {
  const GCNSubtarget &ST = Context->ST;

  for (auto T : inst_counter_types(Context->MaxCounter)) {
    unsigned SR = getScoreRange(T);
    switch (T) {
    case LOAD_CNT:
      OS << "    " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
         << SR << "):";
      break;
    case DS_CNT:
      OS << "    " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
         << SR << "):";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << SR << "):";
      break;
    case STORE_CNT:
      OS << "    " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
         << SR << "):";
      break;
    case SAMPLE_CNT:
      OS << "    SAMPLE_CNT(" << SR << "):";
      break;
    case BVH_CNT:
      OS << "    BVH_CNT(" << SR << "):";
      break;
    case KM_CNT:
      OS << "    KM_CNT(" << SR << "):";
      break;
    case X_CNT:
      OS << "    X_CNT(" << SR << "):";
      break;
    case ASYNC_CNT:
      OS << "    ASYNC_CNT(" << SR << "):";
      break;
    case VA_VDST:
      OS << "    VA_VDST(" << SR << "): ";
      break;
    case VM_VSRC:
      OS << "    VM_VSRC(" << SR << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << SR << "):";
      break;
    }

    if (SR != 0) {
      // Print vgpr scores.
      unsigned LB = getScoreLB(T);

      // Sort the keys for deterministic output (DenseMap order is not).
      SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
      sort(SortedVMEMIDs);

      for (auto ID : SortedVMEMIDs) {
        unsigned RegScore = VMem.at(ID).Scores[T];
        if (RegScore <= LB)
          continue;
        unsigned RelScore = RegScore - LB - 1;
        if (ID < REGUNITS_END) {
          OS << ' ' << RelScore << ":vRU" << ID;
        } else {
          assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
                 "Unhandled/unexpected ID value!");
          OS << ' ' << RelScore << ":LDSDMA" << ID;
        }
      }

      // Also need to print sgpr scores for lgkm_cnt or xcnt.
      if (isSmemCounter(T)) {
        SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
        sort(SortedSMEMIDs);
        for (auto ID : SortedSMEMIDs) {
          unsigned RegScore = SGPRs.at(ID).get(T);
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
        }
      }

      if (T == KM_CNT && SCCScore > 0)
        OS << ' ' << SCCScore << ":scc";
    }
    OS << '\n';
  }

  OS << "Pending Events: ";
  if (hasPendingEvent()) {
    ListSeparator LS;
    for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
      if (hasPendingEvent((WaitEventType)I)) {
        OS << LS << WaitEventTypeName[I];
      }
    }
  } else {
    OS << "none";
  }
  OS << '\n';

  OS << "Async score: ";
  // NOTE(review): AsyncScore is a std::array (CounterValueArray), whose
  // empty() is constexpr-false for NUM_INST_CNTS > 0, so the "none" branch
  // looks unreachable — confirm intent (all_of == 0 check?) against upstream.
  if (AsyncScore.empty())
    OS << "none";
  else
    llvm::interleaveComma(AsyncScore, OS);
  OS << '\n';

  OS << "Async marks: " << AsyncMarks.size() << '\n';

  for (const auto &Mark : AsyncMarks) {
    for (auto T : inst_counter_types()) {
      unsigned MarkedScore = Mark[T];
      switch (T) {
      case LOAD_CNT:
        OS << "    " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM")
           << "_CNT: " << MarkedScore;
        break;
      case DS_CNT:
        OS << "    " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM")
           << "_CNT: " << MarkedScore;
        break;
      case EXP_CNT:
        OS << "    EXP_CNT: " << MarkedScore;
        break;
      case STORE_CNT:
        OS << "    " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS")
           << "_CNT: " << MarkedScore;
        break;
      case SAMPLE_CNT:
        OS << "    SAMPLE_CNT: " << MarkedScore;
        break;
      case BVH_CNT:
        OS << "    BVH_CNT: " << MarkedScore;
        break;
      case KM_CNT:
        OS << "    KM_CNT: " << MarkedScore;
        break;
      case X_CNT:
        OS << "    X_CNT: " << MarkedScore;
        break;
      case ASYNC_CNT:
        OS << "    ASYNC_CNT: " << MarkedScore;
        break;
      default:
        OS << "    UNKNOWN: " << MarkedScore;
        break;
      }
    }
    OS << '\n';
  }
  OS << '\n';
}
1446
/// Simplify \p UpdateWait by removing waits that are redundant based on the
/// current WaitcntBrackets and any other waits specified in \p CheckWait.
/// X_CNT and VM_VSRC get dedicated helpers because their redundancy depends on
/// the other counters' waits, not just their own bracket.
void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
                                      AMDGPU::Waitcnt &UpdateWait) const {
  simplifyWaitcnt(UpdateWait, LOAD_CNT);
  simplifyWaitcnt(UpdateWait, EXP_CNT);
  simplifyWaitcnt(UpdateWait, DS_CNT);
  simplifyWaitcnt(UpdateWait, STORE_CNT);
  simplifyWaitcnt(UpdateWait, SAMPLE_CNT);
  simplifyWaitcnt(UpdateWait, BVH_CNT);
  simplifyWaitcnt(UpdateWait, KM_CNT);
  simplifyXcnt(CheckWait, UpdateWait);
  simplifyWaitcnt(UpdateWait, VA_VDST);
  simplifyVmVsrc(CheckWait, UpdateWait);
  simplifyWaitcnt(UpdateWait, ASYNC_CNT);
}
1463
1464void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1465 unsigned &Count) const {
1466 // The number of outstanding events for this type, T, can be calculated
1467 // as (UB - LB). If the current Count is greater than or equal to the number
1468 // of outstanding events, then the wait for this counter is redundant.
1469 if (Count >= getScoreRange(T))
1470 Count = ~0u;
1471}
1472
/// Convenience overload: simplify the count stored in \p Wait for counter
/// \p T in place.
void WaitcntBrackets::simplifyWaitcnt(Waitcnt &Wait, InstCounterType T) const {
  unsigned Cnt = Wait.get(T);
  simplifyWaitcnt(T, Cnt);
  Wait.set(T, Cnt);
}
1478
/// Remove a redundant X_CNT wait from \p UpdateWait, using the KM_CNT and
/// LOAD_CNT waits in \p CheckWait to prove the translation wait is implied.
void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
                                   AMDGPU::Waitcnt &UpdateWait) const {
  // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
  // optimizations. On entry to a block with multiple predecessors, there may
  // be pending SMEM and VMEM events active at the same time.
  // In such cases, only clear one active event at a time.
  // TODO: Revisit xcnt optimizations for gfx1250.
  // Wait on XCNT is redundant if we are already waiting for a load to complete.
  // SMEM can return out of order, so only omit XCNT wait if we are waiting till
  // zero.
  if (CheckWait.get(KM_CNT) == 0 && hasPendingEvent(SMEM_GROUP))
    UpdateWait.set(X_CNT, ~0u);
  // If we have pending store we cannot optimize XCnt because we do not wait for
  // stores. VMEM loads return in order, so if we only have loads XCnt is
  // decremented to the same number as LOADCnt.
  if (CheckWait.get(LOAD_CNT) != ~0u && hasPendingEvent(VMEM_GROUP) &&
      !hasPendingEvent(STORE_CNT) &&
      CheckWait.get(X_CNT) >= CheckWait.get(LOAD_CNT))
    UpdateWait.set(X_CNT, ~0u);
  simplifyWaitcnt(UpdateWait, X_CNT);
}
1500
1501void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1502 AMDGPU::Waitcnt &UpdateWait) const {
1503 // Waiting for some counters implies waiting for VM_VSRC, since an
1504 // instruction that decrements a counter on completion would have
1505 // decremented VM_VSRC once its VGPR operands had been read.
1506 if (CheckWait.get(VM_VSRC) >=
1507 std::min({CheckWait.get(LOAD_CNT), CheckWait.get(STORE_CNT),
1508 CheckWait.get(SAMPLE_CNT), CheckWait.get(BVH_CNT),
1509 CheckWait.get(DS_CNT)}))
1510 UpdateWait.set(VM_VSRC, ~0u);
1511 simplifyWaitcnt(UpdateWait, VM_VSRC);
1512}
1513
1514void WaitcntBrackets::purgeEmptyTrackingData() {
1515 for (auto &[K, V] : make_early_inc_range(VMem)) {
1516 if (V.empty())
1517 VMem.erase(K);
1518 }
1519 for (auto &[K, V] : make_early_inc_range(SGPRs)) {
1520 if (V.empty())
1521 SGPRs.erase(K);
1522 }
1523}
1524
/// Add to \p Wait whatever wait on counter \p T is needed to retire the event
/// with score \p ScoreToWait. Emits a conservative zero wait when the counter
/// can complete out of order or when a pending FLAT op may report early.
void WaitcntBrackets::determineWaitForScore(InstCounterType T,
                                            unsigned ScoreToWait,
                                            AMDGPU::Waitcnt &Wait) const {
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);

  // If the score falls within the bracket, we need a waitcnt.
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
        !Context->ST.hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // Counter can get decremented out-of-order when there
      // are multiple types event in the bracket. Also emit an s_wait counter
      // with a conservative value of 0 for the counter.
      addWait(Wait, T, 0);
    } else {
      // If a counter has been maxed out avoid overflow by waiting for
      // MAX(CounterType) - 1 instead.
      unsigned NeededWait = std::min(
          UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
      addWait(Wait, T, NeededWait);
    }
  }
}
1553
1554AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
1555 LLVM_DEBUG({
1556 dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
1557 << ":\n";
1558 for (const auto &Mark : AsyncMarks) {
1559 llvm::interleaveComma(Mark, dbgs());
1560 dbgs() << '\n';
1561 }
1562 });
1563
1564 if (AsyncMarks.size() == MaxAsyncMarks) {
1565 // Enforcing MaxAsyncMarks here is unnecessary work because the size of
1566 // MaxAsyncMarks is linear when traversing straightline code. But we do
1567 // need to check if truncation may have occured at a merge, and adjust N
1568 // to ensure that a wait is generated.
1569 LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
1570 N = std::min(N, (unsigned)MaxAsyncMarks - 1);
1571 }
1572
1573 AMDGPU::Waitcnt Wait;
1574 if (AsyncMarks.size() <= N) {
1575 LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
1576 return Wait;
1577 }
1578
1579 size_t MarkIndex = AsyncMarks.size() - N - 1;
1580 const auto &RequiredMark = AsyncMarks[MarkIndex];
1582 determineWaitForScore(T, RequiredMark[T], Wait);
1583
1584 // Immediately remove the waited mark and all older ones
1585 // This happens BEFORE the wait is actually inserted, which is fine
1586 // because we've already extracted the wait requirements
1587 LLVM_DEBUG({
1588 dbgs() << "Removing " << (MarkIndex + 1)
1589 << " async marks after determining wait\n";
1590 });
1591 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1592
1593 LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
1594 return Wait;
1595}
1596
/// Add to \p Wait the wait on counter \p T needed for all tracked units of
/// \p Reg: the SCC slot, the VMem map for vector registers, or the SGPR map.
void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
                                              AMDGPU::Waitcnt &Wait) const {
  if (Reg == AMDGPU::SCC) {
    determineWaitForScore(T, SCCScore, Wait);
  } else {
    bool IsVGPR = Context->TRI.isVectorRegister(Context->MRI, Reg);
    for (MCRegUnit RU : regunits(Reg))
      determineWaitForScore(
          T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
          Wait);
  }
}
1609
/// Add to \p Wait the wait on counter \p T needed for the LDS DMA tracking
/// slot \p TID (must be within the LDS DMA ID range).
void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
                                             AMDGPU::Waitcnt &Wait) const {
  assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
  determineWaitForScore(T, getVMemScore(TID, T), Wait);
}
1615
/// Clear the pending SCC-write state when \p Inst (a barrier wait) guarantees
/// the outstanding S_BARRIER_SIGNAL_ISFIRST_IMM write to SCC has completed.
void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
  // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
  // SCC has landed
  if (PendingSCCWrite &&
      PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
      PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
    WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
    // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
    if ((PendingEvents & Context->getWaitEvents(KM_CNT)) ==
        SCC_WRITE_PendingEvent) {
      setScoreLB(KM_CNT, getScoreUB(KM_CNT));
    }

    PendingEvents.remove(SCC_WRITE_PendingEvent);
    PendingSCCWrite = nullptr;
  }
}
1633
1634void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1636 applyWaitcnt(Wait, T);
1637}
1638
/// Update the bracket of counter \p T to reflect an executed wait for
/// \p Count outstanding events, and propagate the implied X_CNT effects for
/// KM_CNT/LOAD_CNT waits.
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
  // A wait for more events than are outstanding changes nothing.
  if (Count >= UB)
    return;
  if (Count != 0) {
    // A partial wait gives no guarantee for an out-of-order counter.
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
  } else {
    // Waiting to zero retires every outstanding event for this counter.
    setScoreLB(T, UB);
    PendingEvents.remove(Context->getWaitEvents(T));
  }

  // A kmcnt(0) wait also retires pending SMEM address translations (X_CNT).
  if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
    if (!hasMixedPendingEvents(X_CNT))
      applyWaitcnt(X_CNT, 0);
    else
      PendingEvents.remove(SMEM_GROUP);
  }
  // A loadcnt wait bounds X_CNT too when only VMEM loads are outstanding.
  if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
      !hasPendingEvent(STORE_CNT)) {
    if (!hasMixedPendingEvents(X_CNT))
      applyWaitcnt(X_CNT, Count);
    else if (Count == 0)
      PendingEvents.remove(VMEM_GROUP);
  }
}
1666
1667void WaitcntBrackets::applyWaitcnt(const Waitcnt &Wait, InstCounterType T) {
1668 unsigned Cnt = Wait.get(T);
1669 applyWaitcnt(T, Cnt);
1670}
1671
// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory read always can go out of order.
  if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
      (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
    return true;

  // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
  // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
  // out-of-order completion.
  if (T == LOAD_CNT) {
    WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
    // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
    // events
    Events.remove(GLOBAL_INV_ACCESS);
    // Return true only if there are still multiple event types after removing
    // GLOBAL_INV
    return Events.twoOrMore();
  }

  // Otherwise: out of order exactly when more than one event type is pending.
  return hasMixedPendingEvents(T);
}
1695
1696INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1697 false, false)
1700INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1702
// Legacy pass-manager boilerplate: the pass identity anchor and the exported
// handle other code uses to reference this pass.
char SIInsertWaitcntsLegacy::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;

1708 return new SIInsertWaitcntsLegacy();
1709}
1710
1711static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1712 unsigned NewEnc) {
1713 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1714 assert(OpIdx >= 0);
1715
1716 MachineOperand &MO = MI.getOperand(OpIdx);
1717
1718 if (NewEnc == MO.getImm())
1719 return false;
1720
1721 MO.setImm(NewEnc);
1722 return true;
1723}
1724
1725/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1726/// and if so, which counter it is waiting on.
1727static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1728 switch (Opcode) {
1729 case AMDGPU::S_WAIT_LOADCNT:
1730 return LOAD_CNT;
1731 case AMDGPU::S_WAIT_EXPCNT:
1732 return EXP_CNT;
1733 case AMDGPU::S_WAIT_STORECNT:
1734 return STORE_CNT;
1735 case AMDGPU::S_WAIT_SAMPLECNT:
1736 return SAMPLE_CNT;
1737 case AMDGPU::S_WAIT_BVHCNT:
1738 return BVH_CNT;
1739 case AMDGPU::S_WAIT_DSCNT:
1740 return DS_CNT;
1741 case AMDGPU::S_WAIT_KMCNT:
1742 return KM_CNT;
1743 case AMDGPU::S_WAIT_XCNT:
1744 return X_CNT;
1745 default:
1746 return {};
1747 }
1748}
1749
1750bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1751 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1752 if (Opcode == Waitcnt->getOpcode())
1753 return false;
1754
1755 Waitcnt->setDesc(TII.get(Opcode));
1756 return true;
1757}
1758
1759/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1760/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1761/// from \p Wait that were added by previous passes. Currently this pass
1762/// conservatively assumes that these preexisting waits are required for
1763/// correctness.
1764bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1765 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1766 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1767 assert(isNormalMode(MaxCounter));
1768
1769 bool Modified = false;
1770 MachineInstr *WaitcntInstr = nullptr;
1771 MachineInstr *WaitcntVsCntInstr = nullptr;
1772
1773 LLVM_DEBUG({
1774 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1775 if (It.isEnd())
1776 dbgs() << "end of block\n";
1777 else
1778 dbgs() << *It;
1779 });
1780
1781 for (auto &II :
1782 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1783 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1784 if (II.isMetaInstruction()) {
1785 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1786 continue;
1787 }
1788
1789 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1790 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1791
1792 // Update required wait count. If this is a soft waitcnt (= it was added
1793 // by an earlier pass), it may be entirely removed.
1794 if (Opcode == AMDGPU::S_WAITCNT) {
1795 unsigned IEnc = II.getOperand(0).getImm();
1796 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1797 if (TrySimplify)
1798 ScoreBrackets.simplifyWaitcnt(OldWait);
1799 Wait = Wait.combined(OldWait);
1800
1801 // Merge consecutive waitcnt of the same type by erasing multiples.
1802 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1803 II.eraseFromParent();
1804 Modified = true;
1805 } else
1806 WaitcntInstr = &II;
1807 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1808 assert(ST.hasVMemToLDSLoad());
1809 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1810 << "Before: " << Wait << '\n';);
1811 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
1812 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1813
1814 // It is possible (but unlikely) that this is the only wait instruction,
1815 // in which case, we exit this loop without a WaitcntInstr to consume
1816 // `Wait`. But that works because `Wait` was passed in by reference, and
1817 // the callee eventually calls createNewWaitcnt on it. We test this
1818 // possibility in an articial MIR test since such a situation cannot be
1819 // recreated by running the memory legalizer.
1820 II.eraseFromParent();
1821 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1822 unsigned N = II.getOperand(0).getImm();
1823 LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
1824 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1825 Wait = Wait.combined(OldWait);
1826 } else {
1827 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1828 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1829
1830 unsigned OldVSCnt =
1831 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1832 if (TrySimplify)
1833 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1834 Wait.set(STORE_CNT, std::min(Wait.get(STORE_CNT), OldVSCnt));
1835
1836 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1837 II.eraseFromParent();
1838 Modified = true;
1839 } else
1840 WaitcntVsCntInstr = &II;
1841 }
1842 }
1843
1844 if (WaitcntInstr) {
1845 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1847 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1848
1849 ScoreBrackets.applyWaitcnt(Wait, LOAD_CNT);
1850 ScoreBrackets.applyWaitcnt(Wait, EXP_CNT);
1851 ScoreBrackets.applyWaitcnt(Wait, DS_CNT);
1852 Wait.set(LOAD_CNT, ~0u);
1853 Wait.set(EXP_CNT, ~0u);
1854 Wait.set(DS_CNT, ~0u);
1855
1856 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1857 << "New Instr at block end: "
1858 << *WaitcntInstr << '\n'
1859 : dbgs() << "applied pre-existing waitcnt\n"
1860 << "Old Instr: " << *It
1861 << "New Instr: " << *WaitcntInstr << '\n');
1862 }
1863
1864 if (WaitcntVsCntInstr) {
1866 *WaitcntVsCntInstr, AMDGPU::OpName::simm16, Wait.get(STORE_CNT));
1867 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1868
1869 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.get(STORE_CNT));
1870 Wait.set(STORE_CNT, ~0u);
1871
1872 LLVM_DEBUG(It.isEnd()
1873 ? dbgs() << "applied pre-existing waitcnt\n"
1874 << "New Instr at block end: " << *WaitcntVsCntInstr
1875 << '\n'
1876 : dbgs() << "applied pre-existing waitcnt\n"
1877 << "Old Instr: " << *It
1878 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1879 }
1880
1881 return Modified;
1882}
1883
1884/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1885/// required counters in \p Wait
1886bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1887 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1888 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
1889 assert(isNormalMode(MaxCounter));
1890
1891 bool Modified = false;
1892 const DebugLoc &DL = Block.findDebugLoc(It);
1893
1894 // Helper to emit expanded waitcnt sequence for profiling.
1895 // Emits waitcnts from (Outstanding-1) down to Target.
1896 // The EmitWaitcnt callback emits a single waitcnt.
1897 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
1898 auto EmitWaitcnt) {
1899 do {
1900 EmitWaitcnt(--Outstanding);
1901 } while (Outstanding > Target);
1902 Modified = true;
1903 };
1904
1905 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1906 // single instruction while VScnt has its own instruction.
1907 if (Wait.hasWaitExceptStoreCnt()) {
1908 // If profiling expansion is enabled, emit an expanded sequence
1909 if (ExpandWaitcntProfiling) {
1910 // Check if any of the counters to be waited on are out-of-order.
1911 // If so, fall back to normal (non-expanded) behavior since expansion
1912 // would provide misleading profiling information.
1913 bool AnyOutOfOrder = false;
1914 for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
1915 unsigned WaitCnt = Wait.get(CT);
1916 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
1917 AnyOutOfOrder = true;
1918 break;
1919 }
1920 }
1921
1922 if (AnyOutOfOrder) {
1923 // Fall back to non-expanded wait
1924 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1925 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
1926 Modified = true;
1927 } else {
1928 // All counters are in-order, safe to expand
1929 for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
1930 unsigned WaitCnt = Wait.get(CT);
1931 if (WaitCnt == ~0u)
1932 continue;
1933
1934 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
1935 getWaitCountMax(getLimits(), CT) - 1);
1936 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
1937 AMDGPU::Waitcnt W;
1938 W.set(CT, Count);
1939 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT))
1941 });
1942 }
1943 }
1944 } else {
1945 // Normal behavior: emit single combined waitcnt
1946 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1947 [[maybe_unused]] auto SWaitInst =
1948 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
1949 Modified = true;
1950
1951 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1952 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1953 dbgs() << "New Instr: " << *SWaitInst << '\n');
1954 }
1955 }
1956
1957 if (Wait.hasWaitStoreCnt()) {
1958 assert(ST.hasVscnt());
1959
1960 if (ExpandWaitcntProfiling && Wait.get(STORE_CNT) != ~0u &&
1961 !ScoreBrackets.counterOutOfOrder(STORE_CNT)) {
1962 // Only expand if counter is not out-of-order
1963 unsigned Outstanding =
1964 std::min(ScoreBrackets.getOutstanding(STORE_CNT),
1965 getWaitCountMax(getLimits(), STORE_CNT) - 1);
1966 EmitExpandedWaitcnt(
1967 Outstanding, Wait.get(STORE_CNT), [&](unsigned Count) {
1968 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1969 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1970 .addImm(Count);
1971 });
1972 } else {
1973 [[maybe_unused]] auto SWaitInst =
1974 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1975 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1976 .addImm(Wait.get(STORE_CNT));
1977 Modified = true;
1978
1979 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1980 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1981 dbgs() << "New Instr: " << *SWaitInst << '\n');
1982 }
1983 }
1984
1985 return Modified;
1986}
1987
1988AMDGPU::Waitcnt
1989WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1990 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u);
1991}
1992
1993AMDGPU::Waitcnt
1994WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1995 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
1996 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1997 ~0u /* XCNT */, ~0u /* ASYNC_CNT */, ExpertVal,
1998 ExpertVal);
1999}
2000
2001/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
2002/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
2003/// were added by previous passes. Currently this pass conservatively
2004/// assumes that these preexisting waits are required for correctness.
2005bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
2006 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
2007 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
2008 assert(!isNormalMode(MaxCounter));
2009
2010 bool Modified = false;
2011 MachineInstr *CombinedLoadDsCntInstr = nullptr;
2012 MachineInstr *CombinedStoreDsCntInstr = nullptr;
2013 MachineInstr *WaitcntDepctrInstr = nullptr;
2014 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
2015
2016 LLVM_DEBUG({
2017 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
2018 if (It.isEnd())
2019 dbgs() << "end of block\n";
2020 else
2021 dbgs() << *It;
2022 });
2023
2024 // Accumulate waits that should not be simplified.
2025 AMDGPU::Waitcnt RequiredWait;
2026
2027 for (auto &II :
2028 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
2029 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
2030 if (II.isMetaInstruction()) {
2031 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
2032 continue;
2033 }
2034
2035 // Update required wait count. If this is a soft waitcnt (= it was added
2036 // by an earlier pass), it may be entirely removed.
2037
2038 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
2039 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
2040
2041 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
2042 // attempt to do more than that either.
2043 if (Opcode == AMDGPU::S_WAITCNT)
2044 continue;
2045
2046 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
2047 unsigned OldEnc =
2048 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2049 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
2050 if (TrySimplify)
2051 Wait = Wait.combined(OldWait);
2052 else
2053 RequiredWait = RequiredWait.combined(OldWait);
2054 // Keep the first wait_loadcnt, erase the rest.
2055 if (CombinedLoadDsCntInstr == nullptr) {
2056 CombinedLoadDsCntInstr = &II;
2057 } else {
2058 II.eraseFromParent();
2059 Modified = true;
2060 }
2061 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
2062 unsigned OldEnc =
2063 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2064 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
2065 if (TrySimplify)
2066 Wait = Wait.combined(OldWait);
2067 else
2068 RequiredWait = RequiredWait.combined(OldWait);
2069 // Keep the first wait_storecnt, erase the rest.
2070 if (CombinedStoreDsCntInstr == nullptr) {
2071 CombinedStoreDsCntInstr = &II;
2072 } else {
2073 II.eraseFromParent();
2074 Modified = true;
2075 }
2076 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
2077 unsigned OldEnc =
2078 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2079 AMDGPU::Waitcnt OldWait;
2082 if (TrySimplify)
2083 ScoreBrackets.simplifyWaitcnt(OldWait);
2084 Wait = Wait.combined(OldWait);
2085 if (WaitcntDepctrInstr == nullptr) {
2086 WaitcntDepctrInstr = &II;
2087 } else {
2088 // S_WAITCNT_DEPCTR requires special care. Don't remove a
2089 // duplicate if it is waiting on things other than VA_VDST or
2090 // VM_VSRC. If that is the case, just make sure the VA_VDST and
2091 // VM_VSRC subfields of the operand are set to the "no wait"
2092 // values.
2093
2094 unsigned Enc =
2095 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2096 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
2097 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
2098
2099 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2100 Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
2101 Modified |= promoteSoftWaitCnt(&II);
2102 } else {
2103 II.eraseFromParent();
2104 Modified = true;
2105 }
2106 }
2107 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
2108 // Architectures higher than GFX10 do not have direct loads to
2109 // LDS, so no work required here yet.
2110 II.eraseFromParent();
2111 Modified = true;
2112 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
2113 reportFatalUsageError("WAIT_ASYNCMARK is not ready for GFX12 yet");
2114 } else {
2115 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
2116 assert(CT.has_value());
2117 unsigned OldCnt =
2118 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2119 if (TrySimplify)
2120 addWait(Wait, CT.value(), OldCnt);
2121 else
2122 addWait(RequiredWait, CT.value(), OldCnt);
2123 // Keep the first wait of its kind, erase the rest.
2124 if (WaitInstrs[CT.value()] == nullptr) {
2125 WaitInstrs[CT.value()] = &II;
2126 } else {
2127 II.eraseFromParent();
2128 Modified = true;
2129 }
2130 }
2131 }
2132
2133 ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
2134 Wait = Wait.combined(RequiredWait);
2135
2136 if (CombinedLoadDsCntInstr) {
2137 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
2138 // to be waited for. Otherwise, let the instruction be deleted so
2139 // the appropriate single counter wait instruction can be inserted
2140 // instead, when new S_WAIT_*CNT instructions are inserted by
2141 // createNewWaitcnt(). As a side effect, resetting the wait counts will
2142 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
2143 // the loop below that deals with single counter instructions.
2144 //
2145 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
2146 // instructions that have decremented LOAD_CNT or DS_CNT on completion
2147 // will have needed to wait for their register sources to be available
2148 // first.
2149 if (Wait.get(LOAD_CNT) != ~0u && Wait.get(DS_CNT) != ~0u) {
2150 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2151 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
2152 AMDGPU::OpName::simm16, NewEnc);
2153 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
2154 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.get(LOAD_CNT));
2155 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.get(DS_CNT));
2156 Wait.set(LOAD_CNT, ~0u);
2157 Wait.set(DS_CNT, ~0u);
2158
2159 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2160 << "New Instr at block end: "
2161 << *CombinedLoadDsCntInstr << '\n'
2162 : dbgs() << "applied pre-existing waitcnt\n"
2163 << "Old Instr: " << *It << "New Instr: "
2164 << *CombinedLoadDsCntInstr << '\n');
2165 } else {
2166 CombinedLoadDsCntInstr->eraseFromParent();
2167 Modified = true;
2168 }
2169 }
2170
2171 if (CombinedStoreDsCntInstr) {
2172 // Similarly for S_WAIT_STORECNT_DSCNT.
2173 if (Wait.get(STORE_CNT) != ~0u && Wait.get(DS_CNT) != ~0u) {
2174 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2175 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
2176 AMDGPU::OpName::simm16, NewEnc);
2177 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2178 ScoreBrackets.applyWaitcnt(Wait, STORE_CNT);
2179 ScoreBrackets.applyWaitcnt(Wait, DS_CNT);
2180 Wait.set(STORE_CNT, ~0u);
2181 Wait.set(DS_CNT, ~0u);
2182
2183 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2184 << "New Instr at block end: "
2185 << *CombinedStoreDsCntInstr << '\n'
2186 : dbgs() << "applied pre-existing waitcnt\n"
2187 << "Old Instr: " << *It << "New Instr: "
2188 << *CombinedStoreDsCntInstr << '\n');
2189 } else {
2190 CombinedStoreDsCntInstr->eraseFromParent();
2191 Modified = true;
2192 }
2193 }
2194
2195 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
2196 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
2197 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
2198 // instructions so that createNewWaitcnt() will create new combined
2199 // instructions to replace them.
2200
2201 if (Wait.get(DS_CNT) != ~0u) {
2202 // This is a vector of addresses in WaitInstrs pointing to instructions
2203 // that should be removed if they are present.
2205
2206 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
2207 // both) need to be waited for, ensure that there are no existing
2208 // individual wait count instructions for these.
2209
2210 if (Wait.get(LOAD_CNT) != ~0u) {
2211 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
2212 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
2213 } else if (Wait.get(STORE_CNT) != ~0u) {
2214 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
2215 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
2216 }
2217
2218 for (MachineInstr **WI : WaitsToErase) {
2219 if (!*WI)
2220 continue;
2221
2222 (*WI)->eraseFromParent();
2223 *WI = nullptr;
2224 Modified = true;
2225 }
2226 }
2227
2229 if (!WaitInstrs[CT])
2230 continue;
2231
2232 unsigned NewCnt = Wait.get(CT);
2233 if (NewCnt != ~0u) {
2234 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
2235 AMDGPU::OpName::simm16, NewCnt);
2236 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2237
2238 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2239 setNoWait(Wait, CT);
2240
2241 LLVM_DEBUG(It.isEnd()
2242 ? dbgs() << "applied pre-existing waitcnt\n"
2243 << "New Instr at block end: " << *WaitInstrs[CT]
2244 << '\n'
2245 : dbgs() << "applied pre-existing waitcnt\n"
2246 << "Old Instr: " << *It
2247 << "New Instr: " << *WaitInstrs[CT] << '\n');
2248 } else {
2249 WaitInstrs[CT]->eraseFromParent();
2250 Modified = true;
2251 }
2252 }
2253
2254 if (WaitcntDepctrInstr) {
2255 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2256 // subfields with the new required values.
2257 unsigned Enc =
2258 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2259 ->getImm();
2262
2263 ScoreBrackets.applyWaitcnt(VA_VDST, Wait.get(VA_VDST));
2264 ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.get(VM_VSRC));
2265 Wait.set(VA_VDST, ~0u);
2266 Wait.set(VM_VSRC, ~0u);
2267
2268 // If that new encoded Depctr immediate would actually still wait
2269 // for anything, update the instruction's operand. Otherwise it can
2270 // just be deleted.
2271 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2272 Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
2273 AMDGPU::OpName::simm16, Enc);
2274 LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2275 << "New Instr at block end: "
2276 << *WaitcntDepctrInstr << '\n'
2277 : dbgs() << "applyPreexistingWaitcnt\n"
2278 << "Old Instr: " << *It << "New Instr: "
2279 << *WaitcntDepctrInstr << '\n');
2280 } else {
2281 WaitcntDepctrInstr->eraseFromParent();
2282 Modified = true;
2283 }
2284 }
2285
2286 return Modified;
2287}
2288
2289/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
2290bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2291 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2292 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
2293 assert(!isNormalMode(MaxCounter));
2294
2295 bool Modified = false;
2296 const DebugLoc &DL = Block.findDebugLoc(It);
2297
2298 // Helper to emit expanded waitcnt sequence for profiling.
2299 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
2300 auto EmitWaitcnt) {
2301 for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I)
2302 EmitWaitcnt(I);
2303 EmitWaitcnt(Target);
2304 Modified = true;
2305 };
2306
2307 // For GFX12+, we use separate wait instructions, which makes expansion
2308 // simpler
2309 if (ExpandWaitcntProfiling) {
2311 unsigned Count = Wait.get(CT);
2312 if (Count == ~0u)
2313 continue;
2314
2315 // Skip expansion for out-of-order counters - emit normal wait instead
2316 if (ScoreBrackets.counterOutOfOrder(CT)) {
2317 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2318 .addImm(Count);
2319 Modified = true;
2320 continue;
2321 }
2322
2323 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2324 getWaitCountMax(getLimits(), CT) - 1);
2325 EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
2326 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2327 .addImm(Val);
2328 });
2329 }
2330 return Modified;
2331 }
2332
2333 // Normal behavior (no expansion)
2334 // Check for opportunities to use combined wait instructions.
2335 if (Wait.get(DS_CNT) != ~0u) {
2336 MachineInstr *SWaitInst = nullptr;
2337
2338 if (Wait.get(LOAD_CNT) != ~0u) {
2339 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2340
2341 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2342 .addImm(Enc);
2343
2344 Wait.set(LOAD_CNT, ~0u);
2345 Wait.set(DS_CNT, ~0u);
2346 } else if (Wait.get(STORE_CNT) != ~0u) {
2347 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2348
2349 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_STORECNT_DSCNT))
2350 .addImm(Enc);
2351
2352 Wait.set(STORE_CNT, ~0u);
2353 Wait.set(DS_CNT, ~0u);
2354 }
2355
2356 if (SWaitInst) {
2357 Modified = true;
2358
2359 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2360 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2361 dbgs() << "New Instr: " << *SWaitInst << '\n');
2362 }
2363 }
2364
2365 // Generate an instruction for any remaining counter that needs
2366 // waiting for.
2367
2369 unsigned Count = Wait.get(CT);
2370 if (Count == ~0u)
2371 continue;
2372
2373 [[maybe_unused]] auto SWaitInst =
2374 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2375 .addImm(Count);
2376
2377 Modified = true;
2378
2379 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2380 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2381 dbgs() << "New Instr: " << *SWaitInst << '\n');
2382 }
2383
2384 if (Wait.hasWaitDepctr()) {
2385 assert(IsExpertMode);
2386 unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.get(VM_VSRC), ST);
2388
2389 [[maybe_unused]] auto SWaitInst =
2390 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
2391
2392 Modified = true;
2393
2394 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2395 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2396 dbgs() << "New Instr: " << *SWaitInst << '\n');
2397 }
2398
2399 return Modified;
2400}
2401
2402/// Generate s_waitcnt instruction to be placed before cur_Inst.
2403/// Instructions of a given type are returned in order,
2404/// but instructions of different types can complete out of order.
2405/// We rely on this in-order completion
2406/// and simply assign a score to the memory access instructions.
2407/// We keep track of the active "score bracket" to determine
2408/// if an access of a memory read requires an s_waitcnt
2409/// and if so what the value of each counter is.
2410/// The "score bracket" is bound by the lower bound and upper bound
2411/// scores (*_score_LB and *_score_ub respectively).
2412/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2413/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2414/// (GFX12+ only, where DS_CNT is a separate counter).
2415bool SIInsertWaitcnts::generateWaitcntInstBefore(
2416 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2417 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2418 LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
2419 setForceEmitWaitcnt();
2420
2421 assert(!MI.isMetaInstruction());
2422
2423 AMDGPU::Waitcnt Wait;
2424 const unsigned Opc = MI.getOpcode();
2425
2426 switch (Opc) {
2427 case AMDGPU::BUFFER_WBINVL1:
2428 case AMDGPU::BUFFER_WBINVL1_SC:
2429 case AMDGPU::BUFFER_WBINVL1_VOL:
2430 case AMDGPU::BUFFER_GL0_INV:
2431 case AMDGPU::BUFFER_GL1_INV: {
2432 // FIXME: This should have already been handled by the memory legalizer.
2433 // Removing this currently doesn't affect any lit tests, but we need to
2434 // verify that nothing was relying on this. The number of buffer invalidates
2435 // being handled here should not be expanded.
2436 Wait.set(LOAD_CNT, 0);
2437 break;
2438 }
2439 case AMDGPU::SI_RETURN_TO_EPILOG:
2440 case AMDGPU::SI_RETURN:
2441 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2442 case AMDGPU::S_SETPC_B64_return: {
2443 // All waits must be resolved at call return.
2444 // NOTE: this could be improved with knowledge of all call sites or
2445 // with knowledge of the called routines.
2446 ReturnInsts.insert(&MI);
2447 AMDGPU::Waitcnt AllZeroWait =
2448 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2449 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2450 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2451 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2452 // no need to wait for it at function boundaries.
2453 if (ST.hasExtendedWaitCounts() &&
2454 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2455 AllZeroWait.set(LOAD_CNT, ~0u);
2456 Wait = AllZeroWait;
2457 break;
2458 }
2459 case AMDGPU::S_ENDPGM:
2460 case AMDGPU::S_ENDPGM_SAVED: {
2461 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2462 // Technically the hardware will do this on its own if we don't, but that
2463 // might cost extra cycles compared to doing it explicitly.
2464 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2465 // have to wait for outstanding VMEM stores. In this case it can be useful
2466 // to send a message to explicitly release all VGPRs before the stores have
2467 // completed, but it is only safe to do this if there are no outstanding
2468 // scratch stores.
2469 EndPgmInsts[&MI] = !ScoreBrackets.empty(STORE_CNT) &&
2470 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
2471 break;
2472 }
2473 case AMDGPU::S_SENDMSG:
2474 case AMDGPU::S_SENDMSGHALT: {
2475 if (ST.hasLegacyGeometry() &&
2476 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2478 // Resolve vm waits before gs-done.
2479 Wait.set(LOAD_CNT, 0);
2480 break;
2481 }
2482 [[fallthrough]];
2483 }
2484 default: {
2485
2486 // Export & GDS instructions do not read the EXEC mask until after the
2487 // export is granted (which can occur well after the instruction is issued).
2488 // The shader program must flush all EXP operations on the export-count
2489 // before overwriting the EXEC mask.
2490 if (MI.modifiesRegister(AMDGPU::EXEC, &TRI)) {
2491 // Export and GDS are tracked individually, either may trigger a waitcnt
2492 // for EXEC.
2493 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2494 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2495 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2496 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2497 Wait.set(EXP_CNT, 0);
2498 }
2499 }
2500
2501 // Wait for any pending GDS instruction to complete before any
2502 // "Always GDS" instruction.
2503 if (TII.isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2504 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
2505
2506 if (MI.isCall()) {
2507 // The function is going to insert a wait on everything in its prolog.
2508 // This still needs to be careful if the call target is a load (e.g. a GOT
2509 // load). We also need to check WAW dependency with saved PC.
2510 CallInsts.insert(&MI);
2511 Wait = AMDGPU::Waitcnt();
2512
2513 const MachineOperand &CallAddrOp = TII.getCalleeOperand(MI);
2514 if (CallAddrOp.isReg()) {
2515 ScoreBrackets.determineWaitForPhysReg(
2516 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);
2517
2518 if (const auto *RtnAddrOp =
2519 TII.getNamedOperand(MI, AMDGPU::OpName::dst)) {
2520 ScoreBrackets.determineWaitForPhysReg(
2521 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
2522 }
2523 }
2524 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2525 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2526 } else {
2527 // FIXME: Should not be relying on memoperands.
2528 // Look at the source operands of every instruction to see if
2529 // any of them results from a previous memory operation that affects
2530 // its current usage. If so, an s_waitcnt instruction needs to be
2531 // emitted.
2532 // If the source operand was defined by a load, add the s_waitcnt
2533 // instruction.
2534 //
2535 // Two cases are handled for destination operands:
2536 // 1) If the destination operand was defined by a load, add the s_waitcnt
2537 // instruction to guarantee the right WAW order.
2538 // 2) If a destination operand that was used by a recent export/store ins,
2539 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2540
2541 for (const MachineMemOperand *Memop : MI.memoperands()) {
2542 const Value *Ptr = Memop->getValue();
2543 if (Memop->isStore()) {
2544 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2545 addWait(Wait, SmemAccessCounter, 0);
2546 if (PDT.dominates(MI.getParent(), It->second))
2547 SLoadAddresses.erase(It);
2548 }
2549 }
2550 unsigned AS = Memop->getAddrSpace();
2552 continue;
2553 // No need to wait before load from VMEM to LDS.
2554 if (TII.mayWriteLDSThroughDMA(MI))
2555 continue;
2556
2557 // LOAD_CNT is only relevant to vgpr or LDS.
2558 unsigned TID = LDSDMA_BEGIN;
2559 if (Ptr && Memop->getAAInfo()) {
2560 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2561 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2562 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2563 if ((I + 1) >= NUM_LDSDMA) {
2564 // We didn't have enough slot to track this LDS DMA store, it
2565 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2566 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2567 break;
2568 }
2569
2570 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
2571 }
2572 }
2573 } else {
2574 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2575 }
2576 if (Memop->isStore()) {
2577 ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
2578 }
2579 }
2580
2581 // Loop over use and def operands.
2582 for (const MachineOperand &Op : MI.operands()) {
2583 if (!Op.isReg())
2584 continue;
2585
2586 // If the instruction does not read tied source, skip the operand.
2587 if (Op.isTied() && Op.isUse() && TII.doesNotReadTiedSource(MI))
2588 continue;
2589
2590 MCPhysReg Reg = Op.getReg().asMCReg();
2591
2592 const bool IsVGPR = TRI.isVectorRegister(MRI, Op.getReg());
2593 if (IsVGPR) {
2594 // Implicit VGPR defs and uses are never a part of the memory
2595 // instructions description and usually present to account for
2596 // super-register liveness.
2597 // TODO: Most of the other instructions also have implicit uses
2598 // for the liveness accounting only.
2599 if (Op.isImplicit() && MI.mayLoadOrStore())
2600 continue;
2601
2602 ScoreBrackets.determineWaitForPhysReg(VA_VDST, Reg, Wait);
2603 if (Op.isDef())
2604 ScoreBrackets.determineWaitForPhysReg(VM_VSRC, Reg, Wait);
2605 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2606 // previous write and this write are the same type of VMEM
2607 // instruction, in which case they are (in some architectures)
2608 // guaranteed to write their results in order anyway.
2609 // Additionally check instructions where Point Sample Acceleration
2610 // might be applied.
2611 if (Op.isUse() || !updateVMCntOnly(MI) ||
2612 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2613 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2614 !ST.hasVmemWriteVgprInOrder()) {
2615 ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
2616 ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
2617 ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
2618 ScoreBrackets.clearVgprVmemTypes(Reg);
2619 }
2620
2621 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2622 ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
2623 }
2624 ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
2625 } else if (Op.getReg() == AMDGPU::SCC) {
2626 ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
2627 } else {
2628 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
2629 }
2630
2631 if (ST.hasWaitXcnt() && Op.isDef())
2632 ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
2633 }
2634 }
2635 }
2636 }
2637
2638 // Ensure safety against exceptions from outstanding memory operations while
2639 // waiting for a barrier:
2640 //
2641 // * Some subtargets safely handle backing off the barrier in hardware
2642 // when an exception occurs.
2643 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2644 // there can be no outstanding memory operations during the wait.
2645 // * Subtargets with split barriers don't need to back off the barrier; it
2646 // is up to the trap handler to preserve the user barrier state correctly.
2647 //
2648 // In all other cases, ensure safety by ensuring that there are no outstanding
2649 // memory operations.
2650 if (Opc == AMDGPU::S_BARRIER && !ST.hasAutoWaitcntBeforeBarrier() &&
2651 !ST.hasBackOffBarrier()) {
2652 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2653 }
2654
2655 // TODO: Remove this work-around, enable the assert for Bug 457939
2656 // after fixing the scheduler. Also, the Shader Compiler code is
2657 // independent of target.
2658 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST.hasReadVCCZBug() &&
2659 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2660 Wait.set(DS_CNT, 0);
2661 }
2662
2663 // Verify that the wait is actually needed.
2664 ScoreBrackets.simplifyWaitcnt(Wait);
2665
2666 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2667 // waits on VA_VDST if the instruction it would precede is not a VALU
2668 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2669 // expert scheduling mode.
2670 if (TII.isVALU(MI))
2671 Wait.set(VA_VDST, ~0u);
2672
2673 // Since the translation for VMEM addresses occur in-order, we can apply the
2674 // XCnt if the current instruction is of VMEM type and has a memory
2675 // dependency with another VMEM instruction in flight.
2676 if (Wait.get(X_CNT) != ~0u && isVmemAccess(MI)) {
2677 ScoreBrackets.applyWaitcnt(Wait, X_CNT);
2678 Wait.set(X_CNT, ~0u);
2679 }
2680
2681 // When forcing emit, we need to skip terminators because that would break the
2682 // terminators of the MBB if we emit a waitcnt between terminators.
2683 if (ForceEmitZeroFlag && !MI.isTerminator())
2684 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2685
2686 // If we force waitcnt then update Wait accordingly.
2688 if (!ForceEmitWaitcnt[T])
2689 continue;
2690 Wait.set(T, 0);
2691 }
2692
2693 if (FlushFlags.FlushVmCnt) {
2695 Wait.set(T, 0);
2696 }
2697
2698 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
2699 Wait.set(DS_CNT, 0);
2700
2701 if (ForceEmitZeroLoadFlag && Wait.get(LOAD_CNT) != ~0u)
2702 Wait.set(LOAD_CNT, 0);
2703
2704 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2705 OldWaitcntInstr);
2706}
2707
/// Materialize the waits described by \p Wait in front of the insertion
/// point: first try to fold them into a preexisting waitcnt (\p OldWaitcntInstr)
/// or into a following VINTERP's waitexp operand, then emit any remainder as a
/// new waitcnt instruction, and finally apply the wait to \p ScoreBrackets.
/// Returns true if the IR was modified.
// NOTE(review): this doxygen listing drops some original lines here (the
// instr_iterator parameter `It` and part of the VINTERP guard condition) —
// confirm the exact signature and condition against the upstream file.
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  bool Modified = false;

  if (OldWaitcntInstr)
    // Try to merge the required wait with preexisting waitcnt instructions.
    // Also erase redundant waitcnt.
    Modified =
        WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);

  // ExpCnt can be merged into VINTERP.
  if (Wait.get(EXP_CNT) != ~0u && It != Block.instr_end() &&
    MachineOperand *WaitExp = TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
    // Only tighten the operand; never loosen an already-stricter wait.
    if (Wait.get(EXP_CNT) < WaitExp->getImm()) {
      WaitExp->setImm(Wait.get(EXP_CNT));
      Modified = true;
    }
    // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
    ScoreBrackets.applyWaitcnt(Wait, EXP_CNT);
    Wait.set(EXP_CNT, ~0u);

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                      << "Update Instr: " << *It);
  }

  if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
    Modified = true;

  // Any counts that could have been applied to any existing waitcnt
  // instructions will have been done so, now deal with any remaining.
  ScoreBrackets.applyWaitcnt(Wait);

  return Modified;
}
2746
/// Classify \p Inst into the expert-scheduling wait event it generates, or
/// return std::nullopt when it creates no expert-scheduling hazard.
std::optional<WaitEventType>
SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
  if (TII.isVALU(Inst)) {
    // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
    // out-of-order with respect to each other, so each of these classes
    // has its own event.

    if (TII.isXDL(Inst))
      return VGPR_XDL_WRITE;

    if (TII.isTRANS(Inst))
      return VGPR_TRANS_WRITE;

    // NOTE(review): the guard selecting the DPMACC case is missing from this
    // listing (dropped line) — confirm against the upstream source.
      return VGPR_DPMACC_WRITE;

    return VGPR_CSMACC_WRITE;
  }

  // FLAT and LDS instructions may read their VGPR sources out-of-order
  // with respect to each other and all other VMEM instructions, so
  // each of these also has a separate event.

  if (TII.isFLAT(Inst))
    return VGPR_FLAT_READ;

  if (TII.isDS(Inst))
    return VGPR_LDS_READ;

  if (TII.isVMEM(Inst) || TII.isVIMAGE(Inst) || TII.isVSAMPLE(Inst))
    return VGPR_VMEM_READ;

  // Otherwise, no hazard.

  return {};
}
2783
2784bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2785 return (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI)) ||
2786 (TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2787}
2788
2789// Return true if the next instruction is S_ENDPGM, following fallthrough
2790// blocks if necessary.
2791bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2792 MachineBasicBlock *Block) const {
2793 auto BlockEnd = Block->getParent()->end();
2794 auto BlockIter = Block->getIterator();
2795
2796 while (true) {
2797 if (It.isEnd()) {
2798 if (++BlockIter != BlockEnd) {
2799 It = BlockIter->instr_begin();
2800 continue;
2801 }
2802
2803 return false;
2804 }
2805
2806 if (!It->isMetaInstruction())
2807 break;
2808
2809 It++;
2810 }
2811
2812 assert(!It.isEnd());
2813
2814 return It->getOpcode() == AMDGPU::S_ENDPGM;
2815}
2816
// Add a wait after an instruction if architecture requirements mandate one.
// Returns true if the IR was modified.
bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
                                             MachineBasicBlock &Block,
                                             WaitcntBrackets &ScoreBrackets) {
  AMDGPU::Waitcnt Wait;
  bool NeedsEndPGMCheck = false;

  // Precise-memory mode forces every memory operation to complete before the
  // next instruction issues.
  // NOTE(review): the second half of the getAllZeroWaitcnt argument is
  // truncated in this listing (dropped line) — confirm against upstream.
  if (ST.isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
    Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&

  if (TII.isAlwaysGDS(Inst.getOpcode())) {
    // GDS operations must drain; if S_ENDPGM follows, an S_NOP spacer is
    // also required (see below).
    Wait.set(DS_CNT, 0);
    NeedsEndPGMCheck = true;
  }

  // Drop any waits already satisfied by the current bracket state.
  ScoreBrackets.simplifyWaitcnt(Wait);

  auto SuccessorIt = std::next(Inst.getIterator());
  bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
                                /*OldWaitcntInstr=*/nullptr);

  // Pad with an S_NOP when the emitted wait is immediately followed by
  // S_ENDPGM (possibly through fallthrough blocks).
  if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
    BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(0);
  }

  return Result;
}
2846
/// Compute the set of wait events generated by \p Inst, based on its
/// instruction class (DS/GDS, FLAT, VMEM, SMEM, LDSDIR, EXP, barrier,
/// messages) and subtarget capabilities.
// NOTE(review): this listing drops several original lines in this function
// (the FLAT sub-guard, part of the VMEM condition, and the EXP target range
// guards) — confirm the exact conditions against the upstream file.
WaitEventSet SIInsertWaitcnts::getEventsFor(const MachineInstr &Inst) const {
  WaitEventSet Events;
  if (IsExpertMode) {
    // Expert scheduling adds its own per-class VGPR hazard events.
    if (const auto ET = getExpertSchedulingEventType(Inst))
      Events.insert(*ET);
  }

  if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
    if (TII.isAlwaysGDS(Inst.getOpcode()) ||
        TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      Events.insert(GDS_ACCESS);
      Events.insert(GDS_GPR_LOCK);
    } else {
      Events.insert(LDS_ACCESS);
    }
  } else if (TII.isFLAT(Inst)) {
      Events.insert(getVmemWaitEventType(Inst));
    } else {
      assert(Inst.mayLoadOrStore());
      // A FLAT access may hit VMEM, LDS, or both; record events for each
      // aperture it can reach.
      if (TII.mayAccessVMEMThroughFlat(Inst)) {
        if (ST.hasWaitXcnt())
          Events.insert(VMEM_GROUP);
        Events.insert(getVmemWaitEventType(Inst));
      }
      if (TII.mayAccessLDSThroughFlat(Inst))
        Events.insert(LDS_ACCESS);
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
              Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
    // BUFFER_WBL2 is included here because, unlike invalidates, it must be
    // followed by an "S_WAITCNT vmcnt(0)" to ensure the writeback has
    // completed.
    if (ST.hasWaitXcnt())
      Events.insert(VMEM_GROUP);
    Events.insert(getVmemWaitEventType(Inst));
    if (ST.vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
      Events.insert(VMW_GPR_LOCK);
    }
  } else if (TII.isSMRD(Inst)) {
    if (ST.hasWaitXcnt())
      Events.insert(SMEM_GROUP);
    Events.insert(SMEM_ACCESS);
  } else if (SIInstrInfo::isLDSDIR(Inst)) {
    Events.insert(EXP_LDS_ACCESS);
  } else if (SIInstrInfo::isEXP(Inst)) {
    // The export target selects which EXP event class applies.
    unsigned Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      Events.insert(EXP_PARAM_ACCESS);
      Events.insert(EXP_POS_ACCESS);
    else
      Events.insert(EXP_GPR_LOCK);
  } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
    Events.insert(SCC_WRITE);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      Events.insert(SQ_MESSAGE);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      Events.insert(SMEM_ACCESS);
      break;
    }
  }
  return Events;
}
2922
2923void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2924 WaitcntBrackets *ScoreBrackets) {
2925
2926 WaitEventSet InstEvents = getEventsFor(Inst);
2927 for (WaitEventType E : wait_events()) {
2928 if (InstEvents.contains(E))
2929 ScoreBrackets->updateByEvent(E, Inst);
2930 }
2931
2932 if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
2933 if (TII.isAlwaysGDS(Inst.getOpcode()) ||
2934 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2935 ScoreBrackets->setPendingGDS();
2936 }
2937 } else if (TII.isFLAT(Inst)) {
2938 if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(Inst) &&
2939 TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst)) {
2940 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2941 // pointers. They do have two operands that each access global and LDS,
2942 // thus making it appear at this point that they are using a flat pointer.
2943 // Filter them out, and for the rest, generate a dependency on flat
2944 // pointers so that both VM and LGKM counters are flushed.
2945 ScoreBrackets->setPendingFlat();
2946 }
2947 if (SIInstrInfo::usesASYNC_CNT(Inst)) {
2948 ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst);
2949 }
2950 } else if (Inst.isCall()) {
2951 // Act as a wait on everything, but AsyncCnt is never included in such
2952 // blanket waits.
2953 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2954 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2955 } else if (TII.isVINTERP(Inst)) {
2956 int64_t Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2957 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2958 }
2959}
2960
2961bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2962 unsigned OtherScore) {
2963 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2964 unsigned OtherShifted =
2965 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2966 Score = std::max(MyShifted, OtherShifted);
2967 return OtherShifted > MyShifted;
2968}
2969
/// Merge \p OtherMarks into this bracket's AsyncMarks. Both lists are aligned
/// on their most-recent entries (padding the shorter one at the front with
/// zero marks) and merged element-wise via mergeScore using the per-counter
/// \p MergeInfos. Returns true if any score was strictly tightened.
bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
                                      ArrayRef<CounterValueArray> OtherMarks) {
  bool StrictDom = false;

  LLVM_DEBUG(dbgs() << "Merging async marks ...");
  // Early exit: both empty
  if (AsyncMarks.empty() && OtherMarks.empty()) {
    LLVM_DEBUG(dbgs() << " nothing to merge\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << '\n');

  // Determine maximum length needed after merging
  auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
  MaxSize = std::min(MaxSize, MaxAsyncMarks);

  // Keep only the most recent marks within our limit.
  if (AsyncMarks.size() > MaxSize)
    AsyncMarks.erase(AsyncMarks.begin(),
                     AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));

  // Pad with zero-filled marks if our list is shorter. Zero represents "no
  // pending async operations at this checkpoint" and acts as the identity
  // element for max() during merging. We pad at the beginning since the marks
  // need to be aligned in most-recent order.
  constexpr CounterValueArray ZeroMark{};
  AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);

  LLVM_DEBUG({
    dbgs() << "Before merge:\n";
    for (const auto &Mark : AsyncMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
    dbgs() << "Other marks:\n";
    for (const auto &Mark : OtherMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
  });

  // Merge element-wise using the existing mergeScore function and the
  // appropriate MergeInfo for each counter type. Iterate only while we have
  // elements in both vectors.
  // Idx counts back from the most recent entry: Idx == 1 is the newest mark
  // in each list, so the two lists stay aligned on recency.
  unsigned OtherSize = OtherMarks.size();
  unsigned OurSize = AsyncMarks.size();
  unsigned MergeCount = std::min(OtherSize, OurSize);
  for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
    for (auto T : inst_counter_types(Context->MaxCounter)) {
      StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
                              OtherMarks[OtherSize - Idx][T]);
    }
  }

  LLVM_DEBUG({
    dbgs() << "After merge:\n";
    for (const auto &Mark : AsyncMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
  });

  return StrictDom;
}
3034
/// Merge the pending events and associated score brackets of \p Other into
/// this brackets status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  // Check if "other" has keys we don't have, and create default entries for
  // those. If they remain empty after merging, we will clean it up after.
  for (auto K : Other.VMem.keys())
    VMem.try_emplace(K);
  for (auto K : Other.SGPRs.keys())
    SGPRs.try_emplace(K);

  // Array to store MergeInfo for each counter type
  MergeInfo MergeInfos[NUM_INST_CNTS];

  for (auto T : inst_counter_types(Context->MaxCounter)) {
    // Merge event flags for this counter
    const WaitEventSet &EventsForT = Context->getWaitEvents(T);
    const WaitEventSet OldEvents = PendingEvents & EventsForT;
    const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
    // An event pending in Other that we did not track requires tighter waits.
    if (!OldEvents.contains(OtherEvents))
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter
    // Rebase both ranges onto our lower bound; the merged upper bound must
    // accommodate whichever side has more work in flight.
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error("waitcnt score overflow");

    MergeInfo &M = MergeInfos[T];
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    if (T == LOAD_CNT)
      StrictDom |= mergeScore(M, LastFlatLoadCnt, Other.LastFlatLoadCnt);

    if (T == DS_CNT) {
      StrictDom |= mergeScore(M, LastFlatDsCnt, Other.LastFlatDsCnt);
      StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
    }

    if (T == KM_CNT) {
      StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
      if (Other.hasPendingEvent(SCC_WRITE)) {
        // Keep the pending SCC writer only if both sides agree on it;
        // conflicting writers degrade to "unknown" (nullptr).
        if (!OldEvents.contains(SCC_WRITE)) {
          PendingSCCWrite = Other.PendingSCCWrite;
        } else if (PendingSCCWrite != Other.PendingSCCWrite) {
          PendingSCCWrite = nullptr;
        }
      }
    }

    for (auto &[RegID, Info] : VMem)
      StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));

    if (isSmemCounter(T)) {
      for (auto &[RegID, Info] : SGPRs) {
        auto It = Other.SGPRs.find(RegID);
        unsigned OtherScore = (It != Other.SGPRs.end()) ? It->second.get(T) : 0;
        StrictDom |= mergeScore(M, Info.get(T), OtherScore);
      }
    }
  }

  // Union the per-register VMEM type sets; any newly-seen type is a change.
  for (auto &[TID, Info] : VMem) {
    if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
      unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
      StrictDom |= NewVmemTypes != Info.VMEMTypes;
      Info.VMEMTypes = NewVmemTypes;
    }
  }

  StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks);
  for (auto T : inst_counter_types(Context->MaxCounter))
    StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]);

  purgeEmptyTrackingData();
  return StrictDom;
}
3123
3124static bool isWaitInstr(MachineInstr &Inst) {
3125 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
3126 return Opcode == AMDGPU::S_WAITCNT ||
3127 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
3128 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
3129 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
3130 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
3131 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
3132 Opcode == AMDGPU::WAIT_ASYNCMARK ||
3133 counterTypeForInstr(Opcode).has_value();
3134}
3135
/// Emit an S_SETREG_IMM32_B32 toggling the hardware scheduling mode:
/// immediate 2 enables expert mode, 0 restores the default.
// NOTE(review): this listing drops the insertion-iterator parameter `I` and
// the arguments of HwregEncoding::encode (dropped lines) — confirm against
// the upstream source.
void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
                                         bool ExpertMode) const {
  const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
  BuildMI(MBB, I, DebugLoc(), TII.get(AMDGPU::S_SETREG_IMM32_B32))
      .addImm(ExpertMode ? 2 : 0)
      .addImm(EncodedReg);
}
3145
3146namespace {
3147// TODO: Remove this work-around after fixing the scheduler.
3148// There are two reasons why vccz might be incorrect; see ST.hasReadVCCZBug()
3149// and ST.partialVCCWritesUpdateVCCZ().
3150// i. VCCZBug: There is a hardware bug on CI/SI where SMRD instruction may
3151// corrupt vccz bit, so when we detect that an instruction may read from
3152// a corrupt vccz bit, we need to:
3153// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
3154// operations to complete.
3155// 2. Recompute the correct value of vccz by writing the current value
3156// of vcc back to vcc.
3157// ii. Partial writes to vcc don't update vccz, so we need to recompute the
3158// correct value of vccz by reading vcc and writing it back to vcc.
3159// No waitcnt is needed in this case.
class VCCZWorkaround {
  const WaitcntBrackets &ScoreBrackets; // Consulted for pending SMEM accesses.
  const GCNSubtarget &ST;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  // Cached subtarget properties; see ST.hasReadVCCZBug() and
  // ST.partialVCCWritesUpdateVCCZ() respectively.
  bool VCCZCorruptionBug = false;
  bool VCCZNotUpdatedByPartialWrites = false;
  /// vccz could be incorrect at a basic block boundary if a predecessor wrote
  /// to vcc and then issued an smem load, so initialize to true.
  bool MustRecomputeVCCZ = true;

public:
  VCCZWorkaround(const WaitcntBrackets &ScoreBrackets, const GCNSubtarget &ST,
                 const SIInstrInfo &TII, const SIRegisterInfo &TRI)
      : ScoreBrackets(ScoreBrackets), ST(ST), TII(TII), TRI(TRI) {
    VCCZCorruptionBug = ST.hasReadVCCZBug();
    VCCZNotUpdatedByPartialWrites = !ST.partialVCCWritesUpdateVCCZ();
  }
  /// If \p MI reads vccz and we must recompute it based on MustRecomputeVCCZ,
  /// then emit a vccz recompute instruction before \p MI. This needs to be
  /// called on every instruction in the basic block because it also tracks the
  /// state and updates MustRecomputeVCCZ accordingly. Returns true if it
  /// modified the IR.
  bool tryRecomputeVCCZ(MachineInstr &MI) {
    // No need to run this if neither bug is present.
    if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
      return false;

    // If MI is an SMEM and it can corrupt vccz on this target, then we need
    // both to emit a waitcnt and to recompute vccz.
    // But we don't actually emit a waitcnt here. This is done in
    // generateWaitcntInstBefore() because it tracks all the necessary waitcnt
    // state, and can either skip emitting a waitcnt if there is already one in
    // the IR, or emit an "optimized" combined waitcnt.
    // If this is an smem read, it could complete and clobber vccz at any time.
    MustRecomputeVCCZ |= VCCZCorruptionBug && TII.isSMRD(MI);

    // If the target partial vcc writes don't update vccz, and MI is such an
    // instruction then we must recompute vccz.
    // Note: We are using PartiallyWritesToVCCOpt optional to avoid calling
    // `definesRegister()` more than needed, because it's not very cheap.
    std::optional<bool> PartiallyWritesToVCCOpt;
    auto PartiallyWritesToVCC = [](MachineInstr &MI) {
      return MI.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
             MI.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr);
    };
    if (VCCZNotUpdatedByPartialWrites) {
      PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
      // If this is a partial VCC write but won't update vccz, then we must
      // recompute vccz.
      MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
    }

    // If MI is a vcc write with no pending smem, or there is a pending smem
    // but the target does not suffer from the vccz corruption bug, then we
    // don't need to recompute vccz as this write will recompute it anyway.
    if (!ScoreBrackets.hasPendingEvent(SMEM_ACCESS) || !VCCZCorruptionBug) {
      // Compute PartiallyWritesToVCCOpt if we haven't done so already.
      if (!PartiallyWritesToVCCOpt)
        PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
      bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
                              MI.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr);
      // If we write to the full vcc or we write partially and the target
      // updates vccz on partial writes, then vccz will be updated correctly.
      bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
                                              *PartiallyWritesToVCCOpt);
      if (UpdatesVCCZ)
        MustRecomputeVCCZ = false;
    }

    // If MI is a branch that reads VCCZ then emit a waitcnt and a vccz
    // restore instruction if either is needed.
    if (SIInstrInfo::isCBranchVCCZRead(MI) && MustRecomputeVCCZ) {
      // Recompute the vccz bit. Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of vcc
      // and then writing it back to the register.
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII.get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI.getVCC())
          .addReg(TRI.getVCC());
      MustRecomputeVCCZ = false;
      return true;
    }
    return false;
  }
};
3246
3247} // namespace
3248
// Generate s_waitcnt instructions where needed.
// Walks the block in order, emitting/merging a waitcnt before each
// instruction as required, then updates the score brackets with the events
// the instruction produces. Returns true if the IR was modified.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Begin Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });
  VCCZWorkaround VCCZW(ScoreBrackets, ST, TII, TRI);

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  // NOTE: We may append instrs after Inst while iterating.
  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E; ++Iter) {
    MachineInstr &Inst = *Iter;
    if (Inst.isMetaInstruction())
      continue;
    // Track pre-existing waitcnts that were added in earlier iterations or by
    // the memory legalizer.
    if (isWaitInstr(Inst) ||
        (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      continue;
    }

    PreheaderFlushFlags FlushFlags;
    if (Block.getFirstTerminator() == Inst)
      FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushFlags);
    OldWaitcntInstr = nullptr;

    if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
      // Asyncmarks record the current wait state and so should not allow
      // waitcnts that occur after them to be merged into waitcnts that occur
      // before.
      ScoreBrackets.recordAsyncMark(Inst);
      continue;
    }

    if (TII.isSMRD(Inst)) {
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
        }
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

    // Note: insertForcedWaitAfter() may add instrs after Iter that need to be
    // visited by the loop.
    Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // If the target suffers from the vccz bugs, this may emit the necessary
    // vccz recompute instruction before \p Inst if needed.
    Modified |= VCCZW.tryRecomputeVCCZ(Inst);
  }

  // Flush counters at the end of the block if needed (for preheaders with no
  // terminator).
  AMDGPU::Waitcnt Wait;
  if (Block.getFirstTerminator() == Block.end()) {
    PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
    if (FlushFlags.FlushVmCnt) {
      // Only request a zero wait for counters that have work in flight.
      if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
        Wait.set(LOAD_CNT, 0);
      if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
        Wait.set(SAMPLE_CNT, 0);
      if (ScoreBrackets.hasPendingEvent(BVH_CNT))
        Wait.set(BVH_CNT, 0);
    }
    if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
      Wait.set(DS_CNT, 0);
  }

  // Combine or remove any redundant waitcnts at the end of the block.
  Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                              OldWaitcntInstr);

  LLVM_DEBUG({
    dbgs() << "*** End Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });

  return Modified;
}
3354
3355bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
3356 if (Block.size() <= 1)
3357 return false;
3358 // The Memory Legalizer conservatively inserts a soft xcnt before each
3359 // atomic RMW operation. However, for sequences of back-to-back atomic
3360 // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
3361 // the redundant soft xcnts.
3362 bool Modified = false;
3363 // Remember the last atomic with a soft xcnt right before it.
3364 MachineInstr *LastAtomicWithSoftXcnt = nullptr;
3365
3366 for (MachineInstr &MI : drop_begin(Block)) {
3367 // Ignore last atomic if non-LDS VMEM and SMEM.
3368 bool IsLDS =
3369 TII.isDS(MI) || (TII.isFLAT(MI) && TII.mayAccessLDSThroughFlat(MI));
3370 if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
3371 LastAtomicWithSoftXcnt = nullptr;
3372
3373 bool IsAtomicRMW = (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) &&
3374 MI.mayLoad() && MI.mayStore();
3375 MachineInstr &PrevMI = *MI.getPrevNode();
3376 // This is an atomic with a soft xcnt.
3377 if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3378 // If we have already found an atomic with a soft xcnt, remove this soft
3379 // xcnt as it's redundant.
3380 if (LastAtomicWithSoftXcnt) {
3381 PrevMI.eraseFromParent();
3382 Modified = true;
3383 }
3384 LastAtomicWithSoftXcnt = &MI;
3385 }
3386 }
3387 return Modified;
3388}
3389
3390// Return flags indicating which counters should be flushed in the preheader.
3391PreheaderFlushFlags
3392SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3393 const WaitcntBrackets &ScoreBrackets) {
3394 auto [Iterator, IsInserted] =
3395 PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
3396 if (!IsInserted)
3397 return Iterator->second;
3398
3399 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3400 if (!Succ)
3401 return PreheaderFlushFlags();
3402
3403 MachineLoop *Loop = MLI.getLoopFor(Succ);
3404 if (!Loop)
3405 return PreheaderFlushFlags();
3406
3407 if (Loop->getLoopPreheader() == &MBB) {
3408 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3409 return Iterator->second;
3410 }
3411
3412 return PreheaderFlushFlags();
3413}
3414
// Check whether MI is a VMEM access, treating FLAT instructions as VMEM only
// when they can actually reach VMEM.
// NOTE(review): the FLAT guard line is missing from this listing (dropped
// line) — confirm against the upstream source.
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
    return TII.mayAccessVMEMThroughFlat(MI);
  return SIInstrInfo::isVMEM(MI);
}
3420
3421bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3422 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3423}
3424
3425// Check if instruction is a store to LDS that is counted via DSCNT
3426// (where that counter exists).
3427bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3428 return MI.mayStore() && SIInstrInfo::isDS(MI);
3429}
3430
3431// Return flags indicating which counters should be flushed in the preheader of
3432// the given loop. We currently decide to flush in the following situations:
3433// For VMEM (FlushVmCnt):
3434// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3435// vgpr containing a value that is loaded outside of the loop. (Only on
3436// targets with no vscnt counter).
3437// 2. The loop contains vmem load(s), but the loaded values are not used in the
3438// loop, and at least one use of a vgpr containing a value that is loaded
3439// outside of the loop.
3440// For DS (FlushDsCnt, GFX12+ only):
3441// 3. The loop contains no DS reads, and at least one use of a vgpr containing
3442// a value that is DS read outside of the loop.
3443// 4. The loop contains DS read(s), loaded values are not used in the same
3444// iteration but in the next iteration (prefetch pattern), and at least one
3445// use of a vgpr containing a value that is DS read outside of the loop.
3446// Flushing in preheader reduces wait overhead if the wait requirement in
3447// iteration 1 would otherwise be more strict (but unfortunately preheader
3448// flush decision is taken before knowing that).
3449// 5. (Single-block loops only) The loop has DS prefetch reads with flush point
3450// tracking. Some DS reads may be used in the same iteration (creating
3451// "flush points"), but others remain unflushed at the backedge. When a DS
3452// read is consumed in the same iteration, it and all prior reads are
3453// "flushed" (FIFO order). No DS writes are allowed in the loop.
3454// TODO: Find a way to extend to multi-block loops.
// Decide which counters (if any) should be flushed in the loop preheader.
// Walks every instruction in \p ML, tracking which VGPR register units are
// used vs. defined by VMEM loads / DS reads inside the loop, and compares
// against \p Brackets (the pending-event state at the preheader) to detect
// values produced outside the loop but consumed inside it.
PreheaderFlushFlags
SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
                                         const WaitcntBrackets &Brackets) {
  PreheaderFlushFlags Flags;
  // VMEM-side tracking for the flush cases described above.
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprVMEMLoadedOutside = false;
  bool UsesVgprDSReadOutside = false;
  bool VMemInvalidated = false;
  // DS optimization only applies to GFX12+ where DS_CNT is separate.
  // Tracking status for "no DS read in loop" or "pure DS prefetch
  // (use only in next iteration)".
  bool TrackSimpleDSOpt = ST.hasExtendedWaitCounts();
  // Register units used in the loop, defined by VMEM loads in the loop, and
  // defined by DS reads in the loop, respectively.
  DenseSet<MCRegUnit> VgprUse;
  DenseSet<MCRegUnit> VgprDefVMEM;
  DenseSet<MCRegUnit> VgprDefDS;

  // Track DS reads for prefetch pattern with flush points (single-block only).
  // Keeps track of the last DS read (position counted from the top of the loop)
  // to each VGPR. Read is considered consumed (and thus needs flushing) if
  // the dest register has a use or is overwritten (by any later operations).
  DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
  unsigned DSReadPosition = 0;
  bool IsSingleBlock = ML->getNumBlocks() == 1;
  bool TrackDSFlushPoint = ST.hasExtendedWaitCounts() && IsSingleBlock;
  // Position (in DS-read count) up to which all DS reads are known to be
  // forced complete inside the loop (FIFO order).
  unsigned LastDSFlushPosition = 0;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        HasVMemLoad |= MI.mayLoad();
        HasVMemStore |= MI.mayStore();
      }
      // TODO: Can we relax DSStore check? There may be cases where
      // these DS stores are drained prior to the end of MBB (or loop).
      if (mayStoreIncrementingDSCNT(MI)) {
        // Early exit if none of the optimizations are feasible.
        // Otherwise, set tracking status appropriately and continue.
        if (VMemInvalidated)
          return Flags;
        TrackSimpleDSOpt = false;
        TrackDSFlushPoint = false;
      }
      bool IsDSRead = isDSRead(MI);
      if (IsDSRead)
        ++DSReadPosition;

      // Helper: if RU has a pending DS read, update LastDSFlushPosition
      auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
        if (!TrackDSFlushPoint)
          return;
        if (auto It = LastDSReadPositionMap.find(RU);
            It != LastDSReadPositionMap.end()) {
          // RU defined by DSRead is used or overwritten. Need to complete
          // the read, if not already implied by a later DSRead (to any RU)
          // needing to complete in FIFO order.
          LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
        }
      };

      for (const MachineOperand &Op : MI.all_uses()) {
        if (Op.isDebug() || !TRI.isVectorRegister(MRI, Op.getReg()))
          continue;
        // Vgpr use
        for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
          // If we find a register that is loaded inside the loop, 1. and 2.
          // are invalidated.
          if (VgprDefVMEM.contains(RU))
            VMemInvalidated = true;

          // Check for DS reads used inside the loop
          if (VgprDefDS.contains(RU))
            TrackSimpleDSOpt = false;

          // Early exit if all optimizations are invalidated
          if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
            return Flags;

          // Check for flush points (DS read used in same iteration)
          updateDSReadFlushTracking(RU);

          VgprUse.insert(RU);
          // Check if this register has a pending VMEM load from outside the
          // loop (value loaded outside and used inside).
          VMEMID ID = toVMEMID(RU);
          if (Brackets.hasPendingVMEM(ID, LOAD_CNT) ||
              Brackets.hasPendingVMEM(ID, SAMPLE_CNT) ||
              Brackets.hasPendingVMEM(ID, BVH_CNT))
            UsesVgprVMEMLoadedOutside = true;
          // Check if loaded outside the loop via DS (not VMEM/FLAT).
          // Only consider it a DS read if there's no pending VMEM load for
          // this register, since FLAT can set both counters.
          else if (Brackets.hasPendingVMEM(ID, DS_CNT))
            UsesVgprDSReadOutside = true;
        }
      }

      // VMem load vgpr def
      if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
        for (const MachineOperand &Op : MI.all_defs()) {
          for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated.
            if (VgprUse.contains(RU))
              VMemInvalidated = true;
            VgprDefVMEM.insert(RU);
          }
        }
        // Early exit if all optimizations are invalidated
        if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
          return Flags;
      }

      // DS read vgpr def
      // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RegNo).
      // If USE comes before DEF, it's the prefetch pattern (use value from
      // previous iteration, read for next iteration). We should still flush
      // in preheader so iteration 1 doesn't need to wait inside the loop.
      // Only invalidate when DEF comes before USE (same-iteration consumption,
      // checked above when processing uses).
      if (IsDSRead || TrackDSFlushPoint) {
        for (const MachineOperand &Op : MI.all_defs()) {
          if (!TRI.isVectorRegister(MRI, Op.getReg()))
            continue;
          for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
            // Check for overwrite of pending DS read (flush point) by any
            // instruction
            updateDSReadFlushTracking(RU);
            if (IsDSRead) {
              VgprDefDS.insert(RU);
              if (TrackDSFlushPoint)
                LastDSReadPositionMap[RU] = DSReadPosition;
            }
          }
        }
      }
    }
  }

  // VMEM flush decision
  if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
      ((!ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
       (HasVMemLoad && ST.hasVmemWriteVgprInOrder())))
    Flags.FlushVmCnt = true;

  // DS flush decision:
  // Simple DS Opt: flush if loop uses DS read values from outside
  // and either has no DS reads in the loop, or DS reads whose results
  // are not used in the loop.
  bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
  // Prefetch with flush points: some DS reads used in same iteration,
  // but unflushed reads remain at backedge
  bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
  bool DSFlushPointPrefetch =
      TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;

  if (SimpleDSOpt || DSFlushPointPrefetch)
    Flags.FlushDsCnt = true;

  return Flags;
}
3616
3617bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3618 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3619 auto &PDT =
3620 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3621 AliasAnalysis *AA = nullptr;
3622 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3623 AA = &AAR->getAAResults();
3624
3625 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3626}
3627
// New pass manager entry point: fetch the required analyses from the
// machine-function analysis manager and run the shared implementation.
// NOTE(review): several lines of this definition (the signature and the
// AA/result-construction statements) appear to be missing here — verify
// against the canonical source before editing.
PreservedAnalyses
  auto &MLI = MFAM.getResult<MachineLoopAnalysis>(MF);
  auto &PDT = MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
          .getManager()
          .getCachedResult<AAManager>(MF.getFunction());

  // If nothing changed, all analyses remain valid.
  if (!SIInsertWaitcnts(MLI, PDT, AA, MF).run())
    return PreservedAnalyses::all();

      .preserve<AAManager>();
}
3644
// Main driver for the pass: set up the per-function wait-count generator,
// insert entry waits for non-kernel functions, iterate the CFG in reverse
// post-order until the bracket dataflow reaches a fix point, then apply
// subtarget-specific epilogue fixups (scalar-store cache flush, expert
// scheduling mode toggles, VGPR deallocation messages).
bool SIInsertWaitcnts::run() {

  // Initialize hardware limits first, as they're needed by the generators.
  Limits = AMDGPU::HardwareLimits(IV);

  if (ST.hasExtendedWaitCounts()) {
    // Command-line flag overrides the function attribute when present.
    IsExpertMode = ST.hasExpertSchedulingMode() &&
                   (ExpertSchedulingModeFlag.getNumOccurrences()
                        : MF.getFunction()
                              .getFnAttribute("amdgpu-expert-scheduling-mode")
                              .getValueAsBool());
    MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
    // Initialize WCG per MF. It contains state that depends on MF attributes.
    WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
                                                      IsExpertMode);
  } else {
    MaxCounter = NUM_NORMAL_INST_CNTS;
    // Initialize WCG per MF. It contains state that depends on MF attributes.
    WCG = std::make_unique<WaitcntGeneratorPreGFX12>(MF, NUM_NORMAL_INST_CNTS,
                                                     Limits);
  }

  // Counter that SMEM accesses increment on this subtarget.
  SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);

  bool Modified = false;

  MachineBasicBlock &EntryBB = MF.front();

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    while (I != EntryBB.end() && I->isMetaInstruction())
      ++I;

    if (ST.hasExtendedWaitCounts()) {
      // Zero-wait every separate counter on entry (LOADCNT/DSCNT combined
      // form first, then the remaining per-counter instructions).
      BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
          .addImm(0);
        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT ||
            CT == ASYNC_CNT)
          continue;

        if (!ST.hasImageInsts() &&
            (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
          continue;

        BuildMI(EntryBB, I, DebugLoc(),
                TII.get(instrsForExtendedCounterTypes[CT]))
            .addImm(0);
      }
      if (IsExpertMode) {
        // Expert mode also requires draining VA_VDST via DEPCTR on entry.
        unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, ST);
        BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR))
            .addImm(Enc);
      }
    } else {
      BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT)).addImm(0);
    }

    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
    NonKernelInitialState->setStateOnFunctionEntryOrReturn();
    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

    Modified = true;
  }

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fix point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.try_emplace(MBB);

  // Scratch bracket state, reused (and re-initialized in place) across
  // blocks to avoid repeatedly allocating this large object.
  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      MachineBasicBlock *MBB = BII->first;
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets) {
          Brackets = std::make_unique<WaitcntBrackets>(this);
        } else {
          // Reinitialize in-place. N.B. do not do this by assigning from a
          // temporary because the WaitcntBrackets class is large and it could
          // cause this function to use an unreasonable amount of stack space.
          Brackets->~WaitcntBrackets();
          new (Brackets.get()) WaitcntBrackets(this);
        }
      }

      if (ST.hasWaitXcnt())
        Modified |= removeRedundantSoftXcnts(*MBB);
      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPendingEvent()) {
        // Propagate the outgoing bracket state to each successor. The first
        // successor without incoming state takes ownership of Brackets;
        // further such successors get a copy; successors that already have
        // state are merged into. A dirtied successor at or before the
        // current position forces another fix-point iteration.
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto *SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII) {
              LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
              Repeat = true;
            }
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else {
            LLVM_DEBUG({
              dbgs() << "Try to merge ";
              MBB->printName(dbgs());
              dbgs() << " into ";
              Succ->printName(dbgs());
              dbgs() << '\n';
            });
            if (SuccBI.Incoming->merge(*Brackets)) {
              SuccBI.Dirty = true;
              if (SuccBII <= BII) {
                LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
                Repeat = true;
              }
            }
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  if (ST.hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    // Collect all wave-termination blocks and note whether any scalar
    // store exists anywhere in the function.
    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII.isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(&MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed or else the next
      // wave to reuse the same scratch memory can be clobbered.
      //
      // Insert s_dcache_wb at wave termination points if there were any scalar
      // stores, and only if the cache hasn't already been flushed. This could
      // be improved by looking across blocks for flushes in postdominating
      // blocks from the stores but an explicitly requested flush is probably
      // very rare.
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII.isScalarStore(*I))
            SeenDCacheWB = false;

          // FIXME: It would be better to insert this before a waitcnt if any.
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII.get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }

  if (IsExpertMode) {
    // Enable expert scheduling on function entry. To satisfy ABI requirements
    // and to allow calls between function with different expert scheduling
    // settings, disable it around calls and before returns.

    while (I != EntryBB.end() && I->isMetaInstruction())
      ++I;
    setSchedulingMode(EntryBB, I, true);

    for (MachineInstr *MI : CallInsts) {
      MachineBasicBlock &MBB = *MI->getParent();
      setSchedulingMode(MBB, MI, false);
      setSchedulingMode(MBB, std::next(MI->getIterator()), true);
    }

    for (MachineInstr *MI : ReturnInsts)
      setSchedulingMode(*MI->getParent(), MI, false);

    Modified = true;
  }

  // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
  // This is done in different ways depending on how the VGPRs were allocated
  // (i.e. whether we're in dynamic VGPR mode or not).
  // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
  // waveslot limited kernel runs slower with the deallocation.
  if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
    for (auto [MI, _] : EndPgmInsts) {
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
              TII.get(AMDGPU::S_ALLOC_VGPR))
          .addImm(0);
      Modified = true;
    }
  } else if (!WCG->isOptNone() &&
             ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
             (MF.getFrameInfo().hasCalls() ||
              ST.getOccupancyWithNumVGPRs(
                  TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
                  /*IsDynamicVGPR=*/false) <
    for (auto [MI, Flag] : EndPgmInsts) {
      if (Flag) {
        // Some subtargets need an S_NOP before the dealloc message.
        if (ST.requiresNopBeforeDeallocVGPRs()) {
          BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                  TII.get(AMDGPU::S_NOP))
              .addImm(0);
        }
        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                TII.get(AMDGPU::S_SENDMSG))
        Modified = true;
      }
    }
  }

  return Modified;
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
#define _
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
#define AMDGPU_EVENT_NAME(Name)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
Represents the counter values to wait for in an s_waitcnt instruction.
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
static bool shouldExecute(CounterInfo &Counter)
static bool isCounterSet(CounterInfo &Info)
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
bool isMetaInstruction(QueryType Type=IgnoreBundle) const
Return true if this instruction doesn't produce any output in the form of executable instructions.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator end()
Definition MapVector.h:67
iterator find(const KeyT &Key)
Definition MapVector.h:154
iterator begin()
Definition MapVector.h:65
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition Pass.cpp:140
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:882
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
bool isDPMACCInstruction(unsigned Opc)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
bool empty() const
Definition BasicBlock.h:101
LLVM_ABI std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
Definition Sequence.h:325
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
APInt operator&(APInt a, const APInt &b)
Definition APInt.h:2141
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition Sequence.h:337
@ Wait
Definition Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2131
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2313
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
Definition CodeGen.h:157
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool operator|=(SparseBitVector< ElementSize > &LHS, const SparseBitVector< ElementSize > *RHS)
APInt operator|(APInt a, const APInt &b)
Definition APInt.h:2161
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
#define N
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.