// LLVM 23.0.0git — SIInsertWaitcnts.cpp
// (doxygen page chrome left over from HTML extraction; not part of the
// original source file.)
//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
///
/// TODO: This pass currently keeps one timeline per hardware counter. A more
/// finely-grained approach that keeps one timeline per event type could
/// sometimes get away with generating weaker s_waitcnt instructions. For
/// example, when both SMEM and LDS are in flight and we need to wait for
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
/// but the pass will currently generate a conservative lgkmcnt(0) because
/// multiple event types are in flight.
//
//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39#include "llvm/IR/Dominators.h"
43
44using namespace llvm;
45using namespace llvm::AMDGPU;
46
47#define DEBUG_TYPE "si-insert-waitcnts"
48
49DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
50 "Force emit s_waitcnt expcnt(0) instrs");
51DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
52 "Force emit s_waitcnt lgkmcnt(0) instrs");
53DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
54 "Force emit s_waitcnt vmcnt(0) instrs");
55
56static cl::opt<bool>
57 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
58 cl::desc("Force all waitcnt instrs to be emitted as "
59 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
60 cl::init(false), cl::Hidden);
61
63 "amdgpu-waitcnt-load-forcezero",
64 cl::desc("Force all waitcnt load counters to wait until 0"),
65 cl::init(false), cl::Hidden);
66
68 "amdgpu-expert-scheduling-mode",
69 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
70 cl::init(false), cl::Hidden);
71
72namespace {
73// Get the maximum wait count value for a given counter type.
74static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
76 switch (T) {
77 case LOAD_CNT:
78 return Limits.LoadcntMax;
79 case DS_CNT:
80 return Limits.DscntMax;
81 case EXP_CNT:
82 return Limits.ExpcntMax;
83 case STORE_CNT:
84 return Limits.StorecntMax;
85 case SAMPLE_CNT:
86 return Limits.SamplecntMax;
87 case BVH_CNT:
88 return Limits.BvhcntMax;
89 case KM_CNT:
90 return Limits.KmcntMax;
91 case X_CNT:
92 return Limits.XcntMax;
93 case VA_VDST:
94 return Limits.VaVdstMax;
95 case VM_VSRC:
96 return Limits.VmVsrcMax;
97 default:
98 return 0;
99 }
100}
101
/// Integer IDs used to track vector memory locations we may have to wait on.
/// Encoded as u16 chunks:
///
///   [0,            REGUNITS_END): MCRegUnit
///   [LDSDMA_BEGIN, LDSDMA_END ): LDS DMA IDs
///
/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
/// It gives (1 << 16) entries per category which is more than enough
/// for all register units. MCPhysReg is u16 so we don't even support >u16
/// physical register numbers at this time, let alone >u16 register units.
/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
/// is enough for all register units.
using VMEMID = uint32_t;

enum : VMEMID {
  TRACKINGID_RANGE_LEN = (1 << 16),

  // Important: MCRegUnits must always be tracked starting from 0, as we
  // need to be able to convert between a MCRegUnit and a VMEMID freely.
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,

  // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
  // entry, which is updated for all LDS DMA operations encountered.
  // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};
131
132/// Convert a MCRegUnit to a VMEMID.
133static constexpr VMEMID toVMEMID(MCRegUnit RU) {
134 return static_cast<unsigned>(RU);
135}
136
// X-macro listing every wait event tracked by this pass; expanded below to
// build the WaitEventType enum and the parallel name table.
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
  DECL(VMEM_ACCESS)              /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \
  DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */         \
  DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
  DECL(GLOBAL_INV_ACCESS)        /* GLOBAL_INV (gfx12+ only) */                \
  DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
  DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
  DECL(VMEM_GROUP)               /* vmem group */                              \
  DECL(LDS_ACCESS)               /* lds read & write */                        \
  DECL(GDS_ACCESS)               /* gds read & write */                        \
  DECL(SQ_MESSAGE)               /* send message */                            \
  DECL(SCC_WRITE)                /* write to SCC from barrier */               \
  DECL(SMEM_ACCESS)              /* scalar-memory read & write */              \
  DECL(SMEM_GROUP)               /* scalar-memory group */                     \
  DECL(EXP_GPR_LOCK)             /* export holding on its data src */          \
  DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */    \
  DECL(EXP_POS_ACCESS)           /* write to export position */                \
  DECL(EXP_PARAM_ACCESS)         /* write to export parameter */               \
  DECL(VMW_GPR_LOCK)             /* vmem write holding on its data src */      \
  DECL(EXP_LDS_ACCESS)           /* read by ldsdir counting as export */       \
  DECL(VGPR_CSMACC_WRITE)        /* write VGPR dest in Core/Side-MACC VALU */  \
  DECL(VGPR_DPMACC_WRITE)        /* write VGPR dest in DPMACC VALU */          \
  DECL(VGPR_TRANS_WRITE)         /* write VGPR dest in TRANS VALU */           \
  DECL(VGPR_XDL_WRITE)           /* write VGPR dest in XDL VALU */             \
  DECL(VGPR_LDS_READ)            /* read VGPR source in LDS */                 \
  DECL(VGPR_FLAT_READ)           /* read VGPR source in FLAT */                \
  DECL(VGPR_VMEM_READ)           /* read VGPR source in other VMEM */

// clang-format off
#define AMDGPU_EVENT_ENUM(Name) Name,
enum WaitEventType {
  // NOTE(review): this macro invocation line was dropped by the doxygen
  // extraction; reconstructed to match the name-table expansion below.
  AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
  NUM_WAIT_EVENTS
};
#undef AMDGPU_EVENT_ENUM
} // namespace
173
174namespace llvm {
175template <> struct enum_iteration_traits<WaitEventType> {
176 static constexpr bool is_iterable = true;
177};
178} // namespace llvm
179
180namespace {
181
182/// Return an iterator over all events between VMEM_ACCESS (the first event)
183/// and \c MaxEvent (exclusive, default value yields an enumeration over
184/// all counters).
185auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
186 return enum_seq(VMEM_ACCESS, MaxEvent);
187}
188
189#define AMDGPU_EVENT_NAME(Name) #Name,
190static constexpr StringLiteral WaitEventTypeName[] = {
192};
193#undef AMDGPU_EVENT_NAME
194static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
195 return WaitEventTypeName[Event];
196}
197// clang-format on
198
// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  NUM_VMEM_TYPES
};
213
214// Maps values of InstCounterType to the instruction that waits on that
215// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
216// returns true, and does not cover VA_VDST or VM_VSRC.
217static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
218 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
219 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
220 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
221
222static bool updateVMCntOnly(const MachineInstr &Inst) {
223 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
225}
226
227#ifndef NDEBUG
228static bool isNormalMode(InstCounterType MaxCounter) {
229 return MaxCounter == NUM_NORMAL_INST_CNTS;
230}
231#endif // NDEBUG
232
233VmemType getVmemType(const MachineInstr &Inst) {
234 assert(updateVMCntOnly(Inst));
235 if (!SIInstrInfo::isImage(Inst))
236 return VMEM_NOSAMPLER;
237 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
238 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
239 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
240
241 if (BaseInfo->BVH)
242 return VMEM_BVH;
243
244 // We have to make an additional check for isVSAMPLE here since some
245 // instructions don't have a sampler, but are still classified as sampler
246 // instructions for the purposes of e.g. waitcnt.
247 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
248 return VMEM_SAMPLER;
249
250 return VMEM_NOSAMPLER;
251}
252
253void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
254 Wait.set(T, std::min(Wait.get(T), Count));
255}
256
257void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { Wait.set(T, ~0u); }
258
259/// A small set of events.
260class WaitEventSet {
261 unsigned Mask = 0;
262
263public:
264 WaitEventSet() = default;
265 explicit constexpr WaitEventSet(WaitEventType Event) {
266 static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,
267 "Not enough bits in Mask for all the events");
268 Mask |= 1 << Event;
269 }
270 constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
271 for (auto &E : Events) {
272 Mask |= 1 << E;
273 }
274 }
275 void insert(const WaitEventType &Event) { Mask |= 1 << Event; }
276 void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }
277 void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
278 bool contains(const WaitEventType &Event) const {
279 return Mask & (1 << Event);
280 }
281 /// \Returns true if this set contains all elements of \p Other.
282 bool contains(const WaitEventSet &Other) const {
283 return (~Mask & Other.Mask) == 0;
284 }
285 /// \Returns the intersection of this and \p Other.
286 WaitEventSet operator&(const WaitEventSet &Other) const {
287 auto Copy = *this;
288 Copy.Mask &= Other.Mask;
289 return Copy;
290 }
291 /// \Returns the union of this and \p Other.
292 WaitEventSet operator|(const WaitEventSet &Other) const {
293 auto Copy = *this;
294 Copy.Mask |= Other.Mask;
295 return Copy;
296 }
297 /// This set becomes the union of this and \p Other.
298 WaitEventSet &operator|=(const WaitEventSet &Other) {
299 Mask |= Other.Mask;
300 return *this;
301 }
302 /// This set becomes the intersection of this and \p Other.
303 WaitEventSet &operator&=(const WaitEventSet &Other) {
304 Mask &= Other.Mask;
305 return *this;
306 }
307 bool operator==(const WaitEventSet &Other) const {
308 return Mask == Other.Mask;
309 }
310 bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }
311 bool empty() const { return Mask == 0; }
312 /// \Returns true if the set contains more than one element.
313 bool twoOrMore() const { return Mask & (Mask - 1); }
314 operator bool() const { return !empty(); }
315 void print(raw_ostream &OS) const {
316 ListSeparator LS(", ");
317 for (WaitEventType Event : wait_events()) {
318 if (contains(Event))
319 OS << LS << getWaitEventTypeName(Event);
320 }
321 }
322 LLVM_DUMP_METHOD void dump() const;
323};
324
325void WaitEventSet::dump() const {
326 print(dbgs());
327 dbgs() << "\n";
328}
329
330class WaitcntBrackets;
331
332// This abstracts the logic for generating and updating S_WAIT* instructions
333// away from the analysis that determines where they are needed. This was
334// done because the set of counters and instructions for waiting on them
335// underwent a major shift with gfx12, sufficiently so that having this
336// abstraction allows the main analysis logic to be simpler than it would
337// otherwise have had to become.
338class WaitcntGenerator {
339protected:
340 const GCNSubtarget &ST;
341 const SIInstrInfo &TII;
342 AMDGPU::IsaVersion IV;
343 InstCounterType MaxCounter;
344 bool OptNone;
345 bool ExpandWaitcntProfiling = false;
346 const AMDGPU::HardwareLimits *Limits = nullptr;
347
348public:
349 WaitcntGenerator() = delete;
350 WaitcntGenerator(const WaitcntGenerator &) = delete;
351 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
352 const AMDGPU::HardwareLimits *Limits)
353 : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
354 IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
355 OptNone(MF.getFunction().hasOptNone() ||
356 MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
357 ExpandWaitcntProfiling(
358 MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
359 Limits(Limits) {}
360
361 // Return true if the current function should be compiled with no
362 // optimization.
363 bool isOptNone() const { return OptNone; }
364
365 const AMDGPU::HardwareLimits &getLimits() const { return *Limits; }
366
367 // Edits an existing sequence of wait count instructions according
368 // to an incoming Waitcnt value, which is itself updated to reflect
369 // any new wait count instructions which may need to be generated by
370 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
371 // were made.
372 //
373 // This editing will usually be merely updated operands, but it may also
374 // delete instructions if the incoming Wait value indicates they are not
375 // needed. It may also remove existing instructions for which a wait
376 // is needed if it can be determined that it is better to generate new
377 // instructions later, as can happen on gfx12.
378 virtual bool
379 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
380 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
382
383 // Transform a soft waitcnt into a normal one.
384 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
385
386 // Generates new wait count instructions according to the value of
387 // Wait, returning true if any new instructions were created.
388 // ScoreBrackets is used for profiling expansion.
389 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
391 AMDGPU::Waitcnt Wait,
392 const WaitcntBrackets &ScoreBrackets) = 0;
393
394 // Returns the WaitEventSet that corresponds to counter \p T.
395 virtual const WaitEventSet &getWaitEvents(InstCounterType T) const = 0;
396
397 /// \returns the counter that corresponds to event \p E.
398 InstCounterType getCounterFromEvent(WaitEventType E) const {
399 for (auto T : inst_counter_types()) {
400 if (getWaitEvents(T).contains(E))
401 return T;
402 }
403 llvm_unreachable("event type has no associated counter");
404 }
405
406 // Returns a new waitcnt with all counters except VScnt set to 0. If
407 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
408 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
409
410 virtual ~WaitcntGenerator() = default;
411};
412
413class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
414 static constexpr const WaitEventSet
415 WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
416 WaitEventSet(
417 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
418 WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
419 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
420 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
421 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
422 WaitEventSet(),
423 WaitEventSet(),
424 WaitEventSet(),
425 WaitEventSet(),
426 WaitEventSet(),
427 WaitEventSet()};
428
429public:
430 using WaitcntGenerator::WaitcntGenerator;
431 bool
432 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
433 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
434 MachineBasicBlock::instr_iterator It) const override;
435
436 bool createNewWaitcnt(MachineBasicBlock &Block,
438 AMDGPU::Waitcnt Wait,
439 const WaitcntBrackets &ScoreBrackets) override;
440
441 const WaitEventSet &getWaitEvents(InstCounterType T) const override {
442 return WaitEventMaskForInstPreGFX12[T];
443 }
444
445 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
446};
447
448class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
449protected:
450 bool IsExpertMode;
451 static constexpr const WaitEventSet
452 WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
453 WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
454 WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
455 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
456 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
457 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
458 WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
459 WaitEventSet({VMEM_BVH_READ_ACCESS}),
460 WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
461 WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
462 WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
463 VGPR_XDL_WRITE}),
464 WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
465
466public:
467 WaitcntGeneratorGFX12Plus() = delete;
468 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
469 InstCounterType MaxCounter,
470 const AMDGPU::HardwareLimits *Limits,
471 bool IsExpertMode)
472 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
473
474 bool
475 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
476 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
477 MachineBasicBlock::instr_iterator It) const override;
478
479 bool createNewWaitcnt(MachineBasicBlock &Block,
481 AMDGPU::Waitcnt Wait,
482 const WaitcntBrackets &ScoreBrackets) override;
483
484 const WaitEventSet &getWaitEvents(InstCounterType T) const override {
485 return WaitEventMaskForInstGFX12Plus[T];
486 }
487
488 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
489};
490
// Flags indicating which counters should be flushed in a loop preheader.
struct PreheaderFlushFlags {
  bool FlushVmCnt = false;
  bool FlushDsCnt = false;
};
496
497class SIInsertWaitcnts {
498public:
499 const GCNSubtarget *ST;
500 const SIInstrInfo *TII = nullptr;
501 const SIRegisterInfo *TRI = nullptr;
502 const MachineRegisterInfo *MRI = nullptr;
503 InstCounterType SmemAccessCounter;
504 InstCounterType MaxCounter;
505 bool IsExpertMode = false;
506
507private:
508 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
509 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
510 MachineLoopInfo &MLI;
511 MachinePostDominatorTree &PDT;
512 AliasAnalysis *AA = nullptr;
513 MachineFunction &MF;
514
515 struct BlockInfo {
516 std::unique_ptr<WaitcntBrackets> Incoming;
517 bool Dirty = true;
518 };
519
520 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
521
522 bool ForceEmitWaitcnt[NUM_INST_CNTS];
523
524 std::unique_ptr<WaitcntGenerator> WCG;
525
526 // Remember call and return instructions in the function.
527 DenseSet<MachineInstr *> CallInsts;
528 DenseSet<MachineInstr *> ReturnInsts;
529
530 // Remember all S_ENDPGM instructions. The boolean flag is true if there might
531 // be outstanding stores but definitely no outstanding scratch stores, to help
532 // with insertion of DEALLOC_VGPRS messages.
533 DenseMap<MachineInstr *, bool> EndPgmInsts;
534
535 AMDGPU::HardwareLimits Limits;
536
537public:
538 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
539 AliasAnalysis *AA, MachineFunction &MF)
540 : MLI(MLI), PDT(PDT), AA(AA), MF(MF) {
541 (void)ForceExpCounter;
542 (void)ForceLgkmCounter;
543 (void)ForceVMCounter;
544 }
545
546 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
547
548 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
549 const WaitcntBrackets &Brackets);
550 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
551 const WaitcntBrackets &ScoreBrackets);
552 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
553 bool isDSRead(const MachineInstr &MI) const;
554 bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
555 bool run();
556
557 void setForceEmitWaitcnt() {
558// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
559// For debug builds, get the debug counter info and adjust if need be
560#ifndef NDEBUG
561 if (DebugCounter::isCounterSet(ForceExpCounter) &&
562 DebugCounter::shouldExecute(ForceExpCounter)) {
563 ForceEmitWaitcnt[EXP_CNT] = true;
564 } else {
565 ForceEmitWaitcnt[EXP_CNT] = false;
566 }
567
568 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
569 DebugCounter::shouldExecute(ForceLgkmCounter)) {
570 ForceEmitWaitcnt[DS_CNT] = true;
571 ForceEmitWaitcnt[KM_CNT] = true;
572 } else {
573 ForceEmitWaitcnt[DS_CNT] = false;
574 ForceEmitWaitcnt[KM_CNT] = false;
575 }
576
577 if (DebugCounter::isCounterSet(ForceVMCounter) &&
578 DebugCounter::shouldExecute(ForceVMCounter)) {
579 ForceEmitWaitcnt[LOAD_CNT] = true;
580 ForceEmitWaitcnt[SAMPLE_CNT] = true;
581 ForceEmitWaitcnt[BVH_CNT] = true;
582 } else {
583 ForceEmitWaitcnt[LOAD_CNT] = false;
584 ForceEmitWaitcnt[SAMPLE_CNT] = false;
585 ForceEmitWaitcnt[BVH_CNT] = false;
586 }
587
588 ForceEmitWaitcnt[VA_VDST] = false;
589 ForceEmitWaitcnt[VM_VSRC] = false;
590#endif // NDEBUG
591 }
592
593 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
594 // instruction.
595 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
596 switch (Inst.getOpcode()) {
597 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
598 case AMDGPU::GLOBAL_INV:
599 return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
600 // VGPRs
601 case AMDGPU::GLOBAL_WB:
602 case AMDGPU::GLOBAL_WBINV:
603 return VMEM_WRITE_ACCESS; // tracked using storecnt
604 default:
605 break;
606 }
607
608 // Maps VMEM access types to their corresponding WaitEventType.
609 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
610 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
611
613 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
614 // these should use VM_CNT.
615 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
616 return VMEM_ACCESS;
617 if (Inst.mayStore() &&
618 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
619 if (TII->mayAccessScratch(Inst))
620 return SCRATCH_WRITE_ACCESS;
621 return VMEM_WRITE_ACCESS;
622 }
623 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
624 return VMEM_ACCESS;
625 return VmemReadMapping[getVmemType(Inst)];
626 }
627
628 std::optional<WaitEventType>
629 getExpertSchedulingEventType(const MachineInstr &Inst) const;
630
631 bool isAsync(const MachineInstr &MI) const {
633 return false;
635 return true;
636 const MachineOperand *Async =
637 TII->getNamedOperand(MI, AMDGPU::OpName::IsAsync);
638 return Async && (Async->getImm());
639 }
640
641 bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
642 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
643 }
644
645 bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
646 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
647 }
648
649 bool isVmemAccess(const MachineInstr &MI) const;
650 bool generateWaitcntInstBefore(MachineInstr &MI,
651 WaitcntBrackets &ScoreBrackets,
652 MachineInstr *OldWaitcntInstr,
653 PreheaderFlushFlags FlushFlags);
654 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
656 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
657 MachineInstr *OldWaitcntInstr);
658 /// \returns all events that correspond to \p Inst.
659 WaitEventSet getEventsFor(const MachineInstr &Inst) const;
660 void updateEventWaitcntAfter(MachineInstr &Inst,
661 WaitcntBrackets *ScoreBrackets);
662 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
663 MachineBasicBlock *Block) const;
664 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
665 WaitcntBrackets &ScoreBrackets);
666 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
667 WaitcntBrackets &ScoreBrackets);
668 /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
669 /// Legalizer. Returns true if block was modified.
670 bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
671 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
672 bool ExpertMode) const;
673 const WaitEventSet &getWaitEvents(InstCounterType T) const {
674 return WCG->getWaitEvents(T);
675 }
676 InstCounterType getCounterFromEvent(WaitEventType E) const {
677 return WCG->getCounterFromEvent(E);
678 }
679};
680
// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,
// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
689class WaitcntBrackets {
690public:
691 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
692 assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
693 }
694
695#ifndef NDEBUG
696 ~WaitcntBrackets() {
697 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
698 for (auto &[ID, Val] : VMem) {
699 if (Val.empty())
700 ++NumUnusedVmem;
701 }
702 for (auto &[ID, Val] : SGPRs) {
703 if (Val.empty())
704 ++NumUnusedSGPRs;
705 }
706
707 if (NumUnusedVmem || NumUnusedSGPRs) {
708 errs() << "WaitcntBracket had unused entries at destruction time: "
709 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
710 << " SGPR unused entries\n";
711 std::abort();
712 }
713 }
714#endif
715
716 bool isSmemCounter(InstCounterType T) const {
717 return T == Context->SmemAccessCounter || T == X_CNT;
718 }
719
720 unsigned getSgprScoresIdx(InstCounterType T) const {
721 assert(isSmemCounter(T) && "Invalid SMEM counter");
722 return T == X_CNT ? 1 : 0;
723 }
724
725 unsigned getOutstanding(InstCounterType T) const {
726 return ScoreUBs[T] - ScoreLBs[T];
727 }
728
729 bool hasPendingVMEM(VMEMID ID, InstCounterType T) const {
730 return getVMemScore(ID, T) > getScoreLB(T);
731 }
732
733 /// \Return true if we have no score entries for counter \p T.
734 bool empty(InstCounterType T) const { return getScoreRange(T) == 0; }
735
736private:
737 unsigned getScoreLB(InstCounterType T) const {
739 return ScoreLBs[T];
740 }
741
742 unsigned getScoreUB(InstCounterType T) const {
744 return ScoreUBs[T];
745 }
746
747 unsigned getScoreRange(InstCounterType T) const {
748 return getScoreUB(T) - getScoreLB(T);
749 }
750
751 unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
752 auto It = SGPRs.find(RU);
753 return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;
754 }
755
756 unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
757 auto It = VMem.find(TID);
758 return It != VMem.end() ? It->second.Scores[T] : 0;
759 }
760
761public:
762 bool merge(const WaitcntBrackets &Other);
763
764 bool counterOutOfOrder(InstCounterType T) const;
765 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
766 simplifyWaitcnt(Wait, Wait);
767 }
768 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
769 AMDGPU::Waitcnt &UpdateWait) const;
770 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
771 void simplifyWaitcnt(Waitcnt &Wait, InstCounterType T) const;
772 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
773 AMDGPU::Waitcnt &UpdateWait) const;
774 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
775 AMDGPU::Waitcnt &UpdateWait) const;
776
777 void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
778 AMDGPU::Waitcnt &Wait) const;
779 void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
780 AMDGPU::Waitcnt &Wait) const;
781 AMDGPU::Waitcnt determineAsyncWait(unsigned N);
782 void tryClearSCCWriteEvent(MachineInstr *Inst);
783
784 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
785 void applyWaitcnt(InstCounterType T, unsigned Count);
786 void applyWaitcnt(const AMDGPU::Waitcnt &Wait, InstCounterType T);
787 void updateByEvent(WaitEventType E, MachineInstr &MI);
788 void recordAsyncMark(MachineInstr &MI);
789
790 bool hasPendingEvent() const { return !PendingEvents.empty(); }
791 bool hasPendingEvent(WaitEventType E) const {
792 return PendingEvents.contains(E);
793 }
794 bool hasPendingEvent(InstCounterType T) const {
795 bool HasPending = PendingEvents & Context->getWaitEvents(T);
796 assert(HasPending == !empty(T) &&
797 "Expected pending events iff scoreboard is not empty");
798 return HasPending;
799 }
800
801 bool hasMixedPendingEvents(InstCounterType T) const {
802 WaitEventSet Events = PendingEvents & Context->getWaitEvents(T);
803 // Return true if more than one bit is set in Events.
804 return Events.twoOrMore();
805 }
806
807 bool hasPendingFlat() const {
808 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
809 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
810 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
811 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
812 }
813
814 void setPendingFlat() {
815 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
816 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
817 }
818
819 bool hasPendingGDS() const {
820 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
821 }
822
823 unsigned getPendingGDSWait() const {
824 return std::min(getScoreUB(DS_CNT) - LastGDS,
825 getWaitCountMax(Context->getLimits(), DS_CNT) - 1);
826 }
827
828 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
829
830 // Return true if there might be pending writes to the vgpr-interval by VMEM
831 // instructions with types different from V.
832 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
833 for (MCRegUnit RU : regunits(Reg)) {
834 auto It = VMem.find(toVMEMID(RU));
835 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
836 return true;
837 }
838 return false;
839 }
840
841 void clearVgprVmemTypes(MCPhysReg Reg) {
842 for (MCRegUnit RU : regunits(Reg)) {
843 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
844 It->second.VMEMTypes = 0;
845 if (It->second.empty())
846 VMem.erase(It);
847 }
848 }
849 }
850
851 void setStateOnFunctionEntryOrReturn() {
852 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
853 getWaitCountMax(Context->getLimits(), STORE_CNT));
854 PendingEvents |= Context->getWaitEvents(STORE_CNT);
855 }
856
857 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
858 return LDSDMAStores;
859 }
860
861 bool hasPointSampleAccel(const MachineInstr &MI) const;
862 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
863 MCPhysReg RU) const;
864
865 void print(raw_ostream &) const;
866 void dump() const { print(dbgs()); }
867
868 // Free up memory by removing empty entries from the DenseMap that track event
869 // scores.
870 void purgeEmptyTrackingData();
871
872private:
873 struct MergeInfo {
874 unsigned OldLB;
875 unsigned OtherLB;
876 unsigned MyShift;
877 unsigned OtherShift;
878 };
879
880 using CounterValueArray = std::array<unsigned, NUM_INST_CNTS>;
881
882 void determineWaitForScore(InstCounterType T, unsigned Score,
883 AMDGPU::Waitcnt &Wait) const;
884
885 static bool mergeScore(const MergeInfo &M, unsigned &Score,
886 unsigned OtherScore);
887 bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
888 ArrayRef<CounterValueArray> OtherMarks);
889
891 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
892 if (!Context->TRI->isInAllocatableClass(Reg))
893 return {{}, {}};
894 const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
895 unsigned Size = Context->TRI->getRegSizeInBits(*RC);
896 if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
897 Reg = Context->TRI->get32BitRegister(Reg);
898 return Context->TRI->regunits(Reg);
899 }
900
  // Set the lower bound of counter T's score bracket.
  void setScoreLB(InstCounterType T, unsigned Val) {
    ScoreLBs[T] = Val;
  }
905
  // Set the upper bound of counter T's score bracket. For EXP_CNT, also pull
  // the lower bound up so the bracket never exceeds the hardware's maximum
  // encodable EXP_CNT wait.
  void setScoreUB(InstCounterType T, unsigned Val) {
    ScoreUBs[T] = Val;

    if (T != EXP_CNT)
      return;

    // Clamp the EXP_CNT range to what the hardware can represent.
    if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))
      ScoreLBs[EXP_CNT] =
          ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);
  }
917
918 void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
919 const SIRegisterInfo *TRI = Context->TRI;
920 if (Reg == AMDGPU::SCC) {
921 SCCScore = Val;
922 } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
923 for (MCRegUnit RU : regunits(Reg))
924 VMem[toVMEMID(RU)].Scores[T] = Val;
925 } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
926 auto STy = getSgprScoresIdx(T);
927 for (MCRegUnit RU : regunits(Reg))
928 SGPRs[RU].Scores[STy] = Val;
929 } else {
930 llvm_unreachable("Register cannot be tracked/unknown register!");
931 }
932 }
933
  // Record score Val for counter T directly on a VMEMID (a VGPR register
  // unit or an LDS DMA slot).
  void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
    VMem[TID].Scores[T] = Val;
  }
937
938 void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
939 unsigned Val);
940
941 const SIInsertWaitcnts *Context;
942
943 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
944 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
945 WaitEventSet PendingEvents;
946 // Remember the last flat memory operation.
947 unsigned LastFlat[NUM_INST_CNTS] = {0};
948 // Remember the last GDS operation.
949 unsigned LastGDS = 0;
950
951 // The score tracking logic is fragmented as follows:
952 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
953 // - SGPRs: SGPR RegUnits
954 // - SCC: Non-allocatable and not general purpose: not a SGPR.
955 //
956 // For the VMem case, if the key is within the range of LDS DMA IDs,
957 // then the corresponding index into the `LDSDMAStores` vector below is:
958 // Key - LDSDMA_BEGIN - 1
959 // This is because LDSDMA_BEGIN is a generic entry and does not have an
960 // associated MachineInstr.
961 //
962 // TODO: Could we track SCC alongside SGPRs so it's not longer a special case?
963
  // Per-entry tracking data for the VMem map (one entry per VGPR register
  // unit or LDS DMA slot).
  struct VMEMInfo {
    // Scores for all instruction counters. Zero-initialized.
    CounterValueArray Scores{};
    // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
    unsigned VMEMTypes = 0;

    // True when nothing is tracked for this entry, so it can be erased.
    bool empty() const { return all_of(Scores, equal_to(0)) && !VMEMTypes; }
  };
972
  // Per-register-unit tracking data for the SGPRs map.
  struct SGPRInfo {
    // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
    // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
    // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps
    // the X_CNT score.
    std::array<unsigned, 2> Scores = {0};

    // True when both tracked scores are zero, so the entry can be erased.
    bool empty() const { return !Scores[0] && !Scores[1]; }
  };
982
983 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
984 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
985
986 // Reg score for SCC.
987 unsigned SCCScore = 0;
988 // The unique instruction that has an SCC write pending, if there is one.
989 const MachineInstr *PendingSCCWrite = nullptr;
990
991 // Store representative LDS DMA operations. The only useful info here is
992 // alias info. One store is kept per unique AAInfo.
993 SmallVector<const MachineInstr *> LDSDMAStores;
994
995 // State of all counters at each async mark encountered so far.
997
998 // But in the rare pathological case, a nest of loops that pushes marks
999 // without waiting on any mark can cause AsyncMarks to grow very large. We cap
1000 // it to a reasonable limit. We can tune this later or potentially introduce a
1001 // user option to control the value.
1002 static constexpr unsigned MaxAsyncMarks = 16;
1003
1004 // Track the upper bound score for async operations that are not part of a
1005 // mark yet. Initialized to all zeros.
1006 CounterValueArray AsyncScore{};
1007};
1008
// Legacy pass-manager wrapper around the waitcnt insertion logic.
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
public:
  static char ID;
  SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Only inserts/merges wait instructions; the CFG is untouched.
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    // AA is optional: used to disambiguate LDS DMA stores when available.
    AU.addUsedIfAvailable<AAResultsWrapperPass>();
    AU.addPreserved<AAResultsWrapperPass>();
  }
};
1029
1030} // end anonymous namespace
1031
1032void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
1033 InstCounterType CntTy, unsigned Score) {
1034 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
1035}
1036
// Return true if the subtarget is one that enables Point Sample Acceleration
// and the MachineInstr passed in is one to which it might be applied (the
// hardware makes this decision based on several factors, but we can't determine
// this at compile time, so we have to assume it might be applied if the
// instruction supports it).
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
  if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
    return false;

  // Look up the MIMG base opcode info to query its PointSampleAccel flag.
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
  return BaseInfo->PointSampleAccel;
}
1051
1052// Return true if the subtarget enables Point Sample Acceleration, the supplied
1053// MachineInstr is one to which it might be applied and the supplied interval is
1054// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
1055// (this is the type that a point sample accelerated instruction effectively
1056// becomes)
1057bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
1058 MCPhysReg Reg) const {
1059 if (!hasPointSampleAccel(MI))
1060 return false;
1061
1062 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
1063}
1064
/// Advance the scoreboard for event \p E produced by \p Inst: bump the upper
/// bound of the corresponding counter and attach the new score to the
/// registers (and LDS DMA slots) the event makes in-flight.
void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = Context->getCounterFromEvent(E);
  assert(T < Context->MaxCounter);

  unsigned UB = getScoreUB(T);
  unsigned CurrScore = UB + 1;
  if (CurrScore == 0)
    report_fatal_error("InsertWaitcnt score wraparound");
  // PendingEvents and ScoreUB need to be updated regardless if this event
  // changes the score of a register or not.
  // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
  PendingEvents.insert(E);
  setScoreUB(T, CurrScore);

  const SIRegisterInfo *TRI = Context->TRI;
  const MachineRegisterInfo *MRI = Context->MRI;
  const SIInstrInfo *TII = Context->TII;

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
        setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);

      if (Inst.mayStore()) {
        if (const auto *Data0 =
                TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
          setScoreByOperand(*Data0, EXP_CNT, CurrScore);
        if (const auto *Data1 =
                TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
          setScoreByOperand(*Data1, EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (const MachineOperand &Op : Inst.all_uses()) {
          if (TRI->isVectorRegister(*MRI, Op.getReg()))
            setScoreByOperand(Op, EXP_CNT, CurrScore);
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore())
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      }
    } else if (TII->isLDSDIR(Inst)) {
      // LDSDIR instructions attach the score to the destination.
      setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
                        EXP_CNT, CurrScore);
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (MachineOperand &DefMO : Inst.all_defs()) {
          if (TRI->isVGPR(*MRI, DefMO.getReg())) {
            setScoreByOperand(DefMO, EXP_CNT, CurrScore);
          }
        }
      }
      for (const MachineOperand &Op : Inst.all_uses()) {
        if (TRI->isVectorRegister(*MRI, Op.getReg()))
          setScoreByOperand(Op, EXP_CNT, CurrScore);
      }
    }
  } else if (T == X_CNT) {
    WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
    if (PendingEvents.contains(OtherEvent)) {
      // Hardware inserts an implicit xcnt between interleaved
      // SMEM and VMEM operations. So there will never be
      // outstanding address translations for both SMEM and
      // VMEM at the same time.
      setScoreLB(T, getScoreUB(T) - 1);
      PendingEvents.remove(OtherEvent);
    }
    for (const MachineOperand &Op : Inst.all_uses())
      setScoreByOperand(Op, T, CurrScore);
  } else if (T == VA_VDST || T == VM_VSRC) {
    // Match the score to the VGPR destination or source registers as
    // appropriate
    for (const MachineOperand &Op : Inst.operands()) {
      if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
          (T == VM_VSRC && Op.isDef()))
        continue;
      if (TRI->isVectorRegister(*Context->MRI, Op.getReg()))
        setScoreByOperand(Op, T, CurrScore);
    }
  } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
    // Match the score to the destination registers.
    //
    // Check only explicit operands. Stores, especially spill stores, include
    // implicit uses and defs of their super registers which would create an
    // artificial dependency, while these are there only for register liveness
    // accounting purposes.
    //
    // Special cases where implicit register defs exists, such as M0 or VCC,
    // but none with memory instructions.
    for (const MachineOperand &Op : Inst.defs()) {
      if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
        if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper
          continue;
        if (updateVMCntOnly(Inst)) {
          // updateVMCntOnly should only leave us with VGPRs
          // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
          // defs. That's required for a sane index into `VgprMemTypes` below
          assert(TRI->isVectorRegister(*MRI, Op.getReg()));
          VmemType V = getVmemType(Inst);
          unsigned char TypesMask = 1 << V;
          // If instruction can have Point Sample Accel applied, we have to flag
          // this with another potential dependency
          if (hasPointSampleAccel(Inst))
            TypesMask |= 1 << VMEM_NOSAMPLER;
          for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
            VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
        }
      }
      setScoreByOperand(Op, T, CurrScore);
    }
    if (Inst.mayStore() &&
        (TII->isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
      // written can be accessed. A load from LDS to VMEM does not need a wait.
      //
      // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
      // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
      // store. The "Slot" is the index into LDSDMAStores + 1.
      unsigned Slot = 0;
      for (const auto *MemOp : Inst.memoperands()) {
        if (!MemOp->isStore() ||
            MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        // Comparing just AA info does not guarantee memoperands are equal
        // in general, but this is so for LDS DMA in practice.
        auto AAI = MemOp->getAAInfo();
        // Alias scope information gives a way to definitely identify an
        // original memory object and practically produced in the module LDS
        // lowering pass. If there is no scope available we will not be able
        // to disambiguate LDS aliasing as after the module lowering all LDS
        // is squashed into a single big object.
        if (!AAI || !AAI.Scope)
          break;
        // Reuse the slot of an earlier representative store with identical
        // AA info, if any.
        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
              Slot = I + 1;
              break;
            }
          }
        }
        if (Slot)
          break;
        // The slot may not be valid because it can be >= NUM_LDSDMA which
        // means the scoreboard cannot track it. We still want to preserve the
        // MI in order to check alias information, though.
        LDSDMAStores.push_back(&Inst);
        Slot = LDSDMAStores.size();
        break;
      }
      // Slot 0 is the generic "any LDS DMA" entry; a tracked slot gets its
      // own score as well.
      setVMemScore(LDSDMA_BEGIN, T, CurrScore);
      if (Slot && Slot < NUM_LDSDMA)
        setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
    }

    // FIXME: Not supported on GFX12 yet. Newer async operations use other
    // counters too, so will need a map from instruction or event types to
    // counter types.
    if (Context->isAsyncLdsDmaWrite(Inst) && T == LOAD_CNT) {
             "unexpected GFX1250 instruction");
      AsyncScore[T] = CurrScore;
    }

      setRegScore(AMDGPU::SCC, T, CurrScore);
      PendingSCCWrite = &Inst;
    }
  }
}
1267
1268void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1269 // In the absence of loops, AsyncMarks can grow linearly with the program
1270 // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a
1271 // limit every time we push a new mark, but that seems like unnecessary work
1272 // in practical cases. We do separately truncate the array when processing a
1273 // loop, which should be sufficient.
1274 AsyncMarks.push_back(AsyncScore);
1275 AsyncScore = {};
1276 LLVM_DEBUG({
1277 dbgs() << "recordAsyncMark:\n" << Inst;
1278 for (const auto &Mark : AsyncMarks) {
1279 llvm::interleaveComma(Mark, dbgs());
1280 dbgs() << '\n';
1281 }
1282 });
1283}
1284
/// Dump the current scoreboard state: per-counter score ranges with the
/// relative scores of all tracked register units / LDS DMA slots, the set of
/// pending events, and the async score/mark state.
void WaitcntBrackets::print(raw_ostream &OS) const {
  const GCNSubtarget *ST = Context->ST;

  for (auto T : inst_counter_types(Context->MaxCounter)) {
    unsigned SR = getScoreRange(T);
    switch (T) {
    case LOAD_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
         << SR << "):";
      break;
    case DS_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
         << SR << "):";
      break;
    case EXP_CNT:
      OS << " EXP_CNT(" << SR << "):";
      break;
    case STORE_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
         << SR << "):";
      break;
    case SAMPLE_CNT:
      OS << " SAMPLE_CNT(" << SR << "):";
      break;
    case BVH_CNT:
      OS << " BVH_CNT(" << SR << "):";
      break;
    case KM_CNT:
      OS << " KM_CNT(" << SR << "):";
      break;
    case X_CNT:
      OS << " X_CNT(" << SR << "):";
      break;
    case VA_VDST:
      OS << " VA_VDST(" << SR << "): ";
      break;
    case VM_VSRC:
      OS << " VM_VSRC(" << SR << "): ";
      break;
    default:
      OS << " UNKNOWN(" << SR << "):";
      break;
    }

    if (SR != 0) {
      // Print vgpr scores.
      unsigned LB = getScoreLB(T);

      // Sort IDs so the output is deterministic across DenseMap orderings.
      SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
      sort(SortedVMEMIDs);

      for (auto ID : SortedVMEMIDs) {
        unsigned RegScore = VMem.at(ID).Scores[T];
        if (RegScore <= LB)
          continue;
        unsigned RelScore = RegScore - LB - 1;
        if (ID < REGUNITS_END) {
          OS << ' ' << RelScore << ":vRU" << ID;
        } else {
          assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
                 "Unhandled/unexpected ID value!");
          OS << ' ' << RelScore << ":LDSDMA" << ID;
        }
      }

      // Also need to print sgpr scores for lgkm_cnt or xcnt.
      if (isSmemCounter(T)) {
        SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
        sort(SortedSMEMIDs);
        for (auto ID : SortedSMEMIDs) {
          unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)];
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
        }
      }

      if (T == KM_CNT && SCCScore > 0)
        OS << ' ' << SCCScore << ":scc";
    }
    OS << '\n';
  }

  OS << "Pending Events: ";
  if (hasPendingEvent()) {
    ListSeparator LS;
    for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
      if (hasPendingEvent((WaitEventType)I)) {
        OS << LS << WaitEventTypeName[I];
      }
    }
  } else {
    OS << "none";
  }
  OS << '\n';

  OS << "Async score: ";
  // NOTE(review): AsyncScore is a CounterValueArray (std::array), so
  // empty() reflects the array's fixed nonzero size and is always false here;
  // the "none" branch looks unreachable. Presumably an all-zero check was
  // intended — confirm.
  if (AsyncScore.empty())
    OS << "none";
  else
    llvm::interleaveComma(AsyncScore, OS);
  OS << '\n';

  OS << "Async marks: " << AsyncMarks.size() << '\n';

  for (const auto &Mark : AsyncMarks) {
    for (auto T : inst_counter_types()) {
      unsigned MarkedScore = Mark[T];
      switch (T) {
      case LOAD_CNT:
        OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM")
           << "_CNT: " << MarkedScore;
        break;
      case DS_CNT:
        OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM")
           << "_CNT: " << MarkedScore;
        break;
      case EXP_CNT:
        OS << " EXP_CNT: " << MarkedScore;
        break;
      case STORE_CNT:
        OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS")
           << "_CNT: " << MarkedScore;
        break;
      case SAMPLE_CNT:
        OS << " SAMPLE_CNT: " << MarkedScore;
        break;
      case BVH_CNT:
        OS << " BVH_CNT: " << MarkedScore;
        break;
      case KM_CNT:
        OS << " KM_CNT: " << MarkedScore;
        break;
      case X_CNT:
        OS << " X_CNT: " << MarkedScore;
        break;
      default:
        OS << " UNKNOWN: " << MarkedScore;
        break;
      }
    }
    OS << '\n';
  }
  OS << '\n';
}
1431
1432/// Simplify \p UpdateWait by removing waits that are redundant based on the
1433/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1434void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1435 AMDGPU::Waitcnt &UpdateWait) const {
1436 simplifyWaitcnt(UpdateWait, LOAD_CNT);
1437 simplifyWaitcnt(UpdateWait, EXP_CNT);
1438 simplifyWaitcnt(UpdateWait, DS_CNT);
1439 simplifyWaitcnt(UpdateWait, STORE_CNT);
1440 simplifyWaitcnt(UpdateWait, SAMPLE_CNT);
1441 simplifyWaitcnt(UpdateWait, BVH_CNT);
1442 simplifyWaitcnt(UpdateWait, KM_CNT);
1443 simplifyXcnt(CheckWait, UpdateWait);
1444 simplifyWaitcnt(UpdateWait, VA_VDST);
1445 simplifyVmVsrc(CheckWait, UpdateWait);
1446}
1447
1448void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1449 unsigned &Count) const {
1450 // The number of outstanding events for this type, T, can be calculated
1451 // as (UB - LB). If the current Count is greater than or equal to the number
1452 // of outstanding events, then the wait for this counter is redundant.
1453 if (Count >= getScoreRange(T))
1454 Count = ~0u;
1455}
1456
1457void WaitcntBrackets::simplifyWaitcnt(Waitcnt &Wait, InstCounterType T) const {
1458 unsigned Cnt = Wait.get(T);
1459 simplifyWaitcnt(T, Cnt);
1460 Wait.set(T, Cnt);
1461}
1462
1463void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1464 AMDGPU::Waitcnt &UpdateWait) const {
1465 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1466 // optimizations. On entry to a block with multiple predescessors, there may
1467 // be pending SMEM and VMEM events active at the same time.
1468 // In such cases, only clear one active event at a time.
1469 // TODO: Revisit xcnt optimizations for gfx1250.
1470 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1471 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1472 // zero.
1473 if (CheckWait.get(KM_CNT) == 0 && hasPendingEvent(SMEM_GROUP))
1474 UpdateWait.set(X_CNT, ~0u);
1475 // If we have pending store we cannot optimize XCnt because we do not wait for
1476 // stores. VMEM loads retun in order, so if we only have loads XCnt is
1477 // decremented to the same number as LOADCnt.
1478 if (CheckWait.get(LOAD_CNT) != ~0u && hasPendingEvent(VMEM_GROUP) &&
1479 !hasPendingEvent(STORE_CNT) &&
1480 CheckWait.get(X_CNT) >= CheckWait.get(LOAD_CNT))
1481 UpdateWait.set(X_CNT, ~0u);
1482 simplifyWaitcnt(UpdateWait, X_CNT);
1483}
1484
1485void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1486 AMDGPU::Waitcnt &UpdateWait) const {
1487 // Waiting for some counters implies waiting for VM_VSRC, since an
1488 // instruction that decrements a counter on completion would have
1489 // decremented VM_VSRC once its VGPR operands had been read.
1490 if (CheckWait.get(VM_VSRC) >=
1491 std::min({CheckWait.get(LOAD_CNT), CheckWait.get(STORE_CNT),
1492 CheckWait.get(SAMPLE_CNT), CheckWait.get(BVH_CNT),
1493 CheckWait.get(DS_CNT)}))
1494 UpdateWait.set(VM_VSRC, ~0u);
1495 simplifyWaitcnt(UpdateWait, VM_VSRC);
1496}
1497
1498void WaitcntBrackets::purgeEmptyTrackingData() {
1499 for (auto &[K, V] : make_early_inc_range(VMem)) {
1500 if (V.empty())
1501 VMem.erase(K);
1502 }
1503 for (auto &[K, V] : make_early_inc_range(SGPRs)) {
1504 if (V.empty())
1505 SGPRs.erase(K);
1506 }
1507}
1508
1509void WaitcntBrackets::determineWaitForScore(InstCounterType T,
1510 unsigned ScoreToWait,
1511 AMDGPU::Waitcnt &Wait) const {
1512 const unsigned LB = getScoreLB(T);
1513 const unsigned UB = getScoreUB(T);
1514
1515 // If the score falls within the bracket, we need a waitcnt.
1516 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1517 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1518 !Context->ST->hasFlatLgkmVMemCountInOrder()) {
1519 // If there is a pending FLAT operation, and this is a VMem or LGKM
1520 // waitcnt and the target can report early completion, then we need
1521 // to force a waitcnt 0.
1522 addWait(Wait, T, 0);
1523 } else if (counterOutOfOrder(T)) {
1524 // Counter can get decremented out-of-order when there
1525 // are multiple types event in the bracket. Also emit an s_wait counter
1526 // with a conservative value of 0 for the counter.
1527 addWait(Wait, T, 0);
1528 } else {
1529 // If a counter has been maxed out avoid overflow by waiting for
1530 // MAX(CounterType) - 1 instead.
1531 unsigned NeededWait = std::min(
1532 UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
1533 addWait(Wait, T, NeededWait);
1534 }
1535 }
1536}
1537
/// Compute the wait needed so that all but the newest \p N async marks have
/// retired, then drop the waited mark and every older one.
AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
  LLVM_DEBUG({
    dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
           << ":\n";
    for (const auto &Mark : AsyncMarks) {
      llvm::interleaveComma(Mark, dbgs());
      dbgs() << '\n';
    }
  });

  if (AsyncMarks.size() == MaxAsyncMarks) {
    // Enforcing MaxAsyncMarks here is unnecessary work because the size of
    // MaxAsyncMarks is linear when traversing straightline code. But we do
    // need to check if truncation may have occurred at a merge, and adjust N
    // to ensure that a wait is generated.
    LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
    N = std::min(N, (unsigned)MaxAsyncMarks - 1);
  }

  AMDGPU::Waitcnt Wait;
  if (AsyncMarks.size() <= N) {
    LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
    return Wait;
  }

  // Waiting for the (size - N - 1)-th mark retires it and everything older.
  size_t MarkIndex = AsyncMarks.size() - N - 1;
  const auto &RequiredMark = AsyncMarks[MarkIndex];
    determineWaitForScore(T, RequiredMark[T], Wait);

  // Immediately remove the waited mark and all older ones
  // This happens BEFORE the wait is actually inserted, which is fine
  // because we've already extracted the wait requirements
  LLVM_DEBUG({
    dbgs() << "Removing " << (MarkIndex + 1)
           << " async marks after determining wait\n";
  });
  AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);

  LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
  return Wait;
}
1580
1581void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
1582 AMDGPU::Waitcnt &Wait) const {
1583 if (Reg == AMDGPU::SCC) {
1584 determineWaitForScore(T, SCCScore, Wait);
1585 } else {
1586 bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg);
1587 for (MCRegUnit RU : regunits(Reg))
1588 determineWaitForScore(
1589 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1590 Wait);
1591 }
1592}
1593
// Add to \p Wait whatever is needed to retire the LDS DMA slot \p TID on
// counter \p T.
void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
                                             AMDGPU::Waitcnt &Wait) const {
  assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
  determineWaitForScore(T, getVMemScore(TID, T), Wait);
}
1599
1600void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1601 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1602 // SCC has landed
1603 if (PendingSCCWrite &&
1604 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1605 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1606 WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
1607 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1608 if ((PendingEvents & Context->getWaitEvents(KM_CNT)) ==
1609 SCC_WRITE_PendingEvent) {
1610 setScoreLB(KM_CNT, getScoreUB(KM_CNT));
1611 }
1612
1613 PendingEvents.remove(SCC_WRITE_PendingEvent);
1614 PendingSCCWrite = nullptr;
1615 }
1616}
1617
// Apply the combined wait to every counter.
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
    applyWaitcnt(Wait, T);
}
1622
1623void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1624 const unsigned UB = getScoreUB(T);
1625 if (Count >= UB)
1626 return;
1627 if (Count != 0) {
1628 if (counterOutOfOrder(T))
1629 return;
1630 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1631 } else {
1632 setScoreLB(T, UB);
1633 PendingEvents.remove(Context->getWaitEvents(T));
1634 }
1635
1636 if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1637 if (!hasMixedPendingEvents(X_CNT))
1638 applyWaitcnt(X_CNT, 0);
1639 else
1640 PendingEvents.remove(SMEM_GROUP);
1641 }
1642 if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1643 !hasPendingEvent(STORE_CNT)) {
1644 if (!hasMixedPendingEvents(X_CNT))
1645 applyWaitcnt(X_CNT, Count);
1646 else if (Count == 0)
1647 PendingEvents.remove(VMEM_GROUP);
1648 }
1649}
1650
1651void WaitcntBrackets::applyWaitcnt(const Waitcnt &Wait, InstCounterType T) {
1652 unsigned Cnt = Wait.get(T);
1653 applyWaitcnt(T, Cnt);
1654}
1655
1656// Where there are multiple types of event in the bracket of a counter,
1657// the decrement may go out of order.
1658bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1659 // Scalar memory read always can go out of order.
1660 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1661 (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1662 return true;
1663
1664 // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
1665 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
1666 // out-of-order completion.
1667 if (T == LOAD_CNT) {
1668 unsigned Events = hasPendingEvent(T);
1669 // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
1670 // events
1671 Events &= ~(1 << GLOBAL_INV_ACCESS);
1672 // Return true only if there are still multiple event types after removing
1673 // GLOBAL_INV
1674 return Events & (Events - 1);
1675 }
1676
1677 return hasMixedPendingEvents(T);
1678}
1679
1680INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1681 false, false)
1684INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1686
1687char SIInsertWaitcntsLegacy::ID = 0;
1688
1689char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1690
1692 return new SIInsertWaitcntsLegacy();
1693}
1694
1695static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1696 unsigned NewEnc) {
1697 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1698 assert(OpIdx >= 0);
1699
1700 MachineOperand &MO = MI.getOperand(OpIdx);
1701
1702 if (NewEnc == MO.getImm())
1703 return false;
1704
1705 MO.setImm(NewEnc);
1706 return true;
1707}
1708
1709/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1710/// and if so, which counter it is waiting on.
1711static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1712 switch (Opcode) {
1713 case AMDGPU::S_WAIT_LOADCNT:
1714 return LOAD_CNT;
1715 case AMDGPU::S_WAIT_EXPCNT:
1716 return EXP_CNT;
1717 case AMDGPU::S_WAIT_STORECNT:
1718 return STORE_CNT;
1719 case AMDGPU::S_WAIT_SAMPLECNT:
1720 return SAMPLE_CNT;
1721 case AMDGPU::S_WAIT_BVHCNT:
1722 return BVH_CNT;
1723 case AMDGPU::S_WAIT_DSCNT:
1724 return DS_CNT;
1725 case AMDGPU::S_WAIT_KMCNT:
1726 return KM_CNT;
1727 case AMDGPU::S_WAIT_XCNT:
1728 return X_CNT;
1729 default:
1730 return {};
1731 }
1732}
1733
1734bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1735 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1736 if (Opcode == Waitcnt->getOpcode())
1737 return false;
1738
1739 Waitcnt->setDesc(TII.get(Opcode));
1740 return true;
1741}
1742
1743/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1744/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1745/// from \p Wait that were added by previous passes. Currently this pass
1746/// conservatively assumes that these preexisting waits are required for
1747/// correctness.
///
/// \returns true if any pre-existing wait instruction was updated or erased.
1748bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1749 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1750 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1751 assert(isNormalMode(MaxCounter));
1752
1753 bool Modified = false;
// The first surviving instruction of each kind; the merged counts are
// written back into these after the scan below.
1754 MachineInstr *WaitcntInstr = nullptr;
1755 MachineInstr *WaitcntVsCntInstr = nullptr;
1756
1757 LLVM_DEBUG({
1758 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1759 if (It.isEnd())
1760 dbgs() << "end of block\n";
1761 else
1762 dbgs() << *It;
1763 });
1764
// Scan every instruction in [OldWaitcntInstr, It). Instructions may be
// erased during the walk, hence make_early_inc_range.
1765 for (auto &II :
1766 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1767 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1768 if (II.isMetaInstruction()) {
1769 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1770 continue;
1771 }
1772
1773 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
// Only soft waitcnts (where the non-soft opcode differs) may be
// simplified away, and only when not compiling at -O0.
1774 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1775
1776 // Update required wait count. If this is a soft waitcnt (= it was added
1777 // by an earlier pass), it may be entirely removed.
1778 if (Opcode == AMDGPU::S_WAITCNT) {
1779 unsigned IEnc = II.getOperand(0).getImm();
1780 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1781 if (TrySimplify)
1782 ScoreBrackets.simplifyWaitcnt(OldWait);
1783 Wait = Wait.combined(OldWait);
1784
1785 // Merge consecutive waitcnt of the same type by erasing multiples.
1786 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1787 II.eraseFromParent();
1788 Modified = true;
1789 } else
1790 WaitcntInstr = &II;
1791 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1792 assert(ST.hasVMemToLDSLoad());
1793 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1794 << "Before: " << Wait << '\n';);
1795 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
1796 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1797
1798 // It is possible (but unlikely) that this is the only wait instruction,
1799 // in which case, we exit this loop without a WaitcntInstr to consume
1800 // `Wait`. But that works because `Wait` was passed in by reference, and
1801 // the callee eventually calls createNewWaitcnt on it. We test this
1802 // possibility in an artificial MIR test since such a situation cannot be
1803 // recreated by running the memory legalizer.
1804 II.eraseFromParent();
1805 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1806 unsigned N = II.getOperand(0).getImm();
1807 LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
1808 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1809 Wait = Wait.combined(OldWait);
1810 } else {
1811 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1812 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1813
1814 unsigned OldVSCnt =
1815 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1816 if (TrySimplify)
1817 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1818 Wait.set(STORE_CNT, std::min(Wait.get(STORE_CNT), OldVSCnt));
1819
1820 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1821 II.eraseFromParent();
1822 Modified = true;
1823 } else
1824 WaitcntVsCntInstr = &II;
1825 }
1826 }
1827
// Write the merged LOAD/EXP/DS counts into the surviving S_WAITCNT and mark
// those counters as consumed (~0u) so createNewWaitcnt won't re-emit them.
1828 if (WaitcntInstr) {
1829 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1831 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1832
1833 ScoreBrackets.applyWaitcnt(Wait, LOAD_CNT);
1834 ScoreBrackets.applyWaitcnt(Wait, EXP_CNT);
1835 ScoreBrackets.applyWaitcnt(Wait, DS_CNT);
1836 Wait.set(LOAD_CNT, ~0u);
1837 Wait.set(EXP_CNT, ~0u);
1838 Wait.set(DS_CNT, ~0u);
1839
1840 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1841 << "New Instr at block end: "
1842 << *WaitcntInstr << '\n'
1843 : dbgs() << "applied pre-existing waitcnt\n"
1844 << "Old Instr: " << *It
1845 << "New Instr: " << *WaitcntInstr << '\n');
1846 }
1847
// Likewise fold the merged STORE_CNT into the surviving S_WAITCNT_VSCNT.
1848 if (WaitcntVsCntInstr) {
1850 *WaitcntVsCntInstr, AMDGPU::OpName::simm16, Wait.get(STORE_CNT));
1851 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1852
1853 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.get(STORE_CNT));
1854 Wait.set(STORE_CNT, ~0u);
1855
1856 LLVM_DEBUG(It.isEnd()
1857 ? dbgs() << "applied pre-existing waitcnt\n"
1858 << "New Instr at block end: " << *WaitcntVsCntInstr
1859 << '\n'
1860 : dbgs() << "applied pre-existing waitcnt\n"
1861 << "Old Instr: " << *It
1862 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1863 }
1864
1865 return Modified;
1866}
1867
1868/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1869/// required counters in \p Wait
///
/// New instructions are inserted before \p It.
/// \returns true if any instruction was emitted.
1870bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1871 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1872 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
1873 assert(isNormalMode(MaxCounter));
1874
1875 bool Modified = false;
1876 const DebugLoc &DL = Block.findDebugLoc(It);
1877
1878 // Helper to emit expanded waitcnt sequence for profiling.
1879 // Emits waitcnts from (Outstanding-1) down to Target.
1880 // The EmitWaitcnt callback emits a single waitcnt.
1881 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
1882 auto EmitWaitcnt) {
1883 do {
1884 EmitWaitcnt(--Outstanding);
1885 } while (Outstanding > Target);
1886 Modified = true;
1887 };
1888
1889 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1890 // single instruction while VScnt has its own instruction.
1891 if (Wait.hasWaitExceptStoreCnt()) {
1892 // If profiling expansion is enabled, emit an expanded sequence
1893 if (ExpandWaitcntProfiling) {
1894 // Check if any of the counters to be waited on are out-of-order.
1895 // If so, fall back to normal (non-expanded) behavior since expansion
1896 // would provide misleading profiling information.
1897 bool AnyOutOfOrder = false;
1898 for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
1899 unsigned WaitCnt = Wait.get(CT);
// ~0u means "no wait required" for this counter.
1900 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
1901 AnyOutOfOrder = true;
1902 break;
1903 }
1904 }
1905
1906 if (AnyOutOfOrder) {
1907 // Fall back to non-expanded wait
1908 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1909 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
1910 Modified = true;
1911 } else {
1912 // All counters are in-order, safe to expand
1913 for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
1914 unsigned WaitCnt = Wait.get(CT);
1915 if (WaitCnt == ~0u)
1916 continue;
1917
// Cap the starting point at the counter's encodable maximum so the
// countdown sequence stays representable.
1918 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
1919 getWaitCountMax(getLimits(), CT) - 1);
1920 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
1921 AMDGPU::Waitcnt W;
1922 W.set(CT, Count);
1923 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT))
1925 });
1926 }
1927 }
1928 } else {
1929 // Normal behavior: emit single combined waitcnt
1930 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1931 [[maybe_unused]] auto SWaitInst =
1932 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
1933 Modified = true;
1934
1935 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1936 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1937 dbgs() << "New Instr: " << *SWaitInst << '\n');
1938 }
1939 }
1940
// STORE_CNT (VScnt) is encoded in its own S_WAITCNT_VSCNT instruction.
1941 if (Wait.hasWaitStoreCnt()) {
1942 assert(ST.hasVscnt());
1943
1944 if (ExpandWaitcntProfiling && Wait.get(STORE_CNT) != ~0u &&
1945 !ScoreBrackets.counterOutOfOrder(STORE_CNT)) {
1946 // Only expand if counter is not out-of-order
1947 unsigned Outstanding =
1948 std::min(ScoreBrackets.getOutstanding(STORE_CNT),
1949 getWaitCountMax(getLimits(), STORE_CNT) - 1);
1950 EmitExpandedWaitcnt(
1951 Outstanding, Wait.get(STORE_CNT), [&](unsigned Count) {
1952 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1953 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1954 .addImm(Count);
1955 });
1956 } else {
1957 [[maybe_unused]] auto SWaitInst =
1958 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1959 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1960 .addImm(Wait.get(STORE_CNT));
1961 Modified = true;
1962
1963 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1964 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1965 dbgs() << "New Instr: " << *SWaitInst << '\n');
1966 }
1967 }
1968
1969 return Modified;
1970}
1971
1972AMDGPU::Waitcnt
1973WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1974 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u);
1975}
1976
1977AMDGPU::Waitcnt
1978WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1979 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
1980 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1981 ~0u /* XCNT */, ExpertVal, ExpertVal);
1982}
1983
1984/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1985/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1986/// were added by previous passes. Currently this pass conservatively
1987/// assumes that these preexisting waits are required for correctness.
///
/// \returns true if any pre-existing wait instruction was updated or erased.
1988bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1989 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1990 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1991 assert(!isNormalMode(MaxCounter));
1992
1993 bool Modified = false;
// First surviving instruction of each combined/special kind, plus one slot
// per extended counter for the single-counter S_WAIT_*CNT instructions.
1994 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1995 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1996 MachineInstr *WaitcntDepctrInstr = nullptr;
1997 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1998
1999 LLVM_DEBUG({
2000 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
2001 if (It.isEnd())
2002 dbgs() << "end of block\n";
2003 else
2004 dbgs() << *It;
2005 });
2006
2007 // Accumulate waits that should not be simplified.
2008 AMDGPU::Waitcnt RequiredWait;
2009
// Scan [OldWaitcntInstr, It); instructions may be erased during the walk.
2010 for (auto &II :
2011 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
2012 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
2013 if (II.isMetaInstruction()) {
2014 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
2015 continue;
2016 }
2017
2018 // Update required wait count. If this is a soft waitcnt (= it was added
2019 // by an earlier pass), it may be entirely removed.
2020
2021 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
2022 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
2023
2024 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
2025 // attempt to do more than that either.
2026 if (Opcode == AMDGPU::S_WAITCNT)
2027 continue;
2028
2029 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
2030 unsigned OldEnc =
2031 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2032 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
2033 if (TrySimplify)
2034 Wait = Wait.combined(OldWait);
2035 else
2036 RequiredWait = RequiredWait.combined(OldWait);
2037 // Keep the first wait_loadcnt, erase the rest.
2038 if (CombinedLoadDsCntInstr == nullptr) {
2039 CombinedLoadDsCntInstr = &II;
2040 } else {
2041 II.eraseFromParent();
2042 Modified = true;
2043 }
2044 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
2045 unsigned OldEnc =
2046 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2047 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
2048 if (TrySimplify)
2049 Wait = Wait.combined(OldWait);
2050 else
2051 RequiredWait = RequiredWait.combined(OldWait);
2052 // Keep the first wait_storecnt, erase the rest.
2053 if (CombinedStoreDsCntInstr == nullptr) {
2054 CombinedStoreDsCntInstr = &II;
2055 } else {
2056 II.eraseFromParent();
2057 Modified = true;
2058 }
2059 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
2060 unsigned OldEnc =
2061 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2062 AMDGPU::Waitcnt OldWait;
2065 if (TrySimplify)
2066 ScoreBrackets.simplifyWaitcnt(OldWait);
2067 Wait = Wait.combined(OldWait);
2068 if (WaitcntDepctrInstr == nullptr) {
2069 WaitcntDepctrInstr = &II;
2070 } else {
2071 // S_WAITCNT_DEPCTR requires special care. Don't remove a
2072 // duplicate if it is waiting on things other than VA_VDST or
2073 // VM_VSRC. If that is the case, just make sure the VA_VDST and
2074 // VM_VSRC subfields of the operand are set to the "no wait"
2075 // values.
2076
2077 unsigned Enc =
2078 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2079 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
2080 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
2081
2082 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2083 Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
2084 Modified |= promoteSoftWaitCnt(&II);
2085 } else {
2086 II.eraseFromParent();
2087 Modified = true;
2088 }
2089 }
2090 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
2091 // Architectures higher than GFX10 do not have direct loads to
2092 // LDS, so no work required here yet.
2093 II.eraseFromParent();
2094 Modified = true;
2095 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
2096 reportFatalUsageError("WAIT_ASYNCMARK is not ready for GFX12 yet");
2097 } else {
// All remaining opcodes must be single-counter S_WAIT_*CNT instructions.
2098 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
2099 assert(CT.has_value());
2100 unsigned OldCnt =
2101 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
2102 if (TrySimplify)
2103 addWait(Wait, CT.value(), OldCnt);
2104 else
2105 addWait(RequiredWait, CT.value(), OldCnt);
2106 // Keep the first wait of its kind, erase the rest.
2107 if (WaitInstrs[CT.value()] == nullptr) {
2108 WaitInstrs[CT.value()] = &II;
2109 } else {
2110 II.eraseFromParent();
2111 Modified = true;
2112 }
2113 }
2114 }
2115
// Simplify only the optional part of the wait; the RequiredWait portion is
// then folded back in unconditionally.
2116 ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
2117 Wait = Wait.combined(RequiredWait);
2118
2119 if (CombinedLoadDsCntInstr) {
2120 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
2121 // to be waited for. Otherwise, let the instruction be deleted so
2122 // the appropriate single counter wait instruction can be inserted
2123 // instead, when new S_WAIT_*CNT instructions are inserted by
2124 // createNewWaitcnt(). As a side effect, resetting the wait counts will
2125 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
2126 // the loop below that deals with single counter instructions.
2127 //
2128 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
2129 // instructions that have decremented LOAD_CNT or DS_CNT on completion
2130 // will have needed to wait for their register sources to be available
2131 // first.
2132 if (Wait.get(LOAD_CNT) != ~0u && Wait.get(DS_CNT) != ~0u) {
2133 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2134 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
2135 AMDGPU::OpName::simm16, NewEnc);
2136 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
2137 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.get(LOAD_CNT));
2138 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.get(DS_CNT));
2139 Wait.set(LOAD_CNT, ~0u);
2140 Wait.set(DS_CNT, ~0u);
2141
2142 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2143 << "New Instr at block end: "
2144 << *CombinedLoadDsCntInstr << '\n'
2145 : dbgs() << "applied pre-existing waitcnt\n"
2146 << "Old Instr: " << *It << "New Instr: "
2147 << *CombinedLoadDsCntInstr << '\n');
2148 } else {
2149 CombinedLoadDsCntInstr->eraseFromParent();
2150 Modified = true;
2151 }
2152 }
2153
2154 if (CombinedStoreDsCntInstr) {
2155 // Similarly for S_WAIT_STORECNT_DSCNT.
2156 if (Wait.get(STORE_CNT) != ~0u && Wait.get(DS_CNT) != ~0u) {
2157 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2158 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
2159 AMDGPU::OpName::simm16, NewEnc);
2160 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2161 ScoreBrackets.applyWaitcnt(Wait, STORE_CNT);
2162 ScoreBrackets.applyWaitcnt(Wait, DS_CNT);
2163 Wait.set(STORE_CNT, ~0u);
2164 Wait.set(DS_CNT, ~0u);
2165
2166 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2167 << "New Instr at block end: "
2168 << *CombinedStoreDsCntInstr << '\n'
2169 : dbgs() << "applied pre-existing waitcnt\n"
2170 << "Old Instr: " << *It << "New Instr: "
2171 << *CombinedStoreDsCntInstr << '\n');
2172 } else {
2173 CombinedStoreDsCntInstr->eraseFromParent();
2174 Modified = true;
2175 }
2176 }
2177
2178 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
2179 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
2180 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
2181 // instructions so that createNewWaitcnt() will create new combined
2182 // instructions to replace them.
2183
2184 if (Wait.get(DS_CNT) != ~0u) {
2185 // This is a vector of addresses in WaitInstrs pointing to instructions
2186 // that should be removed if they are present.
2188
2189 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
2190 // both) need to be waited for, ensure that there are no existing
2191 // individual wait count instructions for these.
2192
2193 if (Wait.get(LOAD_CNT) != ~0u) {
2194 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
2195 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
2196 } else if (Wait.get(STORE_CNT) != ~0u) {
2197 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
2198 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
2199 }
2200
2201 for (MachineInstr **WI : WaitsToErase) {
2202 if (!*WI)
2203 continue;
2204
2205 (*WI)->eraseFromParent();
2206 *WI = nullptr;
2207 Modified = true;
2208 }
2209 }
2210
// Update or erase each surviving single-counter wait instruction.
2212 if (!WaitInstrs[CT])
2213 continue;
2214
2215 unsigned NewCnt = Wait.get(CT);
2216 if (NewCnt != ~0u) {
2217 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
2218 AMDGPU::OpName::simm16, NewCnt);
2219 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2220
2221 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2222 setNoWait(Wait, CT);
2223
2224 LLVM_DEBUG(It.isEnd()
2225 ? dbgs() << "applied pre-existing waitcnt\n"
2226 << "New Instr at block end: " << *WaitInstrs[CT]
2227 << '\n'
2228 : dbgs() << "applied pre-existing waitcnt\n"
2229 << "Old Instr: " << *It
2230 << "New Instr: " << *WaitInstrs[CT] << '\n');
2231 } else {
2232 WaitInstrs[CT]->eraseFromParent();
2233 Modified = true;
2234 }
2235 }
2236
2237 if (WaitcntDepctrInstr) {
2238 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2239 // subfields with the new required values.
2240 unsigned Enc =
2241 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2242 ->getImm();
2245
2246 ScoreBrackets.applyWaitcnt(VA_VDST, Wait.get(VA_VDST));
2247 ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.get(VM_VSRC));
2248 Wait.set(VA_VDST, ~0u);
2249 Wait.set(VM_VSRC, ~0u);
2250
2251 // If that new encoded Depctr immediate would actually still wait
2252 // for anything, update the instruction's operand. Otherwise it can
2253 // just be deleted.
2254 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2255 Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
2256 AMDGPU::OpName::simm16, Enc);
2257 LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2258 << "New Instr at block end: "
2259 << *WaitcntDepctrInstr << '\n'
2260 : dbgs() << "applyPreexistingWaitcnt\n"
2261 << "Old Instr: " << *It << "New Instr: "
2262 << *WaitcntDepctrInstr << '\n');
2263 } else {
2264 WaitcntDepctrInstr->eraseFromParent();
2265 Modified = true;
2266 }
2267 }
2268
2269 return Modified;
2270}
2271
2272/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
///
/// New instructions are inserted before \p It.
/// \returns true if any instruction was emitted.
2273bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2274 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2275 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
2276 assert(!isNormalMode(MaxCounter));
2277
2278 bool Modified = false;
2279 const DebugLoc &DL = Block.findDebugLoc(It);
2280
2281 // Helper to emit expanded waitcnt sequence for profiling.
// Emits waits counting down from (Outstanding-1) to Target, one wait
// instruction per step, so a profiler can attribute stall time per event.
2282 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
2283 auto EmitWaitcnt) {
2284 for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I)
2285 EmitWaitcnt(I);
2286 EmitWaitcnt(Target);
2287 Modified = true;
2288 };
2289
2290 // For GFX12+, we use separate wait instructions, which makes expansion
2291 // simpler
2292 if (ExpandWaitcntProfiling) {
2294 unsigned Count = Wait.get(CT);
2295 if (Count == ~0u)
2296 continue;
2297
2298 // Skip expansion for out-of-order counters - emit normal wait instead
2299 if (ScoreBrackets.counterOutOfOrder(CT)) {
2300 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2301 .addImm(Count);
2302 Modified = true;
2303 continue;
2304 }
2305
// Cap the starting point at the counter's encodable maximum.
2306 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2307 getWaitCountMax(getLimits(), CT) - 1);
2308 EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
2309 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2310 .addImm(Val);
2311 });
2312 }
2313 return Modified;
2314 }
2315
2316 // Normal behavior (no expansion)
2317 // Check for opportunities to use combined wait instructions.
2318 if (Wait.get(DS_CNT) != ~0u) {
2319 MachineInstr *SWaitInst = nullptr;
2320
2321 if (Wait.get(LOAD_CNT) != ~0u) {
2322 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2323
2324 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2325 .addImm(Enc);
2326
// Mark both counters consumed so the single-counter loop below skips them.
2327 Wait.set(LOAD_CNT, ~0u);
2328 Wait.set(DS_CNT, ~0u);
2329 } else if (Wait.get(STORE_CNT) != ~0u) {
2330 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2331
2332 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_STORECNT_DSCNT))
2333 .addImm(Enc);
2334
2335 Wait.set(STORE_CNT, ~0u);
2336 Wait.set(DS_CNT, ~0u);
2337 }
2338
2339 if (SWaitInst) {
2340 Modified = true;
2341
2342 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2343 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2344 dbgs() << "New Instr: " << *SWaitInst << '\n');
2345 }
2346 }
2347
2348 // Generate an instruction for any remaining counter that needs
2349 // waiting for.
2350
2352 unsigned Count = Wait.get(CT);
2353 if (Count == ~0u)
2354 continue;
2355
2356 [[maybe_unused]] auto SWaitInst =
2357 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2358 .addImm(Count);
2359
2360 Modified = true;
2361
2362 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2363 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2364 dbgs() << "New Instr: " << *SWaitInst << '\n');
2365 }
2366
// Depctr-style waits (VA_VDST / VM_VSRC) only exist in expert scheduling
// mode and are emitted as a single S_WAITCNT_DEPCTR.
2367 if (Wait.hasWaitDepctr()) {
2368 assert(IsExpertMode);
2369 unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.get(VM_VSRC), ST);
2371
2372 [[maybe_unused]] auto SWaitInst =
2373 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
2374
2375 Modified = true;
2376
2377 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2378 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2379 dbgs() << "New Instr: " << *SWaitInst << '\n');
2380 }
2381
2382 return Modified;
2383}
2384
2385/// Generate s_waitcnt instruction to be placed before cur_Inst.
2386/// Instructions of a given type are returned in order,
2387/// but instructions of different types can complete out of order.
2388/// We rely on this in-order completion
2389/// and simply assign a score to the memory access instructions.
2390/// We keep track of the active "score bracket" to determine
2391/// if an access of a memory read requires an s_waitcnt
2392/// and if so what the value of each counter is.
2393/// The "score bracket" is bound by the lower bound and upper bound
2394/// scores (*_score_LB and *_score_ub respectively).
2395/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2396/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2397/// (GFX12+ only, where DS_CNT is a separate counter).
2398bool SIInsertWaitcnts::generateWaitcntInstBefore(
2399 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2400 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2401 LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
2402 setForceEmitWaitcnt();
2403
2404 assert(!MI.isMetaInstruction());
2405
2406 AMDGPU::Waitcnt Wait;
2407 const unsigned Opc = MI.getOpcode();
2408
2409 switch (Opc) {
2410 case AMDGPU::BUFFER_WBINVL1:
2411 case AMDGPU::BUFFER_WBINVL1_SC:
2412 case AMDGPU::BUFFER_WBINVL1_VOL:
2413 case AMDGPU::BUFFER_GL0_INV:
2414 case AMDGPU::BUFFER_GL1_INV: {
2415 // FIXME: This should have already been handled by the memory legalizer.
2416 // Removing this currently doesn't affect any lit tests, but we need to
2417 // verify that nothing was relying on this. The number of buffer invalidates
2418 // being handled here should not be expanded.
2419 Wait.set(LOAD_CNT, 0);
2420 break;
2421 }
2422 case AMDGPU::SI_RETURN_TO_EPILOG:
2423 case AMDGPU::SI_RETURN:
2424 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2425 case AMDGPU::S_SETPC_B64_return: {
2426 // All waits must be resolved at call return.
2427 // NOTE: this could be improved with knowledge of all call sites or
2428 // with knowledge of the called routines.
2429 ReturnInsts.insert(&MI);
2430 AMDGPU::Waitcnt AllZeroWait =
2431 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2432 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2433 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2434 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2435 // no need to wait for it at function boundaries.
2436 if (ST->hasExtendedWaitCounts() &&
2437 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2438 AllZeroWait.set(LOAD_CNT, ~0u);
2439 Wait = AllZeroWait;
2440 break;
2441 }
2442 case AMDGPU::S_ENDPGM:
2443 case AMDGPU::S_ENDPGM_SAVED: {
2444 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2445 // Technically the hardware will do this on its own if we don't, but that
2446 // might cost extra cycles compared to doing it explicitly.
2447 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2448 // have to wait for outstanding VMEM stores. In this case it can be useful
2449 // to send a message to explicitly release all VGPRs before the stores have
2450 // completed, but it is only safe to do this if there are no outstanding
2451 // scratch stores.
2452 EndPgmInsts[&MI] = !ScoreBrackets.empty(STORE_CNT) &&
2453 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
2454 break;
2455 }
2456 case AMDGPU::S_SENDMSG:
2457 case AMDGPU::S_SENDMSGHALT: {
2458 if (ST->hasLegacyGeometry() &&
2459 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2461 // Resolve vm waits before gs-done.
2462 Wait.set(LOAD_CNT, 0);
2463 break;
2464 }
2465 [[fallthrough]];
2466 }
2467 default: {
2468
2469 // Export & GDS instructions do not read the EXEC mask until after the
2470 // export is granted (which can occur well after the instruction is issued).
2471 // The shader program must flush all EXP operations on the export-count
2472 // before overwriting the EXEC mask.
2473 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
2474 // Export and GDS are tracked individually, either may trigger a waitcnt
2475 // for EXEC.
2476 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2477 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2478 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2479 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2480 Wait.set(EXP_CNT, 0);
2481 }
2482 }
2483
2484 // Wait for any pending GDS instruction to complete before any
2485 // "Always GDS" instruction.
2486 if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2487 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
2488
2489 if (MI.isCall()) {
2490 // The function is going to insert a wait on everything in its prolog.
2491 // This still needs to be careful if the call target is a load (e.g. a GOT
2492 // load). We also need to check WAW dependency with saved PC.
2493 CallInsts.insert(&MI);
2494 Wait = AMDGPU::Waitcnt();
2495
2496 const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI);
2497 if (CallAddrOp.isReg()) {
2498 ScoreBrackets.determineWaitForPhysReg(
2499 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);
2500
2501 if (const auto *RtnAddrOp =
2502 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
2503 ScoreBrackets.determineWaitForPhysReg(
2504 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
2505 }
2506 }
2507 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2508 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2509 } else {
2510 // FIXME: Should not be relying on memoperands.
2511 // Look at the source operands of every instruction to see if
2512 // any of them results from a previous memory operation that affects
2513 // its current usage. If so, an s_waitcnt instruction needs to be
2514 // emitted.
2515 // If the source operand was defined by a load, add the s_waitcnt
2516 // instruction.
2517 //
2518 // Two cases are handled for destination operands:
2519 // 1) If the destination operand was defined by a load, add the s_waitcnt
2520 // instruction to guarantee the right WAW order.
2521 // 2) If a destination operand that was used by a recent export/store ins,
2522 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2523
2524 for (const MachineMemOperand *Memop : MI.memoperands()) {
2525 const Value *Ptr = Memop->getValue();
2526 if (Memop->isStore()) {
2527 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2528 addWait(Wait, SmemAccessCounter, 0);
2529 if (PDT.dominates(MI.getParent(), It->second))
2530 SLoadAddresses.erase(It);
2531 }
2532 }
2533 unsigned AS = Memop->getAddrSpace();
2535 continue;
2536 // No need to wait before load from VMEM to LDS.
2537 if (TII->mayWriteLDSThroughDMA(MI))
2538 continue;
2539
2540 // LOAD_CNT is only relevant to vgpr or LDS.
2541 unsigned TID = LDSDMA_BEGIN;
2542 if (Ptr && Memop->getAAInfo()) {
2543 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2544 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2545 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2546 if ((I + 1) >= NUM_LDSDMA) {
2547 // We didn't have enough slot to track this LDS DMA store, it
2548 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2549 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2550 break;
2551 }
2552
2553 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
2554 }
2555 }
2556 } else {
2557 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2558 }
2559 if (Memop->isStore()) {
2560 ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
2561 }
2562 }
2563
2564 // Loop over use and def operands.
2565 for (const MachineOperand &Op : MI.operands()) {
2566 if (!Op.isReg())
2567 continue;
2568
2569 // If the instruction does not read tied source, skip the operand.
2570 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
2571 continue;
2572
2573 MCPhysReg Reg = Op.getReg().asMCReg();
2574
2575 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
2576 if (IsVGPR) {
2577 // Implicit VGPR defs and uses are never a part of the memory
2578 // instructions description and usually present to account for
2579 // super-register liveness.
2580 // TODO: Most of the other instructions also have implicit uses
2581 // for the liveness accounting only.
2582 if (Op.isImplicit() && MI.mayLoadOrStore())
2583 continue;
2584
2585 ScoreBrackets.determineWaitForPhysReg(VA_VDST, Reg, Wait);
2586 if (Op.isDef())
2587 ScoreBrackets.determineWaitForPhysReg(VM_VSRC, Reg, Wait);
2588 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2589 // previous write and this write are the same type of VMEM
2590 // instruction, in which case they are (in some architectures)
2591 // guaranteed to write their results in order anyway.
2592 // Additionally check instructions where Point Sample Acceleration
2593 // might be applied.
2594 if (Op.isUse() || !updateVMCntOnly(MI) ||
2595 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2596 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2597 !ST->hasVmemWriteVgprInOrder()) {
2598 ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
2599 ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
2600 ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
2601 ScoreBrackets.clearVgprVmemTypes(Reg);
2602 }
2603
2604 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2605 ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
2606 }
2607 ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
2608 } else if (Op.getReg() == AMDGPU::SCC) {
2609 ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
2610 } else {
2611 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
2612 }
2613
2614 if (ST->hasWaitXcnt() && Op.isDef())
2615 ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
2616 }
2617 }
2618 }
2619 }
2620
2621 // Ensure safety against exceptions from outstanding memory operations while
2622 // waiting for a barrier:
2623 //
2624 // * Some subtargets safely handle backing off the barrier in hardware
2625 // when an exception occurs.
2626 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2627 // there can be no outstanding memory operations during the wait.
2628 // * Subtargets with split barriers don't need to back off the barrier; it
2629 // is up to the trap handler to preserve the user barrier state correctly.
2630 //
2631 // In all other cases, ensure safety by ensuring that there are no outstanding
2632 // memory operations.
2633 if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
2634 !ST->hasBackOffBarrier()) {
2635 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2636 }
2637
2638 // TODO: Remove this work-around, enable the assert for Bug 457939
2639 // after fixing the scheduler. Also, the Shader Compiler code is
2640 // independent of target.
2641 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
2642 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2643 Wait.set(DS_CNT, 0);
2644 }
2645
2646 // Verify that the wait is actually needed.
2647 ScoreBrackets.simplifyWaitcnt(Wait);
2648
2649 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2650 // waits on VA_VDST if the instruction it would precede is not a VALU
2651 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2652 // expert scheduling mode.
2653 if (TII->isVALU(MI))
2654 Wait.set(VA_VDST, ~0u);
2655
2656 // Since the translation for VMEM addresses occur in-order, we can apply the
2657 // XCnt if the current instruction is of VMEM type and has a memory
2658 // dependency with another VMEM instruction in flight.
2659 if (Wait.get(X_CNT) != ~0u && isVmemAccess(MI)) {
2660 ScoreBrackets.applyWaitcnt(Wait, X_CNT);
2661 Wait.set(X_CNT, ~0u);
2662 }
2663
2664 // When forcing emit, we need to skip terminators because that would break the
2665 // terminators of the MBB if we emit a waitcnt between terminators.
2666 if (ForceEmitZeroFlag && !MI.isTerminator())
2667 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2668
2669 // If we force waitcnt then update Wait accordingly.
2671 if (!ForceEmitWaitcnt[T])
2672 continue;
2673 Wait.set(T, 0);
2674 }
2675
2676 if (FlushFlags.FlushVmCnt) {
2678 Wait.set(T, 0);
2679 }
2680
2681 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
2682 Wait.set(DS_CNT, 0);
2683
2684 if (ForceEmitZeroLoadFlag && Wait.get(LOAD_CNT) != ~0u)
2685 Wait.set(LOAD_CNT, 0);
2686
2687 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2688 OldWaitcntInstr);
2689}
2690
/// Materialize the required wait \p Wait before iterator \c It in \p Block,
/// reusing or tightening any pre-existing waitcnt instructions first.
///
/// \param Wait            counter thresholds that must be satisfied.
/// \param Block           block that receives the wait instruction(s).
/// \param ScoreBrackets   brackets updated to reflect the applied wait.
/// \param OldWaitcntInstr first pre-existing waitcnt at this position, or
///                        null if there is none.
/// \returns true if the IR was modified.
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  bool Modified = false;

  if (OldWaitcntInstr)
    // Try to merge the required wait with preexisting waitcnt instructions.
    // Also erase redundant waitcnt.
    Modified =
        WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);

  // ExpCnt can be merged into VINTERP.
  if (Wait.get(EXP_CNT) != ~0u && It != Block.instr_end() &&
    MachineOperand *WaitExp =
        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
    // Only tighten the instruction's existing waitexp; never loosen it.
    if (Wait.get(EXP_CNT) < WaitExp->getImm()) {
      WaitExp->setImm(Wait.get(EXP_CNT));
      Modified = true;
    }
    // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
    ScoreBrackets.applyWaitcnt(Wait, EXP_CNT);
    Wait.set(EXP_CNT, ~0u);

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                      << "Update Instr: " << *It);
  }

  if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
    Modified = true;

  // Any counts that could have been applied to any existing waitcnt
  // instructions will have been done so, now deal with any remaining.
  ScoreBrackets.applyWaitcnt(Wait);

  return Modified;
}
2730
/// Classify \p Inst for expert scheduling mode and return the wait event type
/// associated with it, or an empty optional when the instruction creates no
/// hazard that needs tracking.
std::optional<WaitEventType>
SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
  if (TII->isVALU(Inst)) {
    // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
    // out-of-order with respect to each other, so each of these classes
    // has its own event.

    if (TII->isXDL(Inst))
      return VGPR_XDL_WRITE;

    if (TII->isTRANS(Inst))
      return VGPR_TRANS_WRITE;

      return VGPR_DPMACC_WRITE;

    return VGPR_CSMACC_WRITE;
  }

  // FLAT and LDS instructions may read their VGPR sources out-of-order
  // with respect to each other and all other VMEM instructions, so
  // each of these also has a separate event.

  if (TII->isFLAT(Inst))
    return VGPR_FLAT_READ;

  if (TII->isDS(Inst))
    return VGPR_LDS_READ;

  if (TII->isVMEM(Inst) || TII->isVIMAGE(Inst) || TII->isVSAMPLE(Inst))
    return VGPR_VMEM_READ;

  // Otherwise, no hazard.

  return {};
}
2767
2768bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2769 return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
2770 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2771}
2772
2773// Return true if the next instruction is S_ENDPGM, following fallthrough
2774// blocks if necessary.
2775bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2776 MachineBasicBlock *Block) const {
2777 auto BlockEnd = Block->getParent()->end();
2778 auto BlockIter = Block->getIterator();
2779
2780 while (true) {
2781 if (It.isEnd()) {
2782 if (++BlockIter != BlockEnd) {
2783 It = BlockIter->instr_begin();
2784 continue;
2785 }
2786
2787 return false;
2788 }
2789
2790 if (!It->isMetaInstruction())
2791 break;
2792
2793 It++;
2794 }
2795
2796 assert(!It.isEnd());
2797
2798 return It->getOpcode() == AMDGPU::S_ENDPGM;
2799}
2800
// Add a wait after an instruction if architecture requirements mandate one.
// Returns true if the IR was modified.
bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
                                             MachineBasicBlock &Block,
                                             WaitcntBrackets &ScoreBrackets) {
  AMDGPU::Waitcnt Wait;
  bool NeedsEndPGMCheck = false;

  // Precise memory mode: memory operations must have completed before the
  // next instruction issues.
  if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
    Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&

  // GDS operations always require a full DS_CNT drain; they may additionally
  // need an S_NOP before a following S_ENDPGM (checked below).
  if (TII->isAlwaysGDS(Inst.getOpcode())) {
    Wait.set(DS_CNT, 0);
    NeedsEndPGMCheck = true;
  }

  // Drop any counts that the current bracket state already guarantees.
  ScoreBrackets.simplifyWaitcnt(Wait);

  auto SuccessorIt = std::next(Inst.getIterator());
  bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
                                /*OldWaitcntInstr=*/nullptr);

  // If a wait was emitted for a GDS operation and the program ends right
  // after it, pad with an S_NOP.
  if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
    BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
        .addImm(0);
  }

  return Result;
}
2830
/// Compute the set of wait events generated by \p Inst. The returned set is
/// used to advance the score brackets after the instruction is processed.
WaitEventSet SIInsertWaitcnts::getEventsFor(const MachineInstr &Inst) const {
  WaitEventSet Events;
  // In expert scheduling mode, VALU/FLAT/LDS/VMEM register accesses are
  // tracked with their own per-class events.
  if (IsExpertMode) {
    if (const auto ET = getExpertSchedulingEventType(Inst))
      Events.insert(*ET);
  }

  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      Events.insert(GDS_ACCESS);
      Events.insert(GDS_GPR_LOCK);
    } else {
      Events.insert(LDS_ACCESS);
    }
  } else if (TII->isFLAT(Inst)) {
      Events.insert(getVmemWaitEventType(Inst));
    } else {
      assert(Inst.mayLoadOrStore());
      if (TII->mayAccessVMEMThroughFlat(Inst)) {
        if (ST->hasWaitXcnt())
          Events.insert(VMEM_GROUP);
        Events.insert(getVmemWaitEventType(Inst));
      }
      if (TII->mayAccessLDSThroughFlat(Inst))
        Events.insert(LDS_ACCESS);
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) {
    // BUFFER_WBL2 is included here because, unlike invalidates, it has to be
    // followed by "S_WAITCNT vmcnt(0)" to ensure the writeback has completed.
    if (ST->hasWaitXcnt())
      Events.insert(VMEM_GROUP);
    Events.insert(getVmemWaitEventType(Inst));
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
      Events.insert(VMW_GPR_LOCK);
    }
  } else if (TII->isSMRD(Inst)) {
    if (ST->hasWaitXcnt())
      Events.insert(SMEM_GROUP);
    Events.insert(SMEM_ACCESS);
  } else if (SIInstrInfo::isLDSDIR(Inst)) {
    Events.insert(EXP_LDS_ACCESS);
  } else if (SIInstrInfo::isEXP(Inst)) {
    // Classify the export by its target operand.
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      Events.insert(EXP_PARAM_ACCESS);
      Events.insert(EXP_POS_ACCESS);
    else
      Events.insert(EXP_GPR_LOCK);
  } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
    Events.insert(SCC_WRITE);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      Events.insert(SQ_MESSAGE);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      Events.insert(SMEM_ACCESS);
      break;
    }
  }
  return Events;
}
2906
2907void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2908 WaitcntBrackets *ScoreBrackets) {
2909
2910 WaitEventSet InstEvents = getEventsFor(Inst);
2911 for (WaitEventType E : wait_events()) {
2912 if (InstEvents.contains(E))
2913 ScoreBrackets->updateByEvent(E, Inst);
2914 }
2915
2916 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2917 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2918 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2919 ScoreBrackets->setPendingGDS();
2920 }
2921 } else if (TII->isFLAT(Inst)) {
2922 if (Inst.mayLoadOrStore() && TII->mayAccessVMEMThroughFlat(Inst) &&
2923 TII->mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst))
2924 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2925 // pointers. They do have two operands that each access global and LDS,
2926 // thus making it appear at this point that they are using a flat pointer.
2927 // Filter them out, and for the rest, generate a dependency on flat
2928 // pointers so that both VM and LGKM counters are flushed.
2929 ScoreBrackets->setPendingFlat();
2930 } else if (Inst.isCall()) {
2931 // Act as a wait on everything
2932 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2933 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2934 } else if (TII->isVINTERP(Inst)) {
2935 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2936 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2937 }
2938}
2939
2940bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2941 unsigned OtherScore) {
2942 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2943 unsigned OtherShifted =
2944 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2945 Score = std::max(MyShifted, OtherShifted);
2946 return OtherShifted > MyShifted;
2947}
2948
2949bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
2950 ArrayRef<CounterValueArray> OtherMarks) {
2951 bool StrictDom = false;
2952
2953 LLVM_DEBUG(dbgs() << "Merging async marks ...");
2954 // Early exit: both empty
2955 if (AsyncMarks.empty() && OtherMarks.empty()) {
2956 LLVM_DEBUG(dbgs() << " nothing to merge\n");
2957 return false;
2958 }
2959 LLVM_DEBUG(dbgs() << '\n');
2960
2961 // Determine maximum length needed after merging
2962 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
2963 MaxSize = std::min(MaxSize, MaxAsyncMarks);
2964
2965 // Keep only the most recent marks within our limit.
2966 if (AsyncMarks.size() > MaxSize)
2967 AsyncMarks.erase(AsyncMarks.begin(),
2968 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
2969
2970 // Pad with zero-filled marks if our list is shorter. Zero represents "no
2971 // pending async operations at this checkpoint" and acts as the identity
2972 // element for max() during merging. We pad at the beginning since the marks
2973 // need to be aligned in most-recent order.
2974 constexpr CounterValueArray ZeroMark{};
2975 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
2976
2977 LLVM_DEBUG({
2978 dbgs() << "Before merge:\n";
2979 for (const auto &Mark : AsyncMarks) {
2980 llvm::interleaveComma(Mark, dbgs());
2981 dbgs() << '\n';
2982 }
2983 dbgs() << "Other marks:\n";
2984 for (const auto &Mark : OtherMarks) {
2985 llvm::interleaveComma(Mark, dbgs());
2986 dbgs() << '\n';
2987 }
2988 });
2989
2990 // Merge element-wise using the existing mergeScore function and the
2991 // appropriate MergeInfo for each counter type. Iterate only while we have
2992 // elements in both vectors.
2993 unsigned OtherSize = OtherMarks.size();
2994 unsigned OurSize = AsyncMarks.size();
2995 unsigned MergeCount = std::min(OtherSize, OurSize);
2996 for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
2997 for (auto T : inst_counter_types(Context->MaxCounter)) {
2998 StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
2999 OtherMarks[OtherSize - Idx][T]);
3000 }
3001 }
3002
3003 LLVM_DEBUG({
3004 dbgs() << "After merge:\n";
3005 for (const auto &Mark : AsyncMarks) {
3006 llvm::interleaveComma(Mark, dbgs());
3007 dbgs() << '\n';
3008 }
3009 });
3010
3011 return StrictDom;
3012}
3013
/// Merge the pending events and associated score brackets of \p Other into
/// this brackets status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  // Check if "other" has keys we don't have, and create default entries for
  // those. If they remain empty after merging, we will clean it up after.
  for (auto K : Other.VMem.keys())
    VMem.try_emplace(K);
  for (auto K : Other.SGPRs.keys())
    SGPRs.try_emplace(K);

  // Array to store MergeInfo for each counter type
  MergeInfo MergeInfos[NUM_INST_CNTS];

  for (auto T : inst_counter_types(Context->MaxCounter)) {
    // Merge event flags for this counter. If Other has pending events we do
    // not, the merged state is strictly stronger than ours was.
    const WaitEventSet &EventsForT = Context->getWaitEvents(T);
    const WaitEventSet OldEvents = PendingEvents & EventsForT;
    const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
    if (!OldEvents.contains(OtherEvents))
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter: keep our lower bound and extend the
    // upper bound far enough to hold whichever side has more pending ops.
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error("waitcnt score overflow");

    MergeInfo &M = MergeInfos[T];
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

    if (T == DS_CNT)
      StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);

    if (T == KM_CNT) {
      StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
      if (Other.hasPendingEvent(SCC_WRITE)) {
        if (!OldEvents.contains(SCC_WRITE)) {
          PendingSCCWrite = Other.PendingSCCWrite;
        } else if (PendingSCCWrite != Other.PendingSCCWrite) {
          // Both sides have a pending SCC write but disagree on the defining
          // instruction; record it as unknown.
          PendingSCCWrite = nullptr;
        }
      }
    }

    for (auto &[RegID, Info] : VMem)
      StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));

    if (isSmemCounter(T)) {
      unsigned Idx = getSgprScoresIdx(T);
      for (auto &[RegID, Info] : SGPRs) {
        // Entries missing on the other side merge as score zero.
        auto It = Other.SGPRs.find(RegID);
        unsigned OtherScore =
            (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
        StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore);
      }
    }
  }

  // Union the recorded VMEM access types per tracked register.
  for (auto &[TID, Info] : VMem) {
    if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
      unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
      StrictDom |= NewVmemTypes != Info.VMEMTypes;
      Info.VMEMTypes = NewVmemTypes;
    }
  }

  StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks);
  for (auto T : inst_counter_types(Context->MaxCounter))
    StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]);

  // Drop tracking entries that ended up with no pending information.
  purgeEmptyTrackingData();
  return StrictDom;
}
3101
3102static bool isWaitInstr(MachineInstr &Inst) {
3103 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
3104 return Opcode == AMDGPU::S_WAITCNT ||
3105 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
3106 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
3107 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
3108 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
3109 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
3110 Opcode == AMDGPU::WAIT_ASYNCMARK ||
3111 counterTypeForInstr(Opcode).has_value();
3112}
3113
/// Emit an S_SETREG_IMM32_B32 that switches the hardware scheduling mode
/// (2 selects expert scheduling mode, 0 restores the default).
void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
                                         bool ExpertMode) const {
  const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32))
      .addImm(ExpertMode ? 2 : 0)
      .addImm(EncodedReg);
}
3123
namespace {
// TODO: Remove this work-around after fixing the scheduler.
// There are two reasons why vccz might be incorrect; see ST->hasReadVCCZBug()
// and ST->partialVCCWritesUpdateVCCZ().
// i.  VCCZBug: There is a hardware bug on CI/SI where SMRD instruction may
//     corrupt vccz bit, so when we detect that an instruction may read from
//     a corrupt vccz bit, we need to:
//     1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
//        operations to complete.
//     2. Recompute the correct value of vccz by writing the current value
//        of vcc back to vcc.
// ii. Partial writes to vcc don't update vccz, so we need to recompute the
//     correct value of vccz by reading vcc and writing it back to vcc.
//     No waitcnt is needed in this case.
class VCCZWorkaround {
  const WaitcntBrackets &ScoreBrackets;
  const GCNSubtarget &ST;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  // True when the subtarget suffers from reason (i) above.
  bool VCCZCorruptionBug = false;
  // True when the subtarget suffers from reason (ii) above.
  bool VCCZNotUpdatedByPartialWrites = false;
  /// vccz could be incorrect at a basic block boundary if a predecessor wrote
  /// to vcc and then issued an smem load, so initialize to true.
  bool MustRecomputeVCCZ = true;

public:
  VCCZWorkaround(const WaitcntBrackets &ScoreBrackets, const GCNSubtarget &ST,
                 const SIInstrInfo &TII, const SIRegisterInfo &TRI)
      : ScoreBrackets(ScoreBrackets), ST(ST), TII(TII), TRI(TRI) {
    VCCZCorruptionBug = ST.hasReadVCCZBug();
    VCCZNotUpdatedByPartialWrites = !ST.partialVCCWritesUpdateVCCZ();
  }
  /// If \p MI reads vccz and we must recompute it based on MustRecomputeVCCZ,
  /// then emit a vccz recompute instruction before \p MI. This needs to be
  /// called on every instruction in the basic block because it also tracks the
  /// state and updates MustRecomputeVCCZ accordingly. Returns true if it
  /// modified the IR.
  bool tryRecomputeVCCZ(MachineInstr &MI) {
    // No need to run this if neither bug is present.
    if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
      return false;

    // If MI is an SMEM and it can corrupt vccz on this target, then we need
    // both to emit a waitcnt and to recompute vccz.
    // But we don't actually emit a waitcnt here. This is done in
    // generateWaitcntInstBefore() because it tracks all the necessary waitcnt
    // state, and can either skip emitting a waitcnt if there is already one in
    // the IR, or emit an "optimized" combined waitcnt.
    // If this is an smem read, it could complete and clobber vccz at any time.
    MustRecomputeVCCZ |= VCCZCorruptionBug && TII.isSMRD(MI);

    // If the target partial vcc writes don't update vccz, and MI is such an
    // instruction then we must recompute vccz.
    // Note: We are using PartiallyWritesToVCCOpt optional to avoid calling
    // `definesRegister()` more than needed, because it's not very cheap.
    std::optional<bool> PartiallyWritesToVCCOpt;
    auto PartiallyWritesToVCC = [](MachineInstr &MI) {
      return MI.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
             MI.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr);
    };
    if (VCCZNotUpdatedByPartialWrites) {
      PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
      // If this is a partial VCC write but won't update vccz, then we must
      // recompute vccz.
      MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
    }

    // If MI is a vcc write with no pending smem, or there is a pending smem
    // but the target does not suffer from the vccz corruption bug, then we
    // don't need to recompute vccz as this write will recompute it anyway.
    if (!ScoreBrackets.hasPendingEvent(SMEM_ACCESS) || !VCCZCorruptionBug) {
      // Compute PartiallyWritesToVCCOpt if we haven't done so already.
      if (!PartiallyWritesToVCCOpt)
        PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
      bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
                              MI.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr);
      // If we write to the full vcc or we write partially and the target
      // updates vccz on partial writes, then vccz will be updated correctly.
      bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
                                              *PartiallyWritesToVCCOpt);
      if (UpdatesVCCZ)
        MustRecomputeVCCZ = false;
    }

    // If MI is a branch that reads VCCZ then emit a waitcnt and a vccz
    // restore instruction if either is needed.
    if (SIInstrInfo::isCBranchVCCZRead(MI) && MustRecomputeVCCZ) {
      // Recompute the vccz bit. Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of vcc
      // and then writing it back to the register.
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII.get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI.getVCC())
          .addReg(TRI.getVCC());
      MustRecomputeVCCZ = false;
      return true;
    }
    return false;
  }
};

} // namespace
3226
// Generate s_waitcnt instructions where needed.
// Walks every instruction in \p Block, inserting/merging waits before
// instructions that depend on outstanding operations and updating
// \p ScoreBrackets as each instruction's events are recorded.
// Returns true if the IR was modified.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Begin Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });
  VCCZWorkaround VCCZW(ScoreBrackets, *ST, *TII, *TRI);

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  // NOTE: We may append instrs after Inst while iterating.
  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E; ++Iter) {
    MachineInstr &Inst = *Iter;
    if (Inst.isMetaInstruction())
      continue;
    // Track pre-existing waitcnts that were added in earlier iterations or by
    // the memory legalizer.
    if (isWaitInstr(Inst) ||
        (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      continue;
    }

    PreheaderFlushFlags FlushFlags;
    if (Block.getFirstTerminator() == Inst)
      FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushFlags);
    // The tracked pre-existing waitcnts have now been consumed.
    OldWaitcntInstr = nullptr;

    if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
      // FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
      //
      // Asyncmarks record the current wait state and so should not allow
      // waitcnts that occur after them to be merged into waitcnts that occur
      // before.
      assert(ST->getGeneration() < AMDGPUSubtarget::GFX12);
      ScoreBrackets.recordAsyncMark(Inst);
      continue;
    }

    if (TII->isSMRD(Inst)) {
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
        }
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

    // Note: insertForcedWaitAfter() may add instrs after Iter that need to be
    // visited by the loop.
    Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    // If the target suffers from the vccz bugs, this may emit the necessary
    // vccz recompute instruction before \p Inst if needed.
    Modified |= VCCZW.tryRecomputeVCCZ(Inst);
  }

  // Flush counters at the end of the block if needed (for preheaders with no
  // terminator).
  AMDGPU::Waitcnt Wait;
  if (Block.getFirstTerminator() == Block.end()) {
    PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
    if (FlushFlags.FlushVmCnt) {
      if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
        Wait.set(LOAD_CNT, 0);
      if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
        Wait.set(SAMPLE_CNT, 0);
      if (ScoreBrackets.hasPendingEvent(BVH_CNT))
        Wait.set(BVH_CNT, 0);
    }
    if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
      Wait.set(DS_CNT, 0);
  }

  // Combine or remove any redundant waitcnts at the end of the block.
  Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                              OldWaitcntInstr);

  LLVM_DEBUG({
    dbgs() << "*** End Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });

  return Modified;
}
3335
3336bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
3337 if (Block.size() <= 1)
3338 return false;
3339 // The Memory Legalizer conservatively inserts a soft xcnt before each
3340 // atomic RMW operation. However, for sequences of back-to-back atomic
3341 // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
3342 // the redundant soft xcnts.
3343 bool Modified = false;
3344 // Remember the last atomic with a soft xcnt right before it.
3345 MachineInstr *LastAtomicWithSoftXcnt = nullptr;
3346
3347 for (MachineInstr &MI : drop_begin(Block)) {
3348 // Ignore last atomic if non-LDS VMEM and SMEM.
3349 bool IsLDS =
3350 TII->isDS(MI) || (TII->isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI));
3351 if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
3352 LastAtomicWithSoftXcnt = nullptr;
3353
3354 bool IsAtomicRMW = (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) &&
3355 MI.mayLoad() && MI.mayStore();
3356 MachineInstr &PrevMI = *MI.getPrevNode();
3357 // This is an atomic with a soft xcnt.
3358 if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3359 // If we have already found an atomic with a soft xcnt, remove this soft
3360 // xcnt as it's redundant.
3361 if (LastAtomicWithSoftXcnt) {
3362 PrevMI.eraseFromParent();
3363 Modified = true;
3364 }
3365 LastAtomicWithSoftXcnt = &MI;
3366 }
3367 }
3368 return Modified;
3369}
3370
3371// Return flags indicating which counters should be flushed in the preheader.
3372PreheaderFlushFlags
3373SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3374 const WaitcntBrackets &ScoreBrackets) {
3375 auto [Iterator, IsInserted] =
3376 PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
3377 if (!IsInserted)
3378 return Iterator->second;
3379
3380 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3381 if (!Succ)
3382 return PreheaderFlushFlags();
3383
3384 MachineLoop *Loop = MLI.getLoopFor(Succ);
3385 if (!Loop)
3386 return PreheaderFlushFlags();
3387
3388 if (Loop->getLoopPreheader() == &MBB) {
3389 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3390 return Iterator->second;
3391 }
3392
3393 return PreheaderFlushFlags();
3394}
3395
// Return true if \p MI accesses VMEM, treating FLAT instructions as VMEM
// accesses when they may address VMEM.
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
    return TII->mayAccessVMEMThroughFlat(MI);
  return SIInstrInfo::isVMEM(MI);
}
3401
3402bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3403 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3404}
3405
3406// Check if instruction is a store to LDS that is counted via DSCNT
3407// (where that counter exists).
3408bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3409 return MI.mayStore() && SIInstrInfo::isDS(MI);
3410}
3411
3412// Return flags indicating which counters should be flushed in the preheader of
3413// the given loop. We currently decide to flush in the following situations:
3414// For VMEM (FlushVmCnt):
3415// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3416// vgpr containing a value that is loaded outside of the loop. (Only on
3417// targets with no vscnt counter).
3418// 2. The loop contains vmem load(s), but the loaded values are not used in the
3419// loop, and at least one use of a vgpr containing a value that is loaded
3420// outside of the loop.
3421// For DS (FlushDsCnt, GFX12+ only):
3422// 3. The loop contains no DS reads, and at least one use of a vgpr containing
3423// a value that is DS read outside of the loop.
3424// 4. The loop contains DS read(s), loaded values are not used in the same
3425// iteration but in the next iteration (prefetch pattern), and at least one
3426// use of a vgpr containing a value that is DS read outside of the loop.
3427// Flushing in preheader reduces wait overhead if the wait requirement in
3428// iteration 1 would otherwise be more strict (but unfortunately preheader
3429// flush decision is taken before knowing that).
3430// 5. (Single-block loops only) The loop has DS prefetch reads with flush point
3431// tracking. Some DS reads may be used in the same iteration (creating
3432// "flush points"), but others remain unflushed at the backedge. When a DS
3433// read is consumed in the same iteration, it and all prior reads are
3434// "flushed" (FIFO order). No DS writes are allowed in the loop.
3435// TODO: Find a way to extend to multi-block loops.
PreheaderFlushFlags
SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
                                         const WaitcntBrackets &Brackets) {
  PreheaderFlushFlags Flags;
  // Whether the loop body itself issues VMEM loads / stores.
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  // Whether the loop consumes a value whose producing VMEM load / DS read is
  // still pending on loop entry (i.e. issued outside the loop).
  bool UsesVgprVMEMLoadedOutside = false;
  bool UsesVgprDSReadOutside = false;
  // Set once a VGPR both defined by a VMEM load inside the loop and used
  // inside the loop is found; that kills the VMEM preheader-flush idea.
  bool VMemInvalidated = false;
  // DS optimization only applies to GFX12+ where DS_CNT is separate.
  // Tracking status for "no DS read in loop" or "pure DS prefetch
  // (use only in next iteration)".
  bool TrackSimpleDSOpt = ST->hasExtendedWaitCounts();
  DenseSet<MCRegUnit> VgprUse;     // VGPR units used anywhere in the loop.
  DenseSet<MCRegUnit> VgprDefVMEM; // VGPR units defined by in-loop VMEM loads.
  DenseSet<MCRegUnit> VgprDefDS;   // VGPR units defined by in-loop DS reads.

  // Track DS reads for prefetch pattern with flush points (single-block only).
  // Keeps track of the last DS read (position counted from the top of the loop)
  // to each VGPR. Read is considered consumed (and thus needs flushing) if
  // the dest register has a use or is overwritten (by any later operations).
  DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
  // 1-based ordinal of the most recently seen DS read in the block.
  unsigned DSReadPosition = 0;
  bool IsSingleBlock = ML->getNumBlocks() == 1;
  bool TrackDSFlushPoint = ST->hasExtendedWaitCounts() && IsSingleBlock;
  // Ordinal of the latest DS read that must complete inside the loop. Since
  // DS reads complete in FIFO order, all earlier reads are flushed with it.
  unsigned LastDSFlushPosition = 0;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        HasVMemLoad |= MI.mayLoad();
        HasVMemStore |= MI.mayStore();
      }
      // TODO: Can we relax DSStore check? There may be cases where
      // these DS stores are drained prior to the end of MBB (or loop).
      if (mayStoreIncrementingDSCNT(MI)) {
        // Early exit if none of the optimizations are feasible.
        // Otherwise, set tracking status appropriately and continue.
        if (VMemInvalidated)
          return Flags;
        TrackSimpleDSOpt = false;
        TrackDSFlushPoint = false;
      }
      bool IsDSRead = isDSRead(MI);
      if (IsDSRead)
        ++DSReadPosition;

      // Helper: if RU has a pending DS read, update LastDSFlushPosition
      auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
        if (!TrackDSFlushPoint)
          return;
        if (auto It = LastDSReadPositionMap.find(RU);
            It != LastDSReadPositionMap.end()) {
          // RU defined by DSRead is used or overwritten. Need to complete
          // the read, if not already implied by a later DSRead (to any RU)
          // needing to complete in FIFO order.
          LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
        }
      };

      for (const MachineOperand &Op : MI.all_uses()) {
        if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
          continue;
        // Vgpr use
        for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
          // If we find a register that is loaded inside the loop, 1. and 2.
          // are invalidated.
          if (VgprDefVMEM.contains(RU))
            VMemInvalidated = true;

          // Check for DS reads used inside the loop
          if (VgprDefDS.contains(RU))
            TrackSimpleDSOpt = false;

          // Early exit if all optimizations are invalidated
          if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
            return Flags;

          // Check for flush points (DS read used in same iteration)
          updateDSReadFlushTracking(RU);

          VgprUse.insert(RU);
          // Check if this register has a pending VMEM load from outside the
          // loop (value loaded outside and used inside).
          VMEMID ID = toVMEMID(RU);
          if (Brackets.hasPendingVMEM(ID, LOAD_CNT) ||
              Brackets.hasPendingVMEM(ID, SAMPLE_CNT) ||
              Brackets.hasPendingVMEM(ID, BVH_CNT))
            UsesVgprVMEMLoadedOutside = true;
          // Check if loaded outside the loop via DS (not VMEM/FLAT).
          // Only consider it a DS read if there's no pending VMEM load for
          // this register, since FLAT can set both counters.
          else if (Brackets.hasPendingVMEM(ID, DS_CNT))
            UsesVgprDSReadOutside = true;
        }
      }

      // VMem load vgpr def
      if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
        for (const MachineOperand &Op : MI.all_defs()) {
          for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated.
            if (VgprUse.contains(RU))
              VMemInvalidated = true;
            VgprDefVMEM.insert(RU);
          }
        }
        // Early exit if all optimizations are invalidated
        if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
          return Flags;
      }

      // DS read vgpr def
      // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RegNo).
      // If USE comes before DEF, it's the prefetch pattern (use value from
      // previous iteration, read for next iteration). We should still flush
      // in preheader so iteration 1 doesn't need to wait inside the loop.
      // Only invalidate when DEF comes before USE (same-iteration consumption,
      // checked above when processing uses).
      if (IsDSRead || TrackDSFlushPoint) {
        for (const MachineOperand &Op : MI.all_defs()) {
          if (!TRI->isVectorRegister(*MRI, Op.getReg()))
            continue;
          for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
            // Check for overwrite of pending DS read (flush point) by any
            // instruction
            updateDSReadFlushTracking(RU);
            if (IsDSRead) {
              VgprDefDS.insert(RU);
              if (TrackDSFlushPoint)
                LastDSReadPositionMap[RU] = DSReadPosition;
            }
          }
        }
      }
    }
  }

  // VMEM flush decision
  if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
      ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
       (HasVMemLoad && ST->hasVmemWriteVgprInOrder())))
    Flags.FlushVmCnt = true;

  // DS flush decision:
  // Simple DS Opt: flush if loop uses DS read values from outside
  // and either has no DS reads in the loop, or DS reads whose results
  // are not used in the loop.
  bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
  // Prefetch with flush points: some DS reads used in same iteration,
  // but unflushed reads remain at backedge
  bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
  bool DSFlushPointPrefetch =
      TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;

  if (SimpleDSOpt || DSFlushPointPrefetch)
    Flags.FlushDsCnt = true;

  return Flags;
}
3597
3598bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3599 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3600 auto &PDT =
3601 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3602 AliasAnalysis *AA = nullptr;
3603 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3604 AA = &AAR->getAAResults();
3605
3606 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3607}
3608
PreservedAnalyses
// NOTE(review): the rendered listing this file was extracted from dropped
// several physical lines of this definition — the remainder of the signature
// (presumably "...::run(MachineFunction &MF,
// MachineFunctionAnalysisManager &MFAM)"), the declaration of 'AA' obtained
// through the function-analysis-manager proxy, and the expression that the
// trailing ".preserve<AAManager>()" chains onto. Restore from the upstream
// source before relying on this text.
  auto &MLI = MFAM.getResult<MachineLoopAnalysis>(MF);
  auto &PDT = MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
                 .getManager()
                 .getCachedResult<AAManager>(MF.getFunction());

  // Run the shared implementation; if nothing was modified, every analysis
  // remains valid.
  if (!SIInsertWaitcnts(MLI, PDT, AA, MF).run())
    return PreservedAnalyses::all();

      .preserve<AAManager>();
}
3625
// Main pass driver: initializes per-function state, inserts the required
// wait-count instructions block by block (iterating in reverse post order
// until a fixed point is reached), then performs end-of-function cleanups
// (s_dcache_wb before program end, expert-scheduling-mode toggling, and VGPR
// deallocation before S_ENDPGM). Returns true iff the function was modified.
//
// NOTE(review): this listing came from a rendered source page and a number of
// physical lines were dropped during extraction (e.g. the initialization that
// follows MRI, the '?' arm of the expert-mode conditional, the iterator 'I'
// declarations preceding both isMetaInstruction() skip loops, the loop header
// over extended counter types, one line of the expert-mode DepCtr encoding,
// the right-hand operand of the occupancy comparison, and the S_SENDMSG
// immediate). Consult the upstream file before editing this function.
bool SIInsertWaitcnts::run() {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();


  // Initialize hardware limits first, as they're needed by the generators.
  Limits = AMDGPU::HardwareLimits(IV);

  if (ST->hasExtendedWaitCounts()) {
    IsExpertMode = ST->hasExpertSchedulingMode() &&
                   (ExpertSchedulingModeFlag.getNumOccurrences()
                        : MF.getFunction()
                              .getFnAttribute("amdgpu-expert-scheduling-mode")
                              .getValueAsBool());
    MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
    // Initialize WCG per MF. It contains state that depends on MF attributes.
    WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, &Limits,
                                                      IsExpertMode);
  } else {
    MaxCounter = NUM_NORMAL_INST_CNTS;
    // Initialize WCG per MF. It contains state that depends on MF attributes.
    WCG = std::make_unique<WaitcntGeneratorPreGFX12>(MF, NUM_NORMAL_INST_CNTS,
                                                     &Limits);
  }

  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);

  BlockInfos.clear();
  bool Modified = false;

  MachineBasicBlock &EntryBB = MF.front();

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    while (I != EntryBB.end() && I->isMetaInstruction())
      ++I;

    if (ST->hasExtendedWaitCounts()) {
      // Zero-wait every separate counter on entry to a non-entry function.
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
          .addImm(0);
        // LOAD_CNT and DS_CNT are already covered by the combined wait above;
        // STORE_CNT / X_CNT are skipped here as well.
        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
          continue;

        if (!ST->hasImageInsts() &&
            (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
          continue;

        BuildMI(EntryBB, I, DebugLoc(),
                TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(0);
      }
      if (IsExpertMode) {
        unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, *ST);
        BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR))
            .addImm(Enc);
      }
    } else {
      // Pre-GFX12: a single legacy s_waitcnt 0 covers vm/exp/lgkm counters.
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
    }

    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
    NonKernelInitialState->setStateOnFunctionEntryOrReturn();
    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

    Modified = true;
  }

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fix point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.try_emplace(MBB);

  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      MachineBasicBlock *MBB = BII->first;
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets) {
          Brackets = std::make_unique<WaitcntBrackets>(this);
        } else {
          // Reinitialize in-place. N.B. do not do this by assigning from a
          // temporary because the WaitcntBrackets class is large and it could
          // cause this function to use an unreasonable amount of stack space.
          Brackets->~WaitcntBrackets();
          new (Brackets.get()) WaitcntBrackets(this);
        }
      }

      if (ST->hasWaitXcnt())
        Modified |= removeRedundantSoftXcnts(*MBB);
      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPendingEvent()) {
        // Propagate the outgoing bracket state to each successor, either by
        // moving (first successor with no state yet) or by merging.
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto *SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            // A successor at or before the current position means a backedge;
            // another fixed-point iteration is required.
            if (SuccBII <= BII) {
              LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
              Repeat = true;
            }
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else {
            LLVM_DEBUG({
              dbgs() << "Try to merge ";
              MBB->printName(dbgs());
              dbgs() << " into ";
              Succ->printName(dbgs());
              dbgs() << '\n';
            });
            if (SuccBI.Incoming->merge(*Brackets)) {
              SuccBI.Dirty = true;
              if (SuccBII <= BII) {
                LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
                Repeat = true;
              }
            }
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  // On targets with scalar stores, make sure the scalar cache is written back
  // before every program-end point that a scalar store can reach.
  if (ST->hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(&MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed or else the next
      // wave to reuse the same scratch memory can be clobbered.
      //
      // Insert s_dcache_wb at wave termination points if there were any scalar
      // stores, and only if the cache hasn't already been flushed. This could
      // be improved by looking across blocks for flushes in postdominating
      // blocks from the stores but an explicitly requested flush is probably
      // very rare.
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(*I))
            SeenDCacheWB = false;

          // FIXME: It would be better to insert this before a waitcnt if any.
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }

  if (IsExpertMode) {
    // Enable expert scheduling on function entry. To satisfy ABI requirements
    // and to allow calls between function with different expert scheduling
    // settings, disable it around calls and before returns.

    while (I != EntryBB.end() && I->isMetaInstruction())
      ++I;
    setSchedulingMode(EntryBB, I, true);

    for (MachineInstr *MI : CallInsts) {
      MachineBasicBlock &MBB = *MI->getParent();
      setSchedulingMode(MBB, MI, false);
      setSchedulingMode(MBB, std::next(MI->getIterator()), true);
    }

    for (MachineInstr *MI : ReturnInsts)
      setSchedulingMode(*MI->getParent(), MI, false);

    Modified = true;
  }

  // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
  // This is done in different ways depending on how the VGPRs were allocated
  // (i.e. whether we're in dynamic VGPR mode or not).
  // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
  // waveslot limited kernel runs slower with the deallocation.
  if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
    for (auto [MI, _] : EndPgmInsts) {
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
              TII->get(AMDGPU::S_ALLOC_VGPR))
          .addImm(0);
      Modified = true;
    }
  } else if (!WCG->isOptNone() &&
             ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
             (MF.getFrameInfo().hasCalls() ||
              ST->getOccupancyWithNumVGPRs(
                  TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
                  /*IsDynamicVGPR=*/false) <
    for (auto [MI, Flag] : EndPgmInsts) {
      if (Flag) {
        if (ST->requiresNopBeforeDeallocVGPRs()) {
          BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                  TII->get(AMDGPU::S_NOP))
              .addImm(0);
        }
        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                TII->get(AMDGPU::S_SENDMSG))
        Modified = true;
      }
    }
  }

  // Reset per-function scratch state so nothing leaks into the next function.
  CallInsts.clear();
  ReturnInsts.clear();
  EndPgmInsts.clear();
  PreheadersToFlush.clear();
  SLoadAddresses.clear();

  return Modified;
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
#define _
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
#define AMDGPU_EVENT_NAME(Name)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
Represents the counter values to wait for in an s_waitcnt instruction.
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
static bool shouldExecute(CounterInfo &Counter)
static bool isCounterSet(CounterInfo &Info)
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:764
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
bool isMetaInstruction(QueryType Type=IgnoreBundle) const
Return true if this instruction doesn't produce any output in the form of executable instructions.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator end()
Definition MapVector.h:67
iterator find(const KeyT &Key)
Definition MapVector.h:154
iterator begin()
Definition MapVector.h:65
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition Pass.cpp:140
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:882
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
bool isDPMACCInstruction(unsigned Opc)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
bool empty() const
Definition BasicBlock.h:101
LLVM_ABI std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
Definition Sequence.h:325
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
APInt operator&(APInt a, const APInt &b)
Definition APInt.h:2138
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition Sequence.h:337
@ Wait
Definition Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2128
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2313
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
Definition CodeGen.h:157
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool operator|=(SparseBitVector< ElementSize > &LHS, const SparseBitVector< ElementSize > *RHS)
APInt operator|(APInt a, const APInt &b)
Definition APInt.h:2158
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
#define N
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.