SIInsertWaitcnts.cpp
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
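//
// Illustrative example (editorial addition, not part of the original file):
// for a vector load whose result is consumed later, the pass inserts a wait
// immediately before the first use of the loaded register, e.g.
//
//   global_load_b32 v0, v[2:3], off
//   ...                         ; independent work, load still in flight
//   s_wait_loadcnt 0x0          ; inserted by this pass (s_waitcnt vmcnt(0)
//                               ; on targets before gfx12)
//   v_add_nc_u32 v1, v0, v1     ; first use of v0
//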
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39#include "llvm/IR/Dominators.h"
43
44using namespace llvm;
45
46#define DEBUG_TYPE "si-insert-waitcnts"
47
48DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
49 "Force emit s_waitcnt expcnt(0) instrs");
50DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
51 "Force emit s_waitcnt lgkmcnt(0) instrs");
52DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
53 "Force emit s_waitcnt vmcnt(0) instrs");
54
55static cl::opt<bool>
56 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
57 cl::desc("Force all waitcnt instrs to be emitted as "
58 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
59 cl::init(false), cl::Hidden);
60
62 "amdgpu-waitcnt-load-forcezero",
63 cl::desc("Force all waitcnt load counters to wait until 0"),
64 cl::init(false), cl::Hidden);
65
67 "amdgpu-expert-scheduling-mode",
68 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
69 cl::init(false), cl::Hidden);
70
71namespace {
72// Class of object that encapsulates the latest instruction counter score
73// associated with an operand. Used to determine whether an
74// s_waitcnt instruction needs to be emitted.
75
76enum InstCounterType {
77 LOAD_CNT = 0, // VMcnt prior to gfx12.
78  DS_CNT,      // LGKMcnt prior to gfx12.
79 EXP_CNT, //
80 STORE_CNT, // VScnt in gfx10/gfx11.
81 NUM_NORMAL_INST_CNTS,
82 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
83 BVH_CNT, // gfx12+ only.
84 KM_CNT, // gfx12+ only.
85 X_CNT, // gfx1250.
86 NUM_EXTENDED_INST_CNTS,
87 VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only.
88 VM_VSRC, // gfx12+ expert mode only.
89 NUM_EXPERT_INST_CNTS,
90 NUM_INST_CNTS = NUM_EXPERT_INST_CNTS
91};
92} // namespace
93
94namespace llvm {
95template <> struct enum_iteration_traits<InstCounterType> {
96 static constexpr bool is_iterable = true;
97};
98} // namespace llvm
99
100namespace {
101// Return an iterator over all counters between LOAD_CNT (the first counter)
102// and \c MaxCounter (exclusive, default value yields an enumeration over
103// all counters).
104auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
105 return enum_seq(LOAD_CNT, MaxCounter);
106}
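// Usage sketch (editorial addition): the enum_iteration_traits specialization
// above lets counters be iterated directly with range-for, e.g.
//
//   for (InstCounterType T : inst_counter_types(NUM_EXTENDED_INST_CNTS))
//     ForceEmitWaitcnt[T] = false; // visits LOAD_CNT .. X_CNT only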
107
108// Get the maximum wait count value for a given counter type.
109static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
110 InstCounterType T) {
111 switch (T) {
112 case LOAD_CNT:
113 return Limits.LoadcntMax;
114 case DS_CNT:
115 return Limits.DscntMax;
116 case EXP_CNT:
117 return Limits.ExpcntMax;
118 case STORE_CNT:
119 return Limits.StorecntMax;
120 case SAMPLE_CNT:
121 return Limits.SamplecntMax;
122 case BVH_CNT:
123 return Limits.BvhcntMax;
124 case KM_CNT:
125 return Limits.KmcntMax;
126 case X_CNT:
127 return Limits.XcntMax;
128 case VA_VDST:
129 return Limits.VaVdstMax;
130 case VM_VSRC:
131 return Limits.VmVsrcMax;
132 default:
133 return 0;
134 }
135}
136
137static bool isSoftXcnt(MachineInstr &MI) {
138 return MI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft;
139}
140
141static bool isAtomicRMW(MachineInstr &MI) {
142 return (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) && MI.mayLoad() &&
143 MI.mayStore();
144}
145
146enum class AtomicRMWState {
147 NewBlock, // Start of a new atomic RMW block
148 InsideBlock, // Middle of an existing block
149 NotInBlock // Not in an atomic RMW block
150};
151
152/// Integer IDs used to track vector memory locations we may have to wait on.
153/// Encoded as u16 chunks:
154///
155/// [0, REGUNITS_END ): MCRegUnit
156/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs
157///
158/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
159/// It gives (1 << 16) entries per category, which is more than enough
160/// for all register units. MCPhysReg is u16 so we don't even support >u16
161/// physical register numbers at this time, let alone >u16 register units.
162/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
163/// is enough for all register units.
164using VMEMID = uint32_t;
165
166enum : VMEMID {
167 TRACKINGID_RANGE_LEN = (1 << 16),
168
169 // Important: MCRegUnits must always be tracked starting from 0, as we
170 // need to be able to convert between a MCRegUnit and a VMEMID freely.
171 REGUNITS_BEGIN = 0,
172 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
173
174 // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
175 // entry, which is updated for all LDS DMA operations encountered.
176 // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
177 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
178 LDSDMA_BEGIN = REGUNITS_END,
179 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
180};
181
182/// Convert a MCRegUnit to a VMEMID.
183static constexpr VMEMID toVMEMID(MCRegUnit RU) {
184 return static_cast<unsigned>(RU);
185}
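// Sanity-check sketch (editorial addition): these asserts only restate the
// u16-chunk layout documented above and hold for the enum as written.
static_assert(REGUNITS_END - REGUNITS_BEGIN == TRACKINGID_RANGE_LEN,
              "register units occupy the first u16 chunk");
static_assert(LDSDMA_BEGIN == REGUNITS_END &&
                  LDSDMA_END - LDSDMA_BEGIN == NUM_LDSDMA,
              "LDS DMA IDs occupy the following u16 chunk");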
186
187#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
188 DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \
189 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
190 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
191 DECL(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */ \
192 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
193 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
194 DECL(VMEM_GROUP) /* vmem group */ \
195 DECL(LDS_ACCESS) /* lds read & write */ \
196 DECL(GDS_ACCESS) /* gds read & write */ \
197 DECL(SQ_MESSAGE) /* send message */ \
198 DECL(SCC_WRITE) /* write to SCC from barrier */ \
199 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
200 DECL(SMEM_GROUP) /* scalar-memory group */ \
201 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
202 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
203 DECL(EXP_POS_ACCESS) /* write to export position */ \
204 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
205 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
206 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */ \
207 DECL(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */ \
208 DECL(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */ \
209 DECL(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */ \
210 DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \
211 DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \
212 DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \
213 DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */
214
215// clang-format off
216#define AMDGPU_EVENT_ENUM(Name) Name,
217enum WaitEventType {
218  AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
219  NUM_WAIT_EVENTS
220};
221#undef AMDGPU_EVENT_ENUM
222} // namespace
223
224namespace llvm {
225template <> struct enum_iteration_traits<WaitEventType> {
226 static constexpr bool is_iterable = true;
227};
228} // namespace llvm
229
230namespace {
231
232/// Return an iterator over all events between VMEM_ACCESS (the first event)
233/// and \c MaxEvent (exclusive, default value yields an enumeration over
234/// all events).
235auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
236 return enum_seq(VMEM_ACCESS, MaxEvent);
237}
238
239#define AMDGPU_EVENT_NAME(Name) #Name,
240static constexpr StringLiteral WaitEventTypeName[] = {
241    AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
242};
243#undef AMDGPU_EVENT_NAME
244static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
245 return WaitEventTypeName[Event];
246}
247// clang-format on
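// Expansion sketch (editorial addition): the X-macro above stamps out one
// entry per event, so the two invocations expand roughly to
//
//   enum WaitEventType { VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, ...,
//                        VGPR_VMEM_READ, NUM_WAIT_EVENTS };
//   static constexpr StringLiteral WaitEventTypeName[] = {
//       "VMEM_ACCESS", "VMEM_SAMPLER_READ_ACCESS", ..., "VGPR_VMEM_READ"};
//
// keeping the enum and its name table in sync by construction.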
248
249// Enumerate different types of result-returning VMEM operations. Although
250// s_waitcnt orders them all with a single vmcnt counter, in the absence of
251// s_waitcnt only instructions of the same VmemType are guaranteed to write
252// their results in order -- so there is no need to insert an s_waitcnt between
253// two instructions of the same type that write the same vgpr.
254enum VmemType {
255 // BUF instructions and MIMG instructions without a sampler.
256 VMEM_NOSAMPLER,
257 // MIMG instructions with a sampler.
258 VMEM_SAMPLER,
259 // BVH instructions
260 VMEM_BVH,
261 NUM_VMEM_TYPES
262};
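// Example (editorial addition): in the absence of waits, ordering is only
// guaranteed within one VmemType, e.g.
//
//   buffer_load_dword v0, ...   ; VMEM_NOSAMPLER
//   buffer_load_dword v0, ...   ; same type overwriting v0 -> no wait needed
//   image_sample v0, ...        ; VMEM_SAMPLER writing v0 -> may complete out
//                               ; of order, so a wait is needed first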
263
264// Maps values of InstCounterType to the instruction that waits on that
265// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
266// returns true, and does not cover VA_VDST or VM_VSRC.
267static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
268 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
269 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
270 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
271
272static bool updateVMCntOnly(const MachineInstr &Inst) {
273 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
274         SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst);
275}
276
277#ifndef NDEBUG
278static bool isNormalMode(InstCounterType MaxCounter) {
279 return MaxCounter == NUM_NORMAL_INST_CNTS;
280}
281#endif // NDEBUG
282
283VmemType getVmemType(const MachineInstr &Inst) {
284 assert(updateVMCntOnly(Inst));
285 if (!SIInstrInfo::isImage(Inst))
286 return VMEM_NOSAMPLER;
287 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
288 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
289 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
290
291 if (BaseInfo->BVH)
292 return VMEM_BVH;
293
294 // We have to make an additional check for isVSAMPLE here since some
295 // instructions don't have a sampler, but are still classified as sampler
296 // instructions for the purposes of e.g. waitcnt.
297 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
298 return VMEM_SAMPLER;
299
300 return VMEM_NOSAMPLER;
301}
302
303unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
304 switch (T) {
305 case LOAD_CNT:
306 return Wait.LoadCnt;
307 case EXP_CNT:
308 return Wait.ExpCnt;
309 case DS_CNT:
310 return Wait.DsCnt;
311 case STORE_CNT:
312 return Wait.StoreCnt;
313 case SAMPLE_CNT:
314 return Wait.SampleCnt;
315 case BVH_CNT:
316 return Wait.BvhCnt;
317 case KM_CNT:
318 return Wait.KmCnt;
319 case X_CNT:
320 return Wait.XCnt;
321 case VA_VDST:
322 return Wait.VaVdst;
323 case VM_VSRC:
324 return Wait.VmVsrc;
325 default:
326 llvm_unreachable("bad InstCounterType");
327 }
328}
329
330void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
331 unsigned &WC = getCounterRef(Wait, T);
332 WC = std::min(WC, Count);
333}
334
335void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
336 getCounterRef(Wait, T) = ~0u;
337}
338
339unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
340 return getCounterRef(Wait, T);
341}
342
343/// A small set of events.
344class WaitEventSet {
345 unsigned Mask = 0;
346
347public:
348 WaitEventSet() = default;
349 explicit constexpr WaitEventSet(WaitEventType Event) {
350 static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,
351 "Not enough bits in Mask for all the events");
352 Mask |= 1 << Event;
353 }
354 constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
355 for (auto &E : Events) {
356 Mask |= 1 << E;
357 }
358 }
359 void insert(const WaitEventType &Event) { Mask |= 1 << Event; }
360 void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }
361 void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
362 bool contains(const WaitEventType &Event) const {
363 return Mask & (1 << Event);
364 }
365 /// \Returns true if this set contains all elements of \p Other.
366 bool contains(const WaitEventSet &Other) const {
367 return (~Mask & Other.Mask) == 0;
368 }
369 /// \Returns the intersection of this and \p Other.
370 WaitEventSet operator&(const WaitEventSet &Other) const {
371 auto Copy = *this;
372 Copy.Mask &= Other.Mask;
373 return Copy;
374 }
375 /// \Returns the union of this and \p Other.
376 WaitEventSet operator|(const WaitEventSet &Other) const {
377 auto Copy = *this;
378 Copy.Mask |= Other.Mask;
379 return Copy;
380 }
381 /// This set becomes the union of this and \p Other.
382 WaitEventSet &operator|=(const WaitEventSet &Other) {
383 Mask |= Other.Mask;
384 return *this;
385 }
386 /// This set becomes the intersection of this and \p Other.
387 WaitEventSet &operator&=(const WaitEventSet &Other) {
388 Mask &= Other.Mask;
389 return *this;
390 }
391 bool operator==(const WaitEventSet &Other) const {
392 return Mask == Other.Mask;
393 }
394 bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }
395 bool empty() const { return Mask == 0; }
396 /// \Returns true if the set contains more than one element.
397 bool twoOrMore() const { return Mask & (Mask - 1); }
398 operator bool() const { return !empty(); }
399 void print(raw_ostream &OS) const {
400 ListSeparator LS(", ");
401 for (WaitEventType Event : wait_events()) {
402 OS << LS << getWaitEventTypeName(Event);
403 }
404 }
405 LLVM_DUMP_METHOD void dump() const;
406};
407
408void WaitEventSet::dump() const {
409 print(dbgs());
410 dbgs() << "\n";
411}
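// Usage sketch (editorial addition, uses only the class above): WaitEventSet
// is a thin bitmask wrapper, so typical queries look like
//
//   WaitEventSet LoadEvents({VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS});
//   LoadEvents.insert(VMEM_BVH_READ_ACCESS);
//   if (LoadEvents.contains(VMEM_ACCESS) && LoadEvents.twoOrMore()) {
//     // more than one kind of load event is pending
//   }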
412
413// Mapping from event to counter according to the table masks.
414InstCounterType eventCounter(const WaitEventSet *masks, WaitEventType E) {
415 for (auto T : inst_counter_types()) {
416 if (masks[T].contains(E))
417 return T;
418 }
419 llvm_unreachable("event type has no associated counter");
420}
421
422class WaitcntBrackets;
423
424// This abstracts the logic for generating and updating S_WAIT* instructions
425// away from the analysis that determines where they are needed. This was
426// done because the set of counters and instructions for waiting on them
427// underwent a major shift with gfx12, sufficiently so that having this
428// abstraction allows the main analysis logic to be simpler than it would
429// otherwise have had to become.
430class WaitcntGenerator {
431protected:
432 const GCNSubtarget *ST = nullptr;
433 const SIInstrInfo *TII = nullptr;
434 AMDGPU::IsaVersion IV;
435 InstCounterType MaxCounter;
436 bool OptNone;
437 bool ExpandWaitcntProfiling = false;
438 const AMDGPU::HardwareLimits *Limits = nullptr;
439
440public:
441 WaitcntGenerator() = delete;
442 WaitcntGenerator(const WaitcntGenerator &) = delete;
443 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
444 const AMDGPU::HardwareLimits *Limits)
445 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
446 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
447 OptNone(MF.getFunction().hasOptNone() ||
448 MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
449 ExpandWaitcntProfiling(
450 MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
451 Limits(Limits) {}
452
453 // Return true if the current function should be compiled with no
454 // optimization.
455 bool isOptNone() const { return OptNone; }
456
457 const AMDGPU::HardwareLimits &getLimits() const { return *Limits; }
458
459 // Edits an existing sequence of wait count instructions according
460 // to an incoming Waitcnt value, which is itself updated to reflect
461 // any new wait count instructions which may need to be generated by
462 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
463 // were made.
464 //
465  // This editing will usually just update operands, but it may also
466 // delete instructions if the incoming Wait value indicates they are not
467 // needed. It may also remove existing instructions for which a wait
468 // is needed if it can be determined that it is better to generate new
469 // instructions later, as can happen on gfx12.
470 virtual bool
471 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
472 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
473                          MachineBasicBlock::instr_iterator It) const = 0;
474
475 // Transform a soft waitcnt into a normal one.
476 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
477
478 // Generates new wait count instructions according to the value of
479 // Wait, returning true if any new instructions were created.
480 // ScoreBrackets is used for profiling expansion.
481 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
482                                MachineBasicBlock::instr_iterator It,
483                                AMDGPU::Waitcnt Wait,
484 const WaitcntBrackets &ScoreBrackets) = 0;
485
486 // Returns an array of WaitEventSets which can be used to map values in
487 // WaitEventType to corresponding counter values in InstCounterType.
488 virtual const WaitEventSet *getWaitEventMask() const = 0;
489
490 // Returns a new waitcnt with all counters except VScnt set to 0. If
491 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
492 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
493
494 virtual ~WaitcntGenerator() = default;
495};
496
497class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
498 static constexpr const WaitEventSet
499 WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
500 WaitEventSet(
501 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
502 WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
503 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
504 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
505 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
506 WaitEventSet(),
507 WaitEventSet(),
508 WaitEventSet(),
509 WaitEventSet(),
510 WaitEventSet(),
511 WaitEventSet()};
512
513public:
514 using WaitcntGenerator::WaitcntGenerator;
515 bool
516 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
517 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
518 MachineBasicBlock::instr_iterator It) const override;
519
520 bool createNewWaitcnt(MachineBasicBlock &Block,
521                        MachineBasicBlock::instr_iterator It,
522                        AMDGPU::Waitcnt Wait,
523 const WaitcntBrackets &ScoreBrackets) override;
524
525 const WaitEventSet *getWaitEventMask() const override {
526 assert(ST);
527 return WaitEventMaskForInstPreGFX12;
528 }
529
530 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
531};
532
533class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
534protected:
535 bool IsExpertMode;
536 static constexpr const WaitEventSet
537 WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
538 WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
539 WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
540 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
541 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
542 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
543 WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
544 WaitEventSet({VMEM_BVH_READ_ACCESS}),
545 WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
546 WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
547 WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
548 VGPR_XDL_WRITE}),
549 WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
550
551public:
552 WaitcntGeneratorGFX12Plus() = delete;
553 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
554 InstCounterType MaxCounter,
555 const AMDGPU::HardwareLimits *Limits,
556 bool IsExpertMode)
557 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
558
559 bool
560 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
561 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
562 MachineBasicBlock::instr_iterator It) const override;
563
564 bool createNewWaitcnt(MachineBasicBlock &Block,
565                        MachineBasicBlock::instr_iterator It,
566                        AMDGPU::Waitcnt Wait,
567 const WaitcntBrackets &ScoreBrackets) override;
568
569 const WaitEventSet *getWaitEventMask() const override {
570 assert(ST);
571 return WaitEventMaskForInstGFX12Plus;
572 }
573
574 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
575};
576
577// Flags indicating which counters should be flushed in a loop preheader.
578struct PreheaderFlushFlags {
579 bool FlushVmCnt = false;
580 bool FlushDsCnt = false;
581};
582
583class SIInsertWaitcnts {
584public:
585 const GCNSubtarget *ST;
586 const SIInstrInfo *TII = nullptr;
587 const SIRegisterInfo *TRI = nullptr;
588 const MachineRegisterInfo *MRI = nullptr;
589 InstCounterType SmemAccessCounter;
590 InstCounterType MaxCounter;
591 bool IsExpertMode = false;
592
593private:
594 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
595 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
596 MachineLoopInfo *MLI;
597 MachinePostDominatorTree *PDT;
598 AliasAnalysis *AA = nullptr;
599
600 struct BlockInfo {
601 std::unique_ptr<WaitcntBrackets> Incoming;
602 bool Dirty = true;
603 };
604
605 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
606
607 bool ForceEmitWaitcnt[NUM_INST_CNTS];
608
609 std::unique_ptr<WaitcntGenerator> WCG;
610
611 // Remember call and return instructions in the function.
612 DenseSet<MachineInstr *> CallInsts;
613 DenseSet<MachineInstr *> ReturnInsts;
614
615 // Remember all S_ENDPGM instructions. The boolean flag is true if there might
616 // be outstanding stores but definitely no outstanding scratch stores, to help
617 // with insertion of DEALLOC_VGPRS messages.
618 DenseMap<MachineInstr *, bool> EndPgmInsts;
619
620 AMDGPU::HardwareLimits Limits;
621
622public:
623 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
624 AliasAnalysis *AA)
625 : MLI(MLI), PDT(PDT), AA(AA) {
626 (void)ForceExpCounter;
627 (void)ForceLgkmCounter;
628 (void)ForceVMCounter;
629 }
630
631 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
632
633 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
634 const WaitcntBrackets &Brackets);
635 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
636 const WaitcntBrackets &ScoreBrackets);
637 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
638 bool isDSRead(const MachineInstr &MI) const;
639 bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
640 bool run(MachineFunction &MF);
641
642 void setForceEmitWaitcnt() {
643// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
644// For debug builds, get the debug counter info and adjust if need be
645#ifndef NDEBUG
646 if (DebugCounter::isCounterSet(ForceExpCounter) &&
647 DebugCounter::shouldExecute(ForceExpCounter)) {
648 ForceEmitWaitcnt[EXP_CNT] = true;
649 } else {
650 ForceEmitWaitcnt[EXP_CNT] = false;
651 }
652
653 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
654 DebugCounter::shouldExecute(ForceLgkmCounter)) {
655 ForceEmitWaitcnt[DS_CNT] = true;
656 ForceEmitWaitcnt[KM_CNT] = true;
657 } else {
658 ForceEmitWaitcnt[DS_CNT] = false;
659 ForceEmitWaitcnt[KM_CNT] = false;
660 }
661
662 if (DebugCounter::isCounterSet(ForceVMCounter) &&
663 DebugCounter::shouldExecute(ForceVMCounter)) {
664 ForceEmitWaitcnt[LOAD_CNT] = true;
665 ForceEmitWaitcnt[SAMPLE_CNT] = true;
666 ForceEmitWaitcnt[BVH_CNT] = true;
667 } else {
668 ForceEmitWaitcnt[LOAD_CNT] = false;
669 ForceEmitWaitcnt[SAMPLE_CNT] = false;
670 ForceEmitWaitcnt[BVH_CNT] = false;
671 }
672
673 ForceEmitWaitcnt[VA_VDST] = false;
674 ForceEmitWaitcnt[VM_VSRC] = false;
675#endif // NDEBUG
676 }
677
678 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
679 // instruction.
680 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
681 switch (Inst.getOpcode()) {
682 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
683 case AMDGPU::GLOBAL_INV:
684 return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
685 // VGPRs
686 case AMDGPU::GLOBAL_WB:
687 case AMDGPU::GLOBAL_WBINV:
688 return VMEM_WRITE_ACCESS; // tracked using storecnt
689 default:
690 break;
691 }
692
693 // Maps VMEM access types to their corresponding WaitEventType.
694 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
695 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
696
698 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
699 // these should use VM_CNT.
700 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
701 return VMEM_ACCESS;
702 if (Inst.mayStore() &&
703 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
704 if (TII->mayAccessScratch(Inst))
705 return SCRATCH_WRITE_ACCESS;
706 return VMEM_WRITE_ACCESS;
707 }
708 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
709 return VMEM_ACCESS;
710 return VmemReadMapping[getVmemType(Inst)];
711 }
712
713 std::optional<WaitEventType>
714 getExpertSchedulingEventType(const MachineInstr &Inst) const;
715
716 bool isVmemAccess(const MachineInstr &MI) const;
717 bool generateWaitcntInstBefore(MachineInstr &MI,
718 WaitcntBrackets &ScoreBrackets,
719 MachineInstr *OldWaitcntInstr,
720 PreheaderFlushFlags FlushFlags);
721 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
722                       MachineBasicBlock::instr_iterator It,
723                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
724 MachineInstr *OldWaitcntInstr);
725 void updateEventWaitcntAfter(MachineInstr &Inst,
726 WaitcntBrackets *ScoreBrackets);
727 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
728 MachineBasicBlock *Block) const;
729 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
730 WaitcntBrackets &ScoreBrackets);
731 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
732 WaitcntBrackets &ScoreBrackets);
733 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
734 bool ExpertMode) const;
735 AtomicRMWState getAtomicRMWState(MachineInstr &MI,
736 AtomicRMWState PrevState) const;
737 const WaitEventSet *getWaitEventMask() const {
738 return WCG->getWaitEventMask();
739 }
740};
741
742// This object maintains the current score brackets of each wait counter, and
743// a per-register scoreboard for each wait counter.
744//
745// We also maintain the latest score for every event type that can change the
746// waitcnt in order to know if there are multiple types of events within
747// the brackets. When multiple types of event happen in the bracket,
748// wait count may get decreased out of order, therefore we need to put in
749// "s_waitcnt 0" before use.
750class WaitcntBrackets {
751public:
752 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
753 assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
754 }
755
756#ifndef NDEBUG
757 ~WaitcntBrackets() {
758 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
759 for (auto &[ID, Val] : VMem) {
760 if (Val.empty())
761 ++NumUnusedVmem;
762 }
763 for (auto &[ID, Val] : SGPRs) {
764 if (Val.empty())
765 ++NumUnusedSGPRs;
766 }
767
768 if (NumUnusedVmem || NumUnusedSGPRs) {
769 errs() << "WaitcntBracket had unused entries at destruction time: "
770 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
771 << " SGPR unused entries\n";
772 std::abort();
773 }
774 }
775#endif
776
777 bool isSmemCounter(InstCounterType T) const {
778 return T == Context->SmemAccessCounter || T == X_CNT;
779 }
780
781 unsigned getSgprScoresIdx(InstCounterType T) const {
782 assert(isSmemCounter(T) && "Invalid SMEM counter");
783 return T == X_CNT ? 1 : 0;
784 }
785
786 unsigned getScoreLB(InstCounterType T) const {
787 assert(T < NUM_INST_CNTS);
788 return ScoreLBs[T];
789 }
790
791 unsigned getScoreUB(InstCounterType T) const {
792 assert(T < NUM_INST_CNTS);
793 return ScoreUBs[T];
794 }
795
796 unsigned getScoreRange(InstCounterType T) const {
797 return getScoreUB(T) - getScoreLB(T);
798 }
799
800 unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
801 auto It = SGPRs.find(RU);
802 return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;
803 }
804
805 unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
806 auto It = VMem.find(TID);
807 return It != VMem.end() ? It->second.Scores[T] : 0;
808 }
809
810 bool merge(const WaitcntBrackets &Other);
811
812 bool counterOutOfOrder(InstCounterType T) const;
813 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
814 simplifyWaitcnt(Wait, Wait);
815 }
816 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
817 AMDGPU::Waitcnt &UpdateWait) const;
818 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
819 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
820 AMDGPU::Waitcnt &UpdateWait) const;
821 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
822 AMDGPU::Waitcnt &UpdateWait) const;
823
824 void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
825 AMDGPU::Waitcnt &Wait) const;
826 void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
827 AMDGPU::Waitcnt &Wait) const;
828 void tryClearSCCWriteEvent(MachineInstr *Inst);
829
830 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
831 void applyWaitcnt(InstCounterType T, unsigned Count);
832 void updateByEvent(WaitEventType E, MachineInstr &MI);
833
834 bool hasPendingEvent() const { return !PendingEvents.empty(); }
835 bool hasPendingEvent(WaitEventType E) const {
836 return PendingEvents.contains(E);
837 }
838 bool hasPendingEvent(InstCounterType T) const {
839 bool HasPending = PendingEvents & Context->getWaitEventMask()[T];
840 assert(HasPending == (getScoreRange(T) != 0) &&
841 "Expected no pending events iff scoreboard is empty");
842 return HasPending;
843 }
844
845 bool hasMixedPendingEvents(InstCounterType T) const {
846 WaitEventSet Events = PendingEvents & Context->getWaitEventMask()[T];
847 // Return true if more than one bit is set in Events.
848 return Events.twoOrMore();
849 }
850
851 bool hasPendingFlat() const {
852 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
853 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
854 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
855 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
856 }
857
858 void setPendingFlat() {
859 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
860 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
861 }
862
863 bool hasPendingGDS() const {
864 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
865 }
866
867 unsigned getPendingGDSWait() const {
868 return std::min(getScoreUB(DS_CNT) - LastGDS,
869 getWaitCountMax(Context->getLimits(), DS_CNT) - 1);
870 }
871
872 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
873
874  // Return true if there might be pending writes to the given VGPR by VMEM
875  // instructions with types different from V.
876 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
877 for (MCRegUnit RU : regunits(Reg)) {
878 auto It = VMem.find(toVMEMID(RU));
879 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
880 return true;
881 }
882 return false;
883 }
884
885 void clearVgprVmemTypes(MCPhysReg Reg) {
886 for (MCRegUnit RU : regunits(Reg)) {
887 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
888 It->second.VMEMTypes = 0;
889 if (It->second.empty())
890 VMem.erase(It);
891 }
892 }
893 }
894
895 void setStateOnFunctionEntryOrReturn() {
896 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
897 getWaitCountMax(Context->getLimits(), STORE_CNT));
898 PendingEvents |= Context->getWaitEventMask()[STORE_CNT];
899 }
900
901 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
902 return LDSDMAStores;
903 }
904
905 bool hasPointSampleAccel(const MachineInstr &MI) const;
906 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
907 MCPhysReg RU) const;
908
909 void print(raw_ostream &) const;
910 void dump() const { print(dbgs()); }
911
912  // Free up memory by removing empty entries from the DenseMaps that track
913  // register scores.
914 void purgeEmptyTrackingData();
915
916private:
917 struct MergeInfo {
918 unsigned OldLB;
919 unsigned OtherLB;
920 unsigned MyShift;
921 unsigned OtherShift;
922 };
923
924 void determineWaitForScore(InstCounterType T, unsigned Score,
925 AMDGPU::Waitcnt &Wait) const;
926
927 static bool mergeScore(const MergeInfo &M, unsigned &Score,
928 unsigned OtherScore);
929
930  iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
931    assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
932 if (!Context->TRI->isInAllocatableClass(Reg))
933 return {{}, {}};
934 const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
935 unsigned Size = Context->TRI->getRegSizeInBits(*RC);
936 if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
937 Reg = Context->TRI->get32BitRegister(Reg);
938 return Context->TRI->regunits(Reg);
939 }
940
941 void setScoreLB(InstCounterType T, unsigned Val) {
942 assert(T < NUM_INST_CNTS);
943 ScoreLBs[T] = Val;
944 }
945
946 void setScoreUB(InstCounterType T, unsigned Val) {
947 assert(T < NUM_INST_CNTS);
948 ScoreUBs[T] = Val;
949
950 if (T != EXP_CNT)
951 return;
952
953 if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))
954 ScoreLBs[EXP_CNT] =
955 ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);
956 }
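  // Example (editorial addition): the expcnt field is only a few bits wide
  // (maximum 7 on pre-gfx12 targets), so once more events are in flight than
  // the counter can express, the oldest ones become indistinguishable; bumping
  // the lower bound keeps the bracket width within getWaitCountMax(EXP_CNT).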
957
958 void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
959 const SIRegisterInfo *TRI = Context->TRI;
960 if (Reg == AMDGPU::SCC) {
961 SCCScore = Val;
962 } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
963 for (MCRegUnit RU : regunits(Reg))
964 VMem[toVMEMID(RU)].Scores[T] = Val;
965 } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
966 auto STy = getSgprScoresIdx(T);
967 for (MCRegUnit RU : regunits(Reg))
968 SGPRs[RU].Scores[STy] = Val;
969 } else {
970 llvm_unreachable("Register cannot be tracked/unknown register!");
971 }
972 }
973
974 void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
975 VMem[TID].Scores[T] = Val;
976 }
977
978 void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
979 unsigned Val);
980
981 const SIInsertWaitcnts *Context;
982
983 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
984 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
985 WaitEventSet PendingEvents;
986 // Remember the last flat memory operation.
987 unsigned LastFlat[NUM_INST_CNTS] = {0};
988 // Remember the last GDS operation.
989 unsigned LastGDS = 0;
990
991 // The score tracking logic is fragmented as follows:
992 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
993 // - SGPRs: SGPR RegUnits
994 // - SCC: Non-allocatable and not general purpose: not a SGPR.
995 //
996 // For the VMem case, if the key is within the range of LDS DMA IDs,
997 // then the corresponding index into the `LDSDMAStores` vector below is:
998 // Key - LDSDMA_BEGIN - 1
999 // This is because LDSDMA_BEGIN is a generic entry and does not have an
1000 // associated MachineInstr.
1001 //
1002  // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
1003
1004 struct VMEMInfo {
1005 // Scores for all instruction counters.
1006 std::array<unsigned, NUM_INST_CNTS> Scores = {0};
1007 // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
1008 unsigned VMEMTypes = 0;
1009
1010    bool empty() const { return !VMEMTypes && all_of(Scores, [](unsigned S) { return S == 0; }); }
1011 };
1012
1013 struct SGPRInfo {
1014    // Waitcnt scores for every SGPR; only DS_CNT (corresponding to LGKMcnt
1015    // pre-gfx12) or KM_CNT (gfx12+ only) and X_CNT (gfx1250) are relevant.
1016 // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps
1017 // the X_CNT score.
1018 std::array<unsigned, 2> Scores = {0};
1019
1020 bool empty() const { return !Scores[0] && !Scores[1]; }
1021 };
1022
1023 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
1024 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
1025
1026 // Reg score for SCC.
1027 unsigned SCCScore = 0;
1028 // The unique instruction that has an SCC write pending, if there is one.
1029 const MachineInstr *PendingSCCWrite = nullptr;
1030
1031 // Store representative LDS DMA operations. The only useful info here is
1032 // alias info. One store is kept per unique AAInfo.
1033 SmallVector<const MachineInstr *> LDSDMAStores;
1034};
1035
1036class SIInsertWaitcntsLegacy : public MachineFunctionPass {
1037public:
1038 static char ID;
1039 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
1040
1041 bool runOnMachineFunction(MachineFunction &MF) override;
1042
1043 StringRef getPassName() const override {
1044 return "SI insert wait instructions";
1045 }
1046
1047 void getAnalysisUsage(AnalysisUsage &AU) const override {
1048 AU.setPreservesCFG();
1049 AU.addRequired<MachineLoopInfoWrapperPass>();
1050 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
1051 AU.addUsedIfAvailable<AAResultsWrapperPass>();
1052 AU.addPreserved<AAResultsWrapperPass>();
1053    MachineFunctionPass::getAnalysisUsage(AU);
1054  }
1055};
1056
1057} // end anonymous namespace
1058
1059void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
1060 InstCounterType CntTy, unsigned Score) {
1061 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
1062}
1063
1064// Return true if the subtarget is one that enables Point Sample Acceleration
1065// and the MachineInstr passed in is one to which it might be applied (the
1066// hardware makes this decision based on several factors, but we can't determine
1067// this at compile time, so we have to assume it might be applied if the
1068// instruction supports it).
1069bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
1070 if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
1071 return false;
1072
1073 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
1074 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1075      AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
1076  return BaseInfo->PointSampleAccel;
1077}
1078
1079// Return true if the subtarget enables Point Sample Acceleration, the supplied
1080// MachineInstr is one to which it might be applied, and the supplied register
1081// has outstanding writes of vmem-types different from VMEM_NOSAMPLER
1082// (this is the type that a point sample accelerated instruction effectively
1083// becomes)
1084bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
1085 MCPhysReg Reg) const {
1086 if (!hasPointSampleAccel(MI))
1087 return false;
1088
1089 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
1090}
1091
1092void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
1093 InstCounterType T = eventCounter(Context->getWaitEventMask(), E);
1094 assert(T < Context->MaxCounter);
1095
1096 unsigned UB = getScoreUB(T);
1097 unsigned CurrScore = UB + 1;
1098 if (CurrScore == 0)
1099 report_fatal_error("InsertWaitcnt score wraparound");
1100  // PendingEvents and ScoreUB need to be updated regardless of whether this
1101  // event changes the score of a register or not.
1102  // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
1103 PendingEvents.insert(E);
1104 setScoreUB(T, CurrScore);
1105
1106 const SIRegisterInfo *TRI = Context->TRI;
1107 const MachineRegisterInfo *MRI = Context->MRI;
1108 const SIInstrInfo *TII = Context->TII;
1109
1110 if (T == EXP_CNT) {
1111 // Put score on the source vgprs. If this is a store, just use those
1112 // specific register(s).
1113 if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
1114 // All GDS operations must protect their address register (same as
1115 // export.)
1116 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
1117 setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);
1118
1119 if (Inst.mayStore()) {
1120 if (const auto *Data0 =
1121 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
1122 setScoreByOperand(*Data0, EXP_CNT, CurrScore);
1123 if (const auto *Data1 =
1124 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
1125 setScoreByOperand(*Data1, EXP_CNT, CurrScore);
1126 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
1127 Inst.getOpcode() != AMDGPU::DS_APPEND &&
1128 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
1129 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1130 for (const MachineOperand &Op : Inst.all_uses()) {
1131 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1132 setScoreByOperand(Op, EXP_CNT, CurrScore);
1133 }
1134 }
1135 } else if (TII->isFLAT(Inst)) {
1136 if (Inst.mayStore()) {
1137 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1138 EXP_CNT, CurrScore);
1139 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1140 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1141 EXP_CNT, CurrScore);
1142 }
1143 } else if (TII->isMIMG(Inst)) {
1144 if (Inst.mayStore()) {
1145 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
1146 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1147 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1148 EXP_CNT, CurrScore);
1149 }
1150 } else if (TII->isMTBUF(Inst)) {
1151 if (Inst.mayStore())
1152 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
1153 } else if (TII->isMUBUF(Inst)) {
1154 if (Inst.mayStore()) {
1155 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
1156 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1157 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1158 EXP_CNT, CurrScore);
1159 }
1160 } else if (TII->isLDSDIR(Inst)) {
1161 // LDSDIR instructions attach the score to the destination.
1162 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
1163 EXP_CNT, CurrScore);
1164 } else {
1165 if (TII->isEXP(Inst)) {
1166 // For export the destination registers are really temps that
1167 // can be used as the actual source after export patching, so
1168 // we need to treat them like sources and set the EXP_CNT
1169 // score.
1170 for (MachineOperand &DefMO : Inst.all_defs()) {
1171 if (TRI->isVGPR(*MRI, DefMO.getReg())) {
1172 setScoreByOperand(DefMO, EXP_CNT, CurrScore);
1173 }
1174 }
1175 }
1176 for (const MachineOperand &Op : Inst.all_uses()) {
1177 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1178 setScoreByOperand(Op, EXP_CNT, CurrScore);
1179 }
1180 }
1181 } else if (T == X_CNT) {
1182 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1183 if (PendingEvents.contains(OtherEvent)) {
1184 // Hardware inserts an implicit xcnt between interleaved
1185 // SMEM and VMEM operations. So there will never be
1186 // outstanding address translations for both SMEM and
1187 // VMEM at the same time.
1188 setScoreLB(T, getScoreUB(T) - 1);
1189 PendingEvents.remove(OtherEvent);
1190 }
1191 for (const MachineOperand &Op : Inst.all_uses())
1192 setScoreByOperand(Op, T, CurrScore);
1193 } else if (T == VA_VDST || T == VM_VSRC) {
1194 // Match the score to the VGPR destination or source registers as
1195 // appropriate
1196 for (const MachineOperand &Op : Inst.operands()) {
1197 if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
1198 (T == VM_VSRC && Op.isDef()))
1199 continue;
1200 if (TRI->isVectorRegister(*Context->MRI, Op.getReg()))
1201 setScoreByOperand(Op, T, CurrScore);
1202 }
1203 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1204 // Match the score to the destination registers.
1205 //
1206 // Check only explicit operands. Stores, especially spill stores, include
1207 // implicit uses and defs of their super registers which would create an
1208 // artificial dependency, while these are there only for register liveness
1209 // accounting purposes.
1210 //
1211    // There are special cases where implicit register defs exist, such as M0 or
1212    // VCC, but none occur with memory instructions.
1213 for (const MachineOperand &Op : Inst.defs()) {
1214 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1215 if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper
1216 continue;
1217 if (updateVMCntOnly(Inst)) {
1218 // updateVMCntOnly should only leave us with VGPRs
1219 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1220          // defs. That's required for a sane index into `VMEMTypes` below.
1221 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
1222 VmemType V = getVmemType(Inst);
1223 unsigned char TypesMask = 1 << V;
1224 // If instruction can have Point Sample Accel applied, we have to flag
1225 // this with another potential dependency
1226 if (hasPointSampleAccel(Inst))
1227 TypesMask |= 1 << VMEM_NOSAMPLER;
1228 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1229 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1230 }
1231 }
1232 setScoreByOperand(Op, T, CurrScore);
1233 }
1234 if (Inst.mayStore() &&
1235 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
1236      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS
1237      // they write can be accessed. A load from LDS to VMEM does not need a wait.
1238 //
1239 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1240 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1241 // store. The "Slot" is the index into LDSDMAStores + 1.
1242 unsigned Slot = 0;
1243 for (const auto *MemOp : Inst.memoperands()) {
1244 if (!MemOp->isStore() ||
1245 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1246 continue;
1247 // Comparing just AA info does not guarantee memoperands are equal
1248 // in general, but this is so for LDS DMA in practice.
1249 auto AAI = MemOp->getAAInfo();
1250        // Alias scope information gives a way to definitively identify the
1251        // original memory object; in practice it is produced by the module LDS
1252        // lowering pass. If there is no scope available we will not be able
1253        // to disambiguate LDS aliasing, as after module lowering all LDS
1254        // is squashed into a single big object.
1255 if (!AAI || !AAI.Scope)
1256 break;
1257 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1258 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1259 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1260 Slot = I + 1;
1261 break;
1262 }
1263 }
1264 }
1265 if (Slot)
1266 break;
1267 // The slot may not be valid because it can be >= NUM_LDSDMA which
1268 // means the scoreboard cannot track it. We still want to preserve the
1269 // MI in order to check alias information, though.
1270 LDSDMAStores.push_back(&Inst);
1271 Slot = LDSDMAStores.size();
1272 break;
1273 }
1274 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1275 if (Slot && Slot < NUM_LDSDMA)
1276 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1277 }
1278
1279    if (E == SCC_WRITE) {
1280      setRegScore(AMDGPU::SCC, T, CurrScore);
1281 PendingSCCWrite = &Inst;
1282 }
1283 }
1284}
1285
1286void WaitcntBrackets::print(raw_ostream &OS) const {
1287 const GCNSubtarget *ST = Context->ST;
1288
1289 OS << '\n';
1290 for (auto T : inst_counter_types(Context->MaxCounter)) {
1291 unsigned SR = getScoreRange(T);
1292
1293 switch (T) {
1294 case LOAD_CNT:
1295 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1296 << SR << "):";
1297 break;
1298 case DS_CNT:
1299 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1300 << SR << "):";
1301 break;
1302 case EXP_CNT:
1303 OS << " EXP_CNT(" << SR << "):";
1304 break;
1305 case STORE_CNT:
1306 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1307 << SR << "):";
1308 break;
1309 case SAMPLE_CNT:
1310 OS << " SAMPLE_CNT(" << SR << "):";
1311 break;
1312 case BVH_CNT:
1313 OS << " BVH_CNT(" << SR << "):";
1314 break;
1315 case KM_CNT:
1316 OS << " KM_CNT(" << SR << "):";
1317 break;
1318 case X_CNT:
1319 OS << " X_CNT(" << SR << "):";
1320 break;
1321 case VA_VDST:
1322 OS << " VA_VDST(" << SR << "): ";
1323 break;
1324 case VM_VSRC:
1325 OS << " VM_VSRC(" << SR << "): ";
1326 break;
1327 default:
1328 OS << " UNKNOWN(" << SR << "):";
1329 break;
1330 }
1331
1332 if (SR != 0) {
1333 // Print vgpr scores.
1334 unsigned LB = getScoreLB(T);
1335
1336 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1337 sort(SortedVMEMIDs);
1338
1339 for (auto ID : SortedVMEMIDs) {
1340 unsigned RegScore = VMem.at(ID).Scores[T];
1341 if (RegScore <= LB)
1342 continue;
1343 unsigned RelScore = RegScore - LB - 1;
1344 if (ID < REGUNITS_END) {
1345 OS << ' ' << RelScore << ":vRU" << ID;
1346 } else {
1347 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1348 "Unhandled/unexpected ID value!");
1349 OS << ' ' << RelScore << ":LDSDMA" << ID;
1350 }
1351 }
1352
1353 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1354 if (isSmemCounter(T)) {
1355 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1356 sort(SortedSMEMIDs);
1357 for (auto ID : SortedSMEMIDs) {
1358 unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)];
1359 if (RegScore <= LB)
1360 continue;
1361 unsigned RelScore = RegScore - LB - 1;
1362 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1363 }
1364 }
1365
1366 if (T == KM_CNT && SCCScore > 0)
1367 OS << ' ' << SCCScore << ":scc";
1368 }
1369 OS << '\n';
1370 }
1371
1372 OS << "Pending Events: ";
1373 if (hasPendingEvent()) {
1374 ListSeparator LS;
1375 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1376 if (hasPendingEvent((WaitEventType)I)) {
1377 OS << LS << WaitEventTypeName[I];
1378 }
1379 }
1380 } else {
1381 OS << "none";
1382 }
1383 OS << '\n';
1384
1385 OS << '\n';
1386}
1387
1388/// Simplify \p UpdateWait by removing waits that are redundant based on the
1389/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1390void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1391 AMDGPU::Waitcnt &UpdateWait) const {
1392 simplifyWaitcnt(LOAD_CNT, UpdateWait.LoadCnt);
1393 simplifyWaitcnt(EXP_CNT, UpdateWait.ExpCnt);
1394 simplifyWaitcnt(DS_CNT, UpdateWait.DsCnt);
1395 simplifyWaitcnt(STORE_CNT, UpdateWait.StoreCnt);
1396 simplifyWaitcnt(SAMPLE_CNT, UpdateWait.SampleCnt);
1397 simplifyWaitcnt(BVH_CNT, UpdateWait.BvhCnt);
1398 simplifyWaitcnt(KM_CNT, UpdateWait.KmCnt);
1399 simplifyXcnt(CheckWait, UpdateWait);
1400 simplifyWaitcnt(VA_VDST, UpdateWait.VaVdst);
1401 simplifyVmVsrc(CheckWait, UpdateWait);
1402}
1403
1404void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1405 unsigned &Count) const {
1406 // The number of outstanding events for this type, T, can be calculated
1407 // as (UB - LB). If the current Count is greater than or equal to the number
1408 // of outstanding events, then the wait for this counter is redundant.
1409 if (Count >= getScoreRange(T))
1410 Count = ~0u;
1411}
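// Example (editorial addition): if only two LOAD_CNT events are outstanding
// (getScoreRange(LOAD_CNT) == 2), then a requested wait of vmcnt(2) or looser
// is already satisfied and is dropped by setting Count to ~0u, while vmcnt(1)
// or vmcnt(0) is kept.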
1412
1413void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1414 AMDGPU::Waitcnt &UpdateWait) const {
1415 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1416  // optimizations. On entry to a block with multiple predecessors, there may
1417 // be pending SMEM and VMEM events active at the same time.
1418 // In such cases, only clear one active event at a time.
1419 // TODO: Revisit xcnt optimizations for gfx1250.
1420 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1421 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1422 // zero.
1423 if (CheckWait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
1424 UpdateWait.XCnt = ~0u;
1425  // If we have a pending store we cannot optimize XCnt because we do not wait
1426  // for stores. VMEM loads return in order, so if we only have loads, XCnt is
1427  // decremented to the same number as LOADCnt.
1428 if (CheckWait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1429 !hasPendingEvent(STORE_CNT) && CheckWait.XCnt >= CheckWait.LoadCnt)
1430 UpdateWait.XCnt = ~0u;
1431 simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
1432}
1433
1434void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1435 AMDGPU::Waitcnt &UpdateWait) const {
1436 // Waiting for some counters implies waiting for VM_VSRC, since an
1437 // instruction that decrements a counter on completion would have
1438 // decremented VM_VSRC once its VGPR operands had been read.
1439 if (CheckWait.VmVsrc >=
1440 std::min({CheckWait.LoadCnt, CheckWait.StoreCnt, CheckWait.SampleCnt,
1441 CheckWait.BvhCnt, CheckWait.DsCnt}))
1442 UpdateWait.VmVsrc = ~0u;
1443 simplifyWaitcnt(VM_VSRC, UpdateWait.VmVsrc);
1444}
1445
1446void WaitcntBrackets::purgeEmptyTrackingData() {
1447 for (auto &[K, V] : make_early_inc_range(VMem)) {
1448 if (V.empty())
1449 VMem.erase(K);
1450 }
1451 for (auto &[K, V] : make_early_inc_range(SGPRs)) {
1452 if (V.empty())
1453 SGPRs.erase(K);
1454 }
1455}
1456
1457void WaitcntBrackets::determineWaitForScore(InstCounterType T,
1458 unsigned ScoreToWait,
1459 AMDGPU::Waitcnt &Wait) const {
1460 const unsigned LB = getScoreLB(T);
1461 const unsigned UB = getScoreUB(T);
1462
1463 // If the score falls within the bracket, we need a waitcnt.
1464 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1465 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1466 !Context->ST->hasFlatLgkmVMemCountInOrder()) {
1467 // If there is a pending FLAT operation, and this is a VMem or LGKM
1468 // waitcnt and the target can report early completion, then we need
1469 // to force a waitcnt 0.
1470 addWait(Wait, T, 0);
1471 } else if (counterOutOfOrder(T)) {
1472      // The counter can get decremented out of order when there are multiple
1473      // event types in the bracket, so also emit an s_wait with a conservative
1474      // value of 0 for this counter.
1475 addWait(Wait, T, 0);
1476 } else {
1477 // If a counter has been maxed out avoid overflow by waiting for
1478 // MAX(CounterType) - 1 instead.
1479 unsigned NeededWait = std::min(
1480 UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
1481 addWait(Wait, T, NeededWait);
1482 }
1483 }
1484}
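// Worked example (editorial addition): with three loads outstanding, the
// LOAD_CNT bracket is (LB, UB] = (0, 3] and the loads carry scores 1, 2 and 3.
// Waiting for the register written by the first load gives
// NeededWait = UB - ScoreToWait = 3 - 1 = 2, i.e. "at most two loads still in
// flight", which becomes s_wait_loadcnt 2 (vmcnt(2) on pre-gfx12 targets).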
1485
1486void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
1487 AMDGPU::Waitcnt &Wait) const {
1488 if (Reg == AMDGPU::SCC) {
1489 determineWaitForScore(T, SCCScore, Wait);
1490 } else {
1491 bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg);
1492 for (MCRegUnit RU : regunits(Reg))
1493 determineWaitForScore(
1494 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1495 Wait);
1496 }
1497}
1498
1499void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
1500 AMDGPU::Waitcnt &Wait) const {
1501 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1502 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1503}
1504
1505void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1506 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1507 // SCC has landed
1508 if (PendingSCCWrite &&
1509 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1510 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1511 WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
1512 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1513 if ((PendingEvents & Context->getWaitEventMask()[KM_CNT]) ==
1514 SCC_WRITE_PendingEvent) {
1515 setScoreLB(KM_CNT, getScoreUB(KM_CNT));
1516 }
1517
1518 PendingEvents.remove(SCC_WRITE_PendingEvent);
1519 PendingSCCWrite = nullptr;
1520 }
1521}
1522
1523void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1524 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1525 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1526 applyWaitcnt(DS_CNT, Wait.DsCnt);
1527 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1528 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1529 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1530 applyWaitcnt(KM_CNT, Wait.KmCnt);
1531 applyWaitcnt(X_CNT, Wait.XCnt);
1532 applyWaitcnt(VA_VDST, Wait.VaVdst);
1533 applyWaitcnt(VM_VSRC, Wait.VmVsrc);
1534}
1535
1536void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1537 const unsigned UB = getScoreUB(T);
1538 if (Count >= UB)
1539 return;
1540 if (Count != 0) {
1541 if (counterOutOfOrder(T))
1542 return;
1543 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1544 } else {
1545 setScoreLB(T, UB);
1546 PendingEvents.remove(Context->getWaitEventMask()[T]);
1547 }
1548
1549 if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1550 if (!hasMixedPendingEvents(X_CNT))
1551 applyWaitcnt(X_CNT, 0);
1552 else
1553 PendingEvents.remove(SMEM_GROUP);
1554 }
1555 if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1556 !hasPendingEvent(STORE_CNT)) {
1557 if (!hasMixedPendingEvents(X_CNT))
1558 applyWaitcnt(X_CNT, Count);
1559 else if (Count == 0)
1560 PendingEvents.remove(VMEM_GROUP);
1561 }
1562}
1563
1564// Where there are multiple types of event in the bracket of a counter,
1565// the decrement may go out of order.
1566bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1567  // Scalar memory reads can always go out of order.
1568 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1569 (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1570 return true;
1571
1572 // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
1573 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
1574 // out-of-order completion.
1575 if (T == LOAD_CNT) {
1576 unsigned Events = hasPendingEvent(T);
1577 // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
1578 // events
1579 Events &= ~(1 << GLOBAL_INV_ACCESS);
1580 // Return true only if there are still multiple event types after removing
1581 // GLOBAL_INV
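    // Events & (Events - 1) clears the lowest set bit, so the result is
    // non-zero exactly when more than one event type is still pending,
    // e.g. Events = 0b0101 -> 0b0100 (mixed), Events = 0b0100 -> 0 (single).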
1582 return Events & (Events - 1);
1583 }
1584
1585 return hasMixedPendingEvents(T);
1586}
1587
1588INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1589 false, false)
1592 INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1593                     false, false)
1594
1595char SIInsertWaitcntsLegacy::ID = 0;
1596
1597char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1598
1599 FunctionPass *llvm::createSIInsertWaitcntsPass() {
1600   return new SIInsertWaitcntsLegacy();
1601}
1602
1603static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1604 unsigned NewEnc) {
1605 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1606 assert(OpIdx >= 0);
1607
1608 MachineOperand &MO = MI.getOperand(OpIdx);
1609
1610 if (NewEnc == MO.getImm())
1611 return false;
1612
1613 MO.setImm(NewEnc);
1614 return true;
1615}
1616
1617 /// Determine if \p Opcode is a gfx12+ single-counter S_WAIT_*CNT instruction,
1618/// and if so, which counter it is waiting on.
1619static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1620 switch (Opcode) {
1621 case AMDGPU::S_WAIT_LOADCNT:
1622 return LOAD_CNT;
1623 case AMDGPU::S_WAIT_EXPCNT:
1624 return EXP_CNT;
1625 case AMDGPU::S_WAIT_STORECNT:
1626 return STORE_CNT;
1627 case AMDGPU::S_WAIT_SAMPLECNT:
1628 return SAMPLE_CNT;
1629 case AMDGPU::S_WAIT_BVHCNT:
1630 return BVH_CNT;
1631 case AMDGPU::S_WAIT_DSCNT:
1632 return DS_CNT;
1633 case AMDGPU::S_WAIT_KMCNT:
1634 return KM_CNT;
1635 case AMDGPU::S_WAIT_XCNT:
1636 return X_CNT;
1637 default:
1638 return {};
1639 }
1640}
1641
1642bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1643 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1644 if (Opcode == Waitcnt->getOpcode())
1645 return false;
1646
1647 Waitcnt->setDesc(TII->get(Opcode));
1648 return true;
1649}
1650
1651/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1652/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1653/// from \p Wait that were added by previous passes. Currently this pass
1654/// conservatively assumes that these preexisting waits are required for
1655/// correctness.
1656bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1657 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1658 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1659 assert(ST);
1660 assert(isNormalMode(MaxCounter));
1661
1662 bool Modified = false;
1663 MachineInstr *WaitcntInstr = nullptr;
1664 MachineInstr *WaitcntVsCntInstr = nullptr;
1665
1666 LLVM_DEBUG({
1667 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1668 if (It.isEnd())
1669 dbgs() << "end of block\n";
1670 else
1671 dbgs() << *It;
1672 });
1673
1674 for (auto &II :
1675 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1676 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1677 if (II.isMetaInstruction()) {
1678 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1679 continue;
1680 }
1681
1682 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1683 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1684
1685 // Update required wait count. If this is a soft waitcnt (= it was added
1686 // by an earlier pass), it may be entirely removed.
1687 if (Opcode == AMDGPU::S_WAITCNT) {
1688 unsigned IEnc = II.getOperand(0).getImm();
1689 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1690 if (TrySimplify)
1691 ScoreBrackets.simplifyWaitcnt(OldWait);
1692 Wait = Wait.combined(OldWait);
1693
1694 // Merge consecutive waitcnt of the same type by erasing multiples.
1695 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1696 II.eraseFromParent();
1697 Modified = true;
1698 } else
1699 WaitcntInstr = &II;
1700 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1701 assert(ST->hasVMemToLDSLoad());
1702 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1703 << "Before: " << Wait << '\n';);
1704 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
1705 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1706
1707 // It is possible (but unlikely) that this is the only wait instruction,
1708 // in which case, we exit this loop without a WaitcntInstr to consume
1709 // `Wait`. But that works because `Wait` was passed in by reference, and
1710 // the callee eventually calls createNewWaitcnt on it. We test this
1711       // possibility in an artificial MIR test since such a situation cannot be
1712 // recreated by running the memory legalizer.
1713 II.eraseFromParent();
1714 } else {
1715 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1716 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1717
1718 unsigned OldVSCnt =
1719 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1720 if (TrySimplify)
1721 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1722 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1723
1724 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1725 II.eraseFromParent();
1726 Modified = true;
1727 } else
1728 WaitcntVsCntInstr = &II;
1729 }
1730 }
1731
1732 if (WaitcntInstr) {
1733     Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1734                                          AMDGPU::encodeWaitcnt(IV, Wait));
1735     Modified |= promoteSoftWaitCnt(WaitcntInstr);
1736
1737 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1738 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1739 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1740 Wait.LoadCnt = ~0u;
1741 Wait.ExpCnt = ~0u;
1742 Wait.DsCnt = ~0u;
1743
1744 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1745 << "New Instr at block end: "
1746 << *WaitcntInstr << '\n'
1747 : dbgs() << "applied pre-existing waitcnt\n"
1748 << "Old Instr: " << *It
1749 << "New Instr: " << *WaitcntInstr << '\n');
1750 }
1751
1752 if (WaitcntVsCntInstr) {
1753 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1754 AMDGPU::OpName::simm16, Wait.StoreCnt);
1755 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1756
1757 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1758 Wait.StoreCnt = ~0u;
1759
1760 LLVM_DEBUG(It.isEnd()
1761 ? dbgs() << "applied pre-existing waitcnt\n"
1762 << "New Instr at block end: " << *WaitcntVsCntInstr
1763 << '\n'
1764 : dbgs() << "applied pre-existing waitcnt\n"
1765 << "Old Instr: " << *It
1766 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1767 }
1768
1769 return Modified;
1770}
1771
1772/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1773/// required counters in \p Wait
1774bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1775 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1776 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
1777 assert(ST);
1778 assert(isNormalMode(MaxCounter));
1779
1780 bool Modified = false;
1781 const DebugLoc &DL = Block.findDebugLoc(It);
1782
1783 // Helper to emit expanded waitcnt sequence for profiling.
1784 // Emits waitcnts from (Outstanding-1) down to Target.
1785 // The EmitWaitcnt callback emits a single waitcnt.
1786 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
1787 auto EmitWaitcnt) {
1788 do {
1789 EmitWaitcnt(--Outstanding);
1790 } while (Outstanding > Target);
1791 Modified = true;
1792 };
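  // For example (illustrative): with three loads outstanding and a target of
  // vmcnt(0), the expansion emits s_waitcnt vmcnt(2), vmcnt(1), vmcnt(0) so
  // that a profiler can attribute stall time to each outstanding operation.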
1793
1794 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1795 // single instruction while VScnt has its own instruction.
1796 if (Wait.hasWaitExceptStoreCnt()) {
1797 // If profiling expansion is enabled, emit an expanded sequence
1798 if (ExpandWaitcntProfiling) {
1799 // Check if any of the counters to be waited on are out-of-order.
1800 // If so, fall back to normal (non-expanded) behavior since expansion
1801 // would provide misleading profiling information.
1802 bool AnyOutOfOrder = false;
1803 for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
1804 unsigned &WaitCnt = getCounterRef(Wait, CT);
1805 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
1806 AnyOutOfOrder = true;
1807 break;
1808 }
1809 }
1810
1811 if (AnyOutOfOrder) {
1812 // Fall back to non-expanded wait
1813 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1814 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1815 Modified = true;
1816 } else {
1817 // All counters are in-order, safe to expand
1818 for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
1819 unsigned &WaitCnt = getCounterRef(Wait, CT);
1820 if (WaitCnt == ~0u)
1821 continue;
1822
1823 unsigned Outstanding = std::min(ScoreBrackets.getScoreUB(CT) -
1824 ScoreBrackets.getScoreLB(CT),
1825 getWaitCountMax(getLimits(), CT) - 1);
1826 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
1827 AMDGPU::Waitcnt W;
1828 getCounterRef(W, CT) = Count;
1829 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
1831 });
1832 }
1833 }
1834 } else {
1835 // Normal behavior: emit single combined waitcnt
1836 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1837 [[maybe_unused]] auto SWaitInst =
1838 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1839 Modified = true;
1840
1841 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1842 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1843 dbgs() << "New Instr: " << *SWaitInst << '\n');
1844 }
1845 }
1846
1847 if (Wait.hasWaitStoreCnt()) {
1848 assert(ST->hasVscnt());
1849
1850 if (ExpandWaitcntProfiling && Wait.StoreCnt != ~0u &&
1851 !ScoreBrackets.counterOutOfOrder(STORE_CNT)) {
1852 // Only expand if counter is not out-of-order
1853 unsigned Outstanding =
1854 std::min(ScoreBrackets.getScoreUB(STORE_CNT) -
1855 ScoreBrackets.getScoreLB(STORE_CNT),
1856 getWaitCountMax(getLimits(), STORE_CNT) - 1);
1857 EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
1858 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1859 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1860 .addImm(Count);
1861 });
1862 } else {
1863 [[maybe_unused]] auto SWaitInst =
1864 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1865 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1866 .addImm(Wait.StoreCnt);
1867 Modified = true;
1868
1869 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1870 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1871 dbgs() << "New Instr: " << *SWaitInst << '\n');
1872 }
1873 }
1874
1875 return Modified;
1876}
1877
1878AMDGPU::Waitcnt
1879WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1880 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1881}
1882
1883AMDGPU::Waitcnt
1884WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1885 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
1886 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1887 ~0u /* XCNT */, ExpertVal, ExpertVal);
1888}
1889
1890/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1891/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1892/// were added by previous passes. Currently this pass conservatively
1893/// assumes that these preexisting waits are required for correctness.
1894bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1895 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1896 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1897 assert(ST);
1898 assert(!isNormalMode(MaxCounter));
1899
1900 bool Modified = false;
1901 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1902 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1903 MachineInstr *WaitcntDepctrInstr = nullptr;
1904 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1905
1906 LLVM_DEBUG({
1907 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1908 if (It.isEnd())
1909 dbgs() << "end of block\n";
1910 else
1911 dbgs() << *It;
1912 });
1913
1914 // Accumulate waits that should not be simplified.
1915 AMDGPU::Waitcnt RequiredWait;
1916
1917 for (auto &II :
1918 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1919 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1920 if (II.isMetaInstruction()) {
1921 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1922 continue;
1923 }
1924
1925 MachineInstr **UpdatableInstr;
1926
1927 // Update required wait count. If this is a soft waitcnt (= it was added
1928 // by an earlier pass), it may be entirely removed.
1929
1930 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1931 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1932
1933 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1934 // attempt to do more than that either.
1935 if (Opcode == AMDGPU::S_WAITCNT)
1936 continue;
1937
1938 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1939 unsigned OldEnc =
1940 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1941 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1942 if (TrySimplify)
1943 Wait = Wait.combined(OldWait);
1944 else
1945 RequiredWait = RequiredWait.combined(OldWait);
1946 UpdatableInstr = &CombinedLoadDsCntInstr;
1947 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1948 unsigned OldEnc =
1949 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1950 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1951 if (TrySimplify)
1952 Wait = Wait.combined(OldWait);
1953 else
1954 RequiredWait = RequiredWait.combined(OldWait);
1955 UpdatableInstr = &CombinedStoreDsCntInstr;
1956 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1957 unsigned OldEnc =
1958 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1959 AMDGPU::Waitcnt OldWait;
1960 OldWait.VaVdst = AMDGPU::DepCtr::decodeFieldVaVdst(OldEnc);
1961 OldWait.VmVsrc = AMDGPU::DepCtr::decodeFieldVmVsrc(OldEnc);
1962 if (TrySimplify)
1963 ScoreBrackets.simplifyWaitcnt(OldWait);
1964 Wait = Wait.combined(OldWait);
1965 UpdatableInstr = &WaitcntDepctrInstr;
1966 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1967 // Architectures higher than GFX10 do not have direct loads to
1968 // LDS, so no work required here yet.
1969 II.eraseFromParent();
1970 continue;
1971 } else {
1972 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1973 assert(CT.has_value());
1974 unsigned OldCnt =
1975 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1976 if (TrySimplify)
1977 addWait(Wait, CT.value(), OldCnt);
1978 else
1979 addWait(RequiredWait, CT.value(), OldCnt);
1980 UpdatableInstr = &WaitInstrs[CT.value()];
1981 }
1982
1983 // Merge consecutive waitcnt of the same type by erasing multiples.
1984 if (!*UpdatableInstr) {
1985 *UpdatableInstr = &II;
1986 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1987 // S_WAITCNT_DEPCTR requires special care. Don't remove a
1988 // duplicate if it is waiting on things other than VA_VDST or
1989 // VM_VSRC. If that is the case, just make sure the VA_VDST and
1990 // VM_VSRC subfields of the operand are set to the "no wait"
1991 // values.
1992
1993 unsigned Enc = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1994 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
1995 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
1996
1997 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST)) {
1998 Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
1999 Modified |= promoteSoftWaitCnt(&II);
2000 } else {
2001 II.eraseFromParent();
2002 Modified = true;
2003 }
2004 } else {
2005 II.eraseFromParent();
2006 Modified = true;
2007 }
2008 }
2009
2010 ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
2011 Wait = Wait.combined(RequiredWait);
2012
2013 if (CombinedLoadDsCntInstr) {
2014 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
2015 // to be waited for. Otherwise, let the instruction be deleted so
2016 // the appropriate single counter wait instruction can be inserted
2017 // instead, when new S_WAIT_*CNT instructions are inserted by
2018 // createNewWaitcnt(). As a side effect, resetting the wait counts will
2019 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
2020 // the loop below that deals with single counter instructions.
2021 //
2022 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
2023 // instructions that have decremented LOAD_CNT or DS_CNT on completion
2024 // will have needed to wait for their register sources to be available
2025 // first.
2026 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
2027 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2028 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
2029 AMDGPU::OpName::simm16, NewEnc);
2030 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
2031 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
2032 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
2033 Wait.LoadCnt = ~0u;
2034 Wait.DsCnt = ~0u;
2035
2036 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2037 << "New Instr at block end: "
2038 << *CombinedLoadDsCntInstr << '\n'
2039 : dbgs() << "applied pre-existing waitcnt\n"
2040 << "Old Instr: " << *It << "New Instr: "
2041 << *CombinedLoadDsCntInstr << '\n');
2042 } else {
2043 CombinedLoadDsCntInstr->eraseFromParent();
2044 Modified = true;
2045 }
2046 }
2047
2048 if (CombinedStoreDsCntInstr) {
2049 // Similarly for S_WAIT_STORECNT_DSCNT.
2050 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
2051 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2052 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
2053 AMDGPU::OpName::simm16, NewEnc);
2054 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2055 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
2056 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
2057 Wait.StoreCnt = ~0u;
2058 Wait.DsCnt = ~0u;
2059
2060 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2061 << "New Instr at block end: "
2062 << *CombinedStoreDsCntInstr << '\n'
2063 : dbgs() << "applied pre-existing waitcnt\n"
2064 << "Old Instr: " << *It << "New Instr: "
2065 << *CombinedStoreDsCntInstr << '\n');
2066 } else {
2067 CombinedStoreDsCntInstr->eraseFromParent();
2068 Modified = true;
2069 }
2070 }
2071
2072 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
2073 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
2074 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
2075 // instructions so that createNewWaitcnt() will create new combined
2076 // instructions to replace them.
2077
2078 if (Wait.DsCnt != ~0u) {
2079 // This is a vector of addresses in WaitInstrs pointing to instructions
2080     // that should be removed if they are present.
2081     SmallVector<MachineInstr **, 2> WaitsToErase;
2082
2083 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
2084 // both) need to be waited for, ensure that there are no existing
2085 // individual wait count instructions for these.
2086
2087 if (Wait.LoadCnt != ~0u) {
2088 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
2089 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
2090 } else if (Wait.StoreCnt != ~0u) {
2091 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
2092 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
2093 }
2094
2095 for (MachineInstr **WI : WaitsToErase) {
2096 if (!*WI)
2097 continue;
2098
2099 (*WI)->eraseFromParent();
2100 *WI = nullptr;
2101 Modified = true;
2102 }
2103 }
2104
2105 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2106 if (!WaitInstrs[CT])
2107 continue;
2108
2109 unsigned NewCnt = getWait(Wait, CT);
2110 if (NewCnt != ~0u) {
2111 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
2112 AMDGPU::OpName::simm16, NewCnt);
2113 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2114
2115 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2116 setNoWait(Wait, CT);
2117
2118 LLVM_DEBUG(It.isEnd()
2119 ? dbgs() << "applied pre-existing waitcnt\n"
2120 << "New Instr at block end: " << *WaitInstrs[CT]
2121 << '\n'
2122 : dbgs() << "applied pre-existing waitcnt\n"
2123 << "Old Instr: " << *It
2124 << "New Instr: " << *WaitInstrs[CT] << '\n');
2125 } else {
2126 WaitInstrs[CT]->eraseFromParent();
2127 Modified = true;
2128 }
2129 }
2130
2131 if (WaitcntDepctrInstr) {
2132 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2133 // subfields with the new required values.
2134 unsigned Enc =
2135 TII->getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2136 ->getImm();
2137 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, Wait.VmVsrc);
2138 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
2139
2140 ScoreBrackets.applyWaitcnt(VA_VDST, Wait.VaVdst);
2141 ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.VmVsrc);
2142 Wait.VaVdst = ~0u;
2143 Wait.VmVsrc = ~0u;
2144
2145 // If that new encoded Depctr immediate would actually still wait
2146 // for anything, update the instruction's operand. Otherwise it can
2147 // just be deleted.
2148 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST)) {
2149 Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
2150 AMDGPU::OpName::simm16, Enc);
2151 LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2152 << "New Instr at block end: "
2153 << *WaitcntDepctrInstr << '\n'
2154 : dbgs() << "applyPreexistingWaitcnt\n"
2155 << "Old Instr: " << *It << "New Instr: "
2156 << *WaitcntDepctrInstr << '\n');
2157 } else {
2158 WaitcntDepctrInstr->eraseFromParent();
2159 Modified = true;
2160 }
2161 }
2162
2163 return Modified;
2164}
2165
2166/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
2167bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2168 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2169 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
2170 assert(ST);
2171 assert(!isNormalMode(MaxCounter));
2172
2173 bool Modified = false;
2174 const DebugLoc &DL = Block.findDebugLoc(It);
2175
2176 // Helper to emit expanded waitcnt sequence for profiling.
2177 auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
2178 auto EmitWaitcnt) {
2179 for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I)
2180 EmitWaitcnt(I);
2181 EmitWaitcnt(Target);
2182 Modified = true;
2183 };
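  // Note the I != ~0u guard: when Outstanding is 0, Outstanding - 1 wraps
  // around to ~0u, and the guard keeps the loop from spinning; the final
  // EmitWaitcnt(Target) is still emitted unconditionally.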
2184
2185 // For GFX12+, we use separate wait instructions, which makes expansion
2186 // simpler
2187 if (ExpandWaitcntProfiling) {
2188 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2189 unsigned Count = getWait(Wait, CT);
2190 if (Count == ~0u)
2191 continue;
2192
2193 // Skip expansion for out-of-order counters - emit normal wait instead
2194 if (ScoreBrackets.counterOutOfOrder(CT)) {
2195 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
2196 .addImm(Count);
2197 Modified = true;
2198 continue;
2199 }
2200
2201 unsigned Outstanding =
2202 std::min(ScoreBrackets.getScoreUB(CT) - ScoreBrackets.getScoreLB(CT),
2203 getWaitCountMax(getLimits(), CT) - 1);
2204 EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
2205 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
2206 .addImm(Val);
2207 });
2208 }
2209 return Modified;
2210 }
2211
2212 // Normal behavior (no expansion)
2213 // Check for opportunities to use combined wait instructions.
2214 if (Wait.DsCnt != ~0u) {
2215 MachineInstr *SWaitInst = nullptr;
2216
2217 if (Wait.LoadCnt != ~0u) {
2218 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2219
2220 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2221 .addImm(Enc);
2222
2223 Wait.LoadCnt = ~0u;
2224 Wait.DsCnt = ~0u;
2225 } else if (Wait.StoreCnt != ~0u) {
2226 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2227
2228 SWaitInst =
2229 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
2230 .addImm(Enc);
2231
2232 Wait.StoreCnt = ~0u;
2233 Wait.DsCnt = ~0u;
2234 }
2235
2236 if (SWaitInst) {
2237 Modified = true;
2238
2239 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2240 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2241 dbgs() << "New Instr: " << *SWaitInst << '\n');
2242 }
2243 }
2244
2245 // Generate an instruction for any remaining counter that needs
2246 // waiting for.
2247
2248 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2249 unsigned Count = getWait(Wait, CT);
2250 if (Count == ~0u)
2251 continue;
2252
2253 [[maybe_unused]] auto SWaitInst =
2254 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
2255 .addImm(Count);
2256
2257 Modified = true;
2258
2259 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2260 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2261 dbgs() << "New Instr: " << *SWaitInst << '\n');
2262 }
2263
2264 if (Wait.hasWaitDepctr()) {
2265 assert(IsExpertMode);
2266 unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.VmVsrc, *ST);
2267 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
2268
2269 [[maybe_unused]] auto SWaitInst =
2270 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
2271
2272 Modified = true;
2273
2274 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2275 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2276 dbgs() << "New Instr: " << *SWaitInst << '\n');
2277 }
2278
2279 return Modified;
2280}
2281
2282/// Generate s_waitcnt instruction to be placed before cur_Inst.
2283/// Instructions of a given type are returned in order,
2284/// but instructions of different types can complete out of order.
2285/// We rely on this in-order completion
2286/// and simply assign a score to the memory access instructions.
2287/// We keep track of the active "score bracket" to determine
2288/// if an access of a memory read requires an s_waitcnt
2289/// and if so what the value of each counter is.
2290/// The "score bracket" is bound by the lower bound and upper bound
2291/// scores (*_score_LB and *_score_ub respectively).
2292/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2293/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2294/// (GFX12+ only, where DS_CNT is a separate counter).
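/// For example (illustrative): after three VMEM loads the LOAD_CNT bracket
/// spans LB=0..UB=3 and the loads carry scores 1, 2 and 3. A use of the
/// register written by the first load only needs the oldest load to have
/// completed, so a wait of loadcnt(UB - score) = loadcnt(2), i.e. "at most
/// two loads still outstanding", is sufficient.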
2295bool SIInsertWaitcnts::generateWaitcntInstBefore(
2296 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2297 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2298 setForceEmitWaitcnt();
2299
2300 assert(!MI.isMetaInstruction());
2301
2302 AMDGPU::Waitcnt Wait;
2303 const unsigned Opc = MI.getOpcode();
2304
2305 // FIXME: This should have already been handled by the memory legalizer.
2306 // Removing this currently doesn't affect any lit tests, but we need to
2307 // verify that nothing was relying on this. The number of buffer invalidates
2308 // being handled here should not be expanded.
2309 if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
2310 Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
2311 Opc == AMDGPU::BUFFER_GL1_INV) {
2312 Wait.LoadCnt = 0;
2313 }
2314
2315 // All waits must be resolved at call return.
2316 // NOTE: this could be improved with knowledge of all call sites or
2317 // with knowledge of the called routines.
2318 if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
2319 Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
2320 Opc == AMDGPU::S_SETPC_B64_return) {
2321 ReturnInsts.insert(&MI);
2322 AMDGPU::Waitcnt AllZeroWait =
2323 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2324 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2325 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2326 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2327 // no need to wait for it at function boundaries.
2328 if (ST->hasExtendedWaitCounts() &&
2329 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2330 AllZeroWait.LoadCnt = ~0u;
2331 Wait = Wait.combined(AllZeroWait);
2332 }
2333 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2334 // Technically the hardware will do this on its own if we don't, but that
2335 // might cost extra cycles compared to doing it explicitly.
2336 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2337 // have to wait for outstanding VMEM stores. In this case it can be useful to
2338 // send a message to explicitly release all VGPRs before the stores have
2339 // completed, but it is only safe to do this if there are no outstanding
2340 // scratch stores.
2341 else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
2342 EndPgmInsts[&MI] = ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
2343 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
2344 }
2345 // Resolve vm waits before gs-done.
2346 else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
2347 ST->hasLegacyGeometry() &&
2348 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2349              AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
2350     Wait.LoadCnt = 0;
2351 }
2352
2353 // Export & GDS instructions do not read the EXEC mask until after the export
2354 // is granted (which can occur well after the instruction is issued).
2355 // The shader program must flush all EXP operations on the export-count
2356 // before overwriting the EXEC mask.
2357 else {
2358 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
2359 // Export and GDS are tracked individually, either may trigger a waitcnt
2360 // for EXEC.
2361 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2362 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2363 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2364 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2365 Wait.ExpCnt = 0;
2366 }
2367 }
2368
2369 // Wait for any pending GDS instruction to complete before any
2370 // "Always GDS" instruction.
2371 if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2372 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
2373
2374 if (MI.isCall()) {
2375 // The function is going to insert a wait on everything in its prolog.
2376 // This still needs to be careful if the call target is a load (e.g. a GOT
2377 // load). We also need to check WAW dependency with saved PC.
2378 CallInsts.insert(&MI);
2379 Wait = AMDGPU::Waitcnt();
2380
2381 const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI);
2382 if (CallAddrOp.isReg()) {
2383 ScoreBrackets.determineWaitForPhysReg(
2384 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);
2385
2386 if (const auto *RtnAddrOp =
2387 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
2388 ScoreBrackets.determineWaitForPhysReg(
2389 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
2390 }
2391 }
2392 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2393 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2394 } else {
2395 // FIXME: Should not be relying on memoperands.
2396 // Look at the source operands of every instruction to see if
2397 // any of them results from a previous memory operation that affects
2398 // its current usage. If so, an s_waitcnt instruction needs to be
2399 // emitted.
2400 // If the source operand was defined by a load, add the s_waitcnt
2401 // instruction.
2402 //
2403 // Two cases are handled for destination operands:
2404 // 1) If the destination operand was defined by a load, add the s_waitcnt
2405 // instruction to guarantee the right WAW order.
2406       // 2) If a destination operand was used by a recent export/store
2407       //    instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
2408
2409 for (const MachineMemOperand *Memop : MI.memoperands()) {
2410 const Value *Ptr = Memop->getValue();
2411 if (Memop->isStore()) {
2412 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2413 addWait(Wait, SmemAccessCounter, 0);
2414 if (PDT->dominates(MI.getParent(), It->second))
2415 SLoadAddresses.erase(It);
2416 }
2417 }
2418 unsigned AS = Memop->getAddrSpace();
2419         if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
2420           continue;
2421 // No need to wait before load from VMEM to LDS.
2422 if (TII->mayWriteLDSThroughDMA(MI))
2423 continue;
2424
2425 // LOAD_CNT is only relevant to vgpr or LDS.
2426 unsigned TID = LDSDMA_BEGIN;
2427 if (Ptr && Memop->getAAInfo()) {
2428 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2429 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2430 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2431 if ((I + 1) >= NUM_LDSDMA) {
2432 // We didn't have enough slot to track this LDS DMA store, it
2433 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2434 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2435 break;
2436 }
2437
2438 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
2439 }
2440 }
2441 } else {
2442 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2443 }
2444 if (Memop->isStore()) {
2445 ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
2446 }
2447 }
2448
2449 // Loop over use and def operands.
2450 for (const MachineOperand &Op : MI.operands()) {
2451 if (!Op.isReg())
2452 continue;
2453
2454 // If the instruction does not read tied source, skip the operand.
2455 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
2456 continue;
2457
2458 MCPhysReg Reg = Op.getReg().asMCReg();
2459
2460 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
2461 if (IsVGPR) {
2462 // Implicit VGPR defs and uses are never a part of the memory
2463 // instructions description and usually present to account for
2464 // super-register liveness.
2465 // TODO: Most of the other instructions also have implicit uses
2466 // for the liveness accounting only.
2467 if (Op.isImplicit() && MI.mayLoadOrStore())
2468 continue;
2469
2470 ScoreBrackets.determineWaitForPhysReg(VA_VDST, Reg, Wait);
2471 if (Op.isDef())
2472 ScoreBrackets.determineWaitForPhysReg(VM_VSRC, Reg, Wait);
2473 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2474 // previous write and this write are the same type of VMEM
2475 // instruction, in which case they are (in some architectures)
2476 // guaranteed to write their results in order anyway.
2477 // Additionally check instructions where Point Sample Acceleration
2478 // might be applied.
2479 if (Op.isUse() || !updateVMCntOnly(MI) ||
2480 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2481 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2482 !ST->hasVmemWriteVgprInOrder()) {
2483 ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
2484 ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
2485 ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
2486 ScoreBrackets.clearVgprVmemTypes(Reg);
2487 }
2488
2489 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2490 ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
2491 }
2492 ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
2493 } else if (Op.getReg() == AMDGPU::SCC) {
2494 ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
2495 } else {
2496 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
2497 }
2498
2499 if (ST->hasWaitXcnt() && Op.isDef())
2500 ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
2501 }
2502 }
2503 }
2504
2505 // Ensure safety against exceptions from outstanding memory operations while
2506 // waiting for a barrier:
2507 //
2508 // * Some subtargets safely handle backing off the barrier in hardware
2509 // when an exception occurs.
2510 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2511 // there can be no outstanding memory operations during the wait.
2512 // * Subtargets with split barriers don't need to back off the barrier; it
2513 // is up to the trap handler to preserve the user barrier state correctly.
2514 //
2515 // In all other cases, ensure safety by ensuring that there are no outstanding
2516 // memory operations.
2517 if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
2518 !ST->hasBackOffBarrier()) {
2519 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2520 }
2521
2522 // TODO: Remove this work-around, enable the assert for Bug 457939
2523 // after fixing the scheduler. Also, the Shader Compiler code is
2524 // independent of target.
2525 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
2526 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2527 Wait.DsCnt = 0;
2528 }
2529
2530 // Verify that the wait is actually needed.
2531 ScoreBrackets.simplifyWaitcnt(Wait);
2532
2533 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2534 // waits on VA_VDST if the instruction it would precede is not a VALU
2535 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2536 // expert scheduling mode.
2537 if (TII->isVALU(MI))
2538 Wait.VaVdst = ~0u;
2539
2540 // Since the translation for VMEM addresses occur in-order, we can apply the
2541 // XCnt if the current instruction is of VMEM type and has a memory
2542 // dependency with another VMEM instruction in flight.
2543 if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
2544 ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
2545 Wait.XCnt = ~0u;
2546 }
2547
2548   // When forcing emission, skip terminators: inserting a waitcnt between the
2549   // terminators of the MBB would break its terminator sequence.
2550 if (ForceEmitZeroFlag && !MI.isTerminator())
2551 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2552
2553 // If we force waitcnt then update Wait accordingly.
2554 for (InstCounterType T : inst_counter_types()) {
2555 if (!ForceEmitWaitcnt[T])
2556 continue;
2557 getCounterRef(Wait, T) = 0;
2558 }
2559
2560 if (FlushFlags.FlushVmCnt) {
2561 for (InstCounterType T : {LOAD_CNT, SAMPLE_CNT, BVH_CNT})
2562 getCounterRef(Wait, T) = 0;
2563 }
2564
2565 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
2566 Wait.DsCnt = 0;
2567
2568 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
2569 Wait.LoadCnt = 0;
2570
2571 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2572 OldWaitcntInstr);
2573}
2574
2575bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2576                                        MachineBasicBlock::instr_iterator It,
2577                                        MachineBasicBlock &Block,
2578 WaitcntBrackets &ScoreBrackets,
2579 MachineInstr *OldWaitcntInstr) {
2580 bool Modified = false;
2581
2582 if (OldWaitcntInstr)
2583 // Try to merge the required wait with preexisting waitcnt instructions.
2584 // Also erase redundant waitcnt.
2585 Modified =
2586 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2587
2588 // ExpCnt can be merged into VINTERP.
2589 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
2590       SIInstrInfo::isVINTERP(*It)) {
2591     MachineOperand *WaitExp =
2592 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2593 if (Wait.ExpCnt < WaitExp->getImm()) {
2594 WaitExp->setImm(Wait.ExpCnt);
2595 Modified = true;
2596 }
2597 // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
2598 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
2599 Wait.ExpCnt = ~0u;
2600
2601 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2602 << "Update Instr: " << *It);
2603 }
2604
2605 if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
2606 Modified = true;
2607
2608 // Any counts that could have been applied to any existing waitcnt
2609 // instructions will have been done so, now deal with any remaining.
2610 ScoreBrackets.applyWaitcnt(Wait);
2611
2612 return Modified;
2613}
2614
2615std::optional<WaitEventType>
2616SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
2617 if (TII->isVALU(Inst)) {
2618 // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
2619 // out-of-order with respect to each other, so each of these classes
2620 // has its own event.
2621
2622 if (TII->isXDL(Inst))
2623 return VGPR_XDL_WRITE;
2624
2625 if (TII->isTRANS(Inst))
2626 return VGPR_TRANS_WRITE;
2627
2629 return VGPR_DPMACC_WRITE;
2630
2631 return VGPR_CSMACC_WRITE;
2632 }
2633
2634 // FLAT and LDS instructions may read their VGPR sources out-of-order
2635 // with respect to each other and all other VMEM instructions, so
2636 // each of these also has a separate event.
2637
2638 if (TII->isFLAT(Inst))
2639 return VGPR_FLAT_READ;
2640
2641 if (TII->isDS(Inst))
2642 return VGPR_LDS_READ;
2643
2644 if (TII->isVMEM(Inst) || TII->isVIMAGE(Inst) || TII->isVSAMPLE(Inst))
2645 return VGPR_VMEM_READ;
2646
2647 // Otherwise, no hazard.
2648
2649 return {};
2650}
2651
2652bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2653 return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
2654 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2655}
2656
2657// Return true if the next instruction is S_ENDPGM, following fallthrough
2658// blocks if necessary.
2659bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2660 MachineBasicBlock *Block) const {
2661 auto BlockEnd = Block->getParent()->end();
2662 auto BlockIter = Block->getIterator();
2663
2664 while (true) {
2665 if (It.isEnd()) {
2666 if (++BlockIter != BlockEnd) {
2667 It = BlockIter->instr_begin();
2668 continue;
2669 }
2670
2671 return false;
2672 }
2673
2674 if (!It->isMetaInstruction())
2675 break;
2676
2677 It++;
2678 }
2679
2680 assert(!It.isEnd());
2681
2682 return It->getOpcode() == AMDGPU::S_ENDPGM;
2683}
2684
2685// Add a wait after an instruction if architecture requirements mandate one.
2686bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2687 MachineBasicBlock &Block,
2688 WaitcntBrackets &ScoreBrackets) {
2689 AMDGPU::Waitcnt Wait;
2690 bool NeedsEndPGMCheck = false;
2691
2692 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2693     Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2694                                   !SIInstrInfo::isAtomicRet(Inst));
2695
2696 if (TII->isAlwaysGDS(Inst.getOpcode())) {
2697 Wait.DsCnt = 0;
2698 NeedsEndPGMCheck = true;
2699 }
2700
2701 ScoreBrackets.simplifyWaitcnt(Wait);
2702
2703 auto SuccessorIt = std::next(Inst.getIterator());
2704 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2705 /*OldWaitcntInstr=*/nullptr);
2706
2707 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2708 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
2709 .addImm(0);
2710 }
2711
2712 return Result;
2713}
2714
2715void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2716 WaitcntBrackets *ScoreBrackets) {
2717 // Now look at the instruction opcode. If it is a memory access
2718 // instruction, update the upper-bound of the appropriate counter's
2719 // bracket and the destination operand scores.
2720 // For architectures with X_CNT, mark the source address operands
2721 // with the appropriate counter values.
2722 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2723
2724 bool IsVMEMAccess = false;
2725 bool IsSMEMAccess = false;
2726
2727 if (IsExpertMode) {
2728 if (const auto ET = getExpertSchedulingEventType(Inst))
2729 ScoreBrackets->updateByEvent(*ET, Inst);
2730 }
2731
2732 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2733 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2734 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2735 ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
2736 ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
2737 ScoreBrackets->setPendingGDS();
2738 } else {
2739 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2740 }
2741 } else if (TII->isFLAT(Inst)) {
2743 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2744 return;
2745 }
2746
2747 assert(Inst.mayLoadOrStore());
2748
2749 int FlatASCount = 0;
2750
2751 if (TII->mayAccessVMEMThroughFlat(Inst)) {
2752 ++FlatASCount;
2753 IsVMEMAccess = true;
2754 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2755 }
2756
2757 if (TII->mayAccessLDSThroughFlat(Inst)) {
2758 ++FlatASCount;
2759 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2760 }
2761
2762     // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2763     // pointers. They have two operands, one accessing global memory and one
2764     // accessing LDS, which makes them appear at this point to use a flat
2765     // pointer. Filter them out; for the rest, record a dependency on flat
2766     // pointers so that both the VM and LGKM counters are flushed.
2767 if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1)
2768 ScoreBrackets->setPendingFlat();
2769 } else if (SIInstrInfo::isVMEM(Inst) &&
2770              !AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2771     IsVMEMAccess = true;
2772 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2773
2774 if (ST->vmemWriteNeedsExpWaitcnt() &&
2775 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2776 ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
2777 }
2778 } else if (TII->isSMRD(Inst)) {
2779 IsSMEMAccess = true;
2780 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2781 } else if (Inst.isCall()) {
2782 // Act as a wait on everything
2783 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2784 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2785 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2786 ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
2787 } else if (TII->isVINTERP(Inst)) {
2788 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2789 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2790 } else if (SIInstrInfo::isEXP(Inst)) {
2791 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2792     if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2793       ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
2794 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2795 ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
2796 else
2797 ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
2798 } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
2799 ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
2800 } else {
2801 switch (Inst.getOpcode()) {
2802 case AMDGPU::S_SENDMSG:
2803 case AMDGPU::S_SENDMSG_RTN_B32:
2804 case AMDGPU::S_SENDMSG_RTN_B64:
2805 case AMDGPU::S_SENDMSGHALT:
2806 ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
2807 break;
2808 case AMDGPU::S_MEMTIME:
2809 case AMDGPU::S_MEMREALTIME:
2810 case AMDGPU::S_GET_BARRIER_STATE_M0:
2811 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2812 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2813 break;
2814 }
2815 }
2816
2817 if (!ST->hasWaitXcnt())
2818 return;
2819
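  // On targets with X_CNT, every VMEM or SMEM access is additionally recorded
  // under its group event (VMEM_GROUP / SMEM_GROUP); X_CNT waits are tracked
  // against these group events.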
2820 if (IsVMEMAccess)
2821 ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);
2822
2823 if (IsSMEMAccess)
2824 ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
2825}
2826
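// Merge one score of this bracket with the matching score from another
// bracket. Each side is rebased onto the merged timeline: a score at or below
// its old lower bound represents no pending work and maps to 0; anything else
// is shifted so that both sides share the new upper bound. Returns true if
// the other side's score ends up strictly larger, i.e. the merge learned
// something new.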
2827bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2828 unsigned OtherScore) {
2829 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2830 unsigned OtherShifted =
2831 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2832 Score = std::max(MyShifted, OtherShifted);
2833 return OtherShifted > MyShifted;
2834}
2835
2836 /// Merge the pending events and associated score brackets of \p Other into
2837 /// this bracket's status.
2838///
2839/// Returns whether the merge resulted in a change that requires tighter waits
2840/// (i.e. the merged brackets strictly dominate the original brackets).
2841bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2842 bool StrictDom = false;
2843
2844 // Check if "other" has keys we don't have, and create default entries for
2845 // those. If they remain empty after merging, we will clean it up after.
2846 for (auto K : Other.VMem.keys())
2847 VMem.try_emplace(K);
2848 for (auto K : Other.SGPRs.keys())
2849 SGPRs.try_emplace(K);
2850
2851 for (auto T : inst_counter_types(Context->MaxCounter)) {
2852 // Merge event flags for this counter
2853 const WaitEventSet &EventsForT = Context->getWaitEventMask()[T];
2854 const WaitEventSet OldEvents = PendingEvents & EventsForT;
2855 const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
2856 if (!OldEvents.contains(OtherEvents))
2857 StrictDom = true;
2858 PendingEvents |= OtherEvents;
2859
2860 // Merge scores for this counter
2861 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2862 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2863 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2864 if (NewUB < ScoreLBs[T])
2865 report_fatal_error("waitcnt score overflow");
2866
2867 MergeInfo M;
2868 M.OldLB = ScoreLBs[T];
2869 M.OtherLB = Other.ScoreLBs[T];
2870 M.MyShift = NewUB - ScoreUBs[T];
2871 M.OtherShift = NewUB - Other.ScoreUBs[T];
2872
2873 ScoreUBs[T] = NewUB;
2874
2875 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2876
2877 if (T == DS_CNT)
2878 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2879
2880 if (T == KM_CNT) {
2881 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
2882 if (Other.hasPendingEvent(SCC_WRITE)) {
2883 if (!OldEvents.contains(SCC_WRITE)) {
2884 PendingSCCWrite = Other.PendingSCCWrite;
2885 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
2886 PendingSCCWrite = nullptr;
2887 }
2888 }
2889 }
2890
2891 for (auto &[RegID, Info] : VMem)
2892 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
2893
2894 if (isSmemCounter(T)) {
2895 unsigned Idx = getSgprScoresIdx(T);
2896 for (auto &[RegID, Info] : SGPRs) {
2897 auto It = Other.SGPRs.find(RegID);
2898 unsigned OtherScore =
2899 (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
2900 StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore);
2901 }
2902 }
2903 }
2904
2905 for (auto &[TID, Info] : VMem) {
2906 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
2907 unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
2908 StrictDom |= NewVmemTypes != Info.VMEMTypes;
2909 Info.VMEMTypes = NewVmemTypes;
2910 }
2911 }
2912
2913 purgeEmptyTrackingData();
2914 return StrictDom;
2915}
2916
2917static bool isWaitInstr(MachineInstr &Inst) {
2918 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2919 return Opcode == AMDGPU::S_WAITCNT ||
2920 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2921 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2922 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2923 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2924 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2925 counterTypeForInstr(Opcode).has_value();
2926}
2927
2928void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
2929                                          MachineBasicBlock::iterator I,
2930                                          bool ExpertMode) const {
2931 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
2933 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32))
2934 .addImm(ExpertMode ? 2 : 0)
2935 .addImm(EncodedReg);
2936}
2937
2938// Track back-to-back atomic RMW instructions, referred to as a block.
2939//
2940// Determines whether \p MI starts a new atomic RMW block, is inside
2941// an existing block, or is outside of a block. A block is broken when a
2942// CU-scoped memory op or an atomic store is encountered. ALU ops
2943// and non-memory instructions don't break a block. The function returns
2944// the new state after processing the current instruction based on
2945// \p PrevState, the previously captured state.
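// For example (illustrative), for the sequence
//   atomic RMW; atomic RMW; ds_read; atomic RMW; global load
// the states after each instruction are NewBlock, InsideBlock, InsideBlock
// (LDS ops don't break the block), InsideBlock, NotInBlock (a plain load or
// store resets the block).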
2946AtomicRMWState
2947SIInsertWaitcnts::getAtomicRMWState(MachineInstr &MI,
2948 AtomicRMWState PrevState) const {
2949 if (isAtomicRMW(MI)) {
2950 // Transition from NotInBlock -> NewBlock -> InsideBlock.
2951 if (PrevState == AtomicRMWState::NotInBlock)
2952 return AtomicRMWState::NewBlock;
2953 if (PrevState == AtomicRMWState::NewBlock)
2954 return AtomicRMWState::InsideBlock;
2955
2956 return PrevState;
2957 }
2958
2959 // LDS memory operations don't break the block.
2960 if (TII->isDS(MI) || (TII->isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI)))
2961 return PrevState;
2962
2963   // Reset the atomic RMW block state when other VMEM or SMEM operations are
2964   // found.
2964 if (MI.mayLoad() ^ MI.mayStore())
2965 return AtomicRMWState::NotInBlock;
2966
2967 // Return the previous state otherwise.
2968 return PrevState;
2969}
2970
2971// Generate s_waitcnt instructions where needed.
2972bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2973 MachineBasicBlock &Block,
2974 WaitcntBrackets &ScoreBrackets) {
2975 bool Modified = false;
2976
2977 LLVM_DEBUG({
2978 dbgs() << "*** Begin Block: ";
2979 Block.printName(dbgs());
2980 ScoreBrackets.dump();
2981 });
2982
2983 // Track the correctness of vccz through this basic block. There are two
2984 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2985 // ST->partialVCCWritesUpdateVCCZ().
2986 bool VCCZCorrect = true;
2987 if (ST->hasReadVCCZBug()) {
2988 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2989 // to vcc and then issued an smem load.
2990 VCCZCorrect = false;
2991 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2992 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2993 // to vcc_lo or vcc_hi.
2994 VCCZCorrect = false;
2995 }
2996
2997 // Walk over the instructions.
2998 MachineInstr *OldWaitcntInstr = nullptr;
2999 AtomicRMWState RMWState = AtomicRMWState::NotInBlock;
3000
3001 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
3002 E = Block.instr_end();
3003 Iter != E;) {
3004 MachineInstr &Inst = *Iter;
3005 if (Inst.isMetaInstruction()) {
3006 ++Iter;
3007 continue;
3008 }
3009 // Get the atomic RMW block state for current instruction.
3010 RMWState = getAtomicRMWState(Inst, RMWState);
3011
3012 // Track pre-existing waitcnts that were added in earlier iterations or by
3013 // the memory legalizer.
3014 if (isWaitInstr(Inst) ||
3015 (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3016 ++Iter;
3017 bool IsSoftXcnt = isSoftXcnt(Inst);
3018 // The Memory Legalizer conservatively inserts a soft xcnt before each
3019 // atomic RMW operation. However, for sequences of back-to-back atomic
3020 // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
3021 // the redundant soft xcnts when we're inside an atomic RMW block.
3022 if (Iter != E && IsSoftXcnt) {
3023 // Check if the next instruction can potentially change the atomic RMW
3024 // state.
3025 RMWState = getAtomicRMWState(*Iter, RMWState);
3026 }
3027
3028 if (IsSoftXcnt && RMWState == AtomicRMWState::InsideBlock) {
3029 // Delete this soft xcnt.
3030 Inst.eraseFromParent();
3031 Modified = true;
3032 } else if (!OldWaitcntInstr) {
3033 OldWaitcntInstr = &Inst;
3034 }
3035 continue;
3036 }
3037
3038 PreheaderFlushFlags FlushFlags;
3039 if (Block.getFirstTerminator() == Inst)
3040 FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3041
3042 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
3043 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3044 FlushFlags);
3045 OldWaitcntInstr = nullptr;
3046
3047 // Restore vccz if it's not known to be correct already.
3048 bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst);
3049
3050 // Don't examine operands unless we need to track vccz correctness.
3051 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
3052 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
3053 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
3054 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
3055 if (!ST->partialVCCWritesUpdateVCCZ())
3056 VCCZCorrect = false;
3057 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
3058 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
3059 // vccz bit, so when we detect that an instruction may read from a
3060 // corrupt vccz bit, we need to:
3061 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
3062 // operations to complete.
3063 // 2. Restore the correct value of vccz by writing the current value
3064 // of vcc back to vcc.
3065 if (ST->hasReadVCCZBug() &&
3066 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
3067 // Writes to vcc while there's an outstanding smem read may get
3068 // clobbered as soon as any read completes.
3069 VCCZCorrect = false;
3070 } else {
3071 // Writes to vcc will fix any incorrect value in vccz.
3072 VCCZCorrect = true;
3073 }
3074 }
3075 }
3076
3077 if (TII->isSMRD(Inst)) {
3078 for (const MachineMemOperand *Memop : Inst.memoperands()) {
3079 // No need to handle invariant loads when avoiding WAR conflicts, as
3080 // there cannot be a vector store to the same memory location.
3081 if (!Memop->isInvariant()) {
3082 const Value *Ptr = Memop->getValue();
3083 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
3084 }
3085 }
3086 if (ST->hasReadVCCZBug()) {
3087 // This smem read could complete and clobber vccz at any time.
3088 VCCZCorrect = false;
3089 }
3090 }
3091
3092 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3093
3094 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
3095
3096 LLVM_DEBUG({
3097 Inst.print(dbgs());
3098 ScoreBrackets.dump();
3099 });
3100
3101 // TODO: Remove this work-around after fixing the scheduler and enable the
3102 // assert above.
3103 if (RestoreVCCZ) {
3104 // Restore the vccz bit. Any time a value is written to vcc, the vcc
3105 // bit is updated, so we can restore the bit by reading the value of
3106 // vcc and then writing it back to the register.
3107 BuildMI(Block, Inst, Inst.getDebugLoc(),
3108 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
3109 TRI->getVCC())
3110 .addReg(TRI->getVCC());
3111 VCCZCorrect = true;
3112 Modified = true;
3113 }
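    // For illustration (schematic gfx7-style assembly), the combined effect of
    // the lgkmcnt wait described in the comment above and the vcc self-move
    // built here is:
    //
    //   s_load_dword s0, ...        ; outstanding SMRD access
    //   ...
    //   s_waitcnt lgkmcnt(0)        ; step 1: drain pending SMEM
    //   s_mov_b64 vcc, vcc          ; step 2: rewrite vcc to refresh vccz
    //   s_cbranch_vccz ...          ; now reads a correct vccz bit
    //
    // On wave32 targets the restore uses s_mov_b32 instead.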
3114
3115 ++Iter;
3116 }
3117
3118 // Flush counters at the end of the block if needed (for preheaders with no
3119 // terminator).
3120 AMDGPU::Waitcnt Wait;
3121 if (Block.getFirstTerminator() == Block.end()) {
3122 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3123 if (FlushFlags.FlushVmCnt) {
3124 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
3125 Wait.LoadCnt = 0;
3126 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
3127 Wait.SampleCnt = 0;
3128 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
3129 Wait.BvhCnt = 0;
3130 }
3131 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
3132 Wait.DsCnt = 0;
3133 }
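  // For illustration (schematic mnemonics): if this block is a preheader that
  // falls through into the loop header with a still-pending VMEM load, the
  // flush decided above is emitted at the end of the block, e.g. on GFX12:
  //
  //   global_load_b32 v1, ...       ; result consumed inside the loop
  //   s_wait_loadcnt 0x0            ; flushed here, once, instead of inside
  //                                 ; the loop body on every iteration
  //
  // Pre-GFX12 targets get the equivalent s_waitcnt vmcnt(0) form.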
3134
3135 // Combine or remove any redundant waitcnts at the end of the block.
3136 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
3137 OldWaitcntInstr);
3138
3139 LLVM_DEBUG({
3140 dbgs() << "*** End Block: ";
3141 Block.printName(dbgs());
3142 ScoreBrackets.dump();
3143 });
3144
3145 return Modified;
3146}
3147
3148// Return flags indicating which counters should be flushed in the preheader.
3149PreheaderFlushFlags
3150SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3151 const WaitcntBrackets &ScoreBrackets) {
3152 auto [Iterator, IsInserted] =
3153 PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
3154 if (!IsInserted)
3155 return Iterator->second;
3156
3157 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3158 if (!Succ)
3159 return PreheaderFlushFlags();
3160
3161 MachineLoop *Loop = MLI->getLoopFor(Succ);
3162 if (!Loop)
3163 return PreheaderFlushFlags();
3164
3165 if (Loop->getLoopPreheader() == &MBB) {
3166 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3167 return Iterator->second;
3168 }
3169
3170 return PreheaderFlushFlags();
3171}
3172
3173 bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
3174   if (SIInstrInfo::isFLAT(MI))
3175     return TII->mayAccessVMEMThroughFlat(MI);
3176 return SIInstrInfo::isVMEM(MI);
3177}
3178
3179bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3180 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3181}
3182
3183// Check if instruction is a store to LDS that is counted via DSCNT
3184// (where that counter exists).
3185bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3186 if (!MI.mayStore())
3187 return false;
3188 if (SIInstrInfo::isDS(MI))
3189 return true;
3190 return false;
3191}
3192
3193// Return flags indicating which counters should be flushed in the preheader of
3194// the given loop. We currently decide to flush in a few situations:
3195// For VMEM (FlushVmCnt):
3196// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3197// vgpr containing a value that is loaded outside of the loop. (Only on
3198// targets with no vscnt counter).
3199// 2. The loop contains vmem load(s), but the loaded values are not used in the
3200// loop, and at least one use of a vgpr containing a value that is loaded
3201// outside of the loop.
3202// For DS (FlushDsCnt, GFX12+ only):
3203// 3. The loop contains no DS reads, and at least one use of a vgpr containing
3204// a value that is DS loaded outside of the loop.
3205// 4. The loop contains DS read(s), loaded values are not used in the same
3206// iteration but in the next iteration (prefetch pattern), and at least one
3207// use of a vgpr containing a value that is DS loaded outside of the loop.
3208 // Flushing in the preheader reduces wait overhead if the wait requirement
3209 // in iteration 1 would otherwise be stricter; see the sketch after this function.
3210PreheaderFlushFlags
3211SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
3212 const WaitcntBrackets &Brackets) {
3213 PreheaderFlushFlags Flags;
3214 bool HasVMemLoad = false;
3215 bool HasVMemStore = false;
3216 bool SeenDSStoreInLoop = false;
3217 bool UsesVgprLoadedOutsideVMEM = false;
3218 bool UsesVgprLoadedOutsideDS = false;
3219 bool VMemInvalidated = false;
3220 // DS optimization only applies to GFX12+ where DS_CNT is separate.
3221 bool DSInvalidated = !ST->hasExtendedWaitCounts();
3222 DenseSet<MCRegUnit> VgprUse;
3223 DenseSet<MCRegUnit> VgprDefVMEM;
3224 DenseSet<MCRegUnit> VgprDefDS;
3225
3226 for (MachineBasicBlock *MBB : ML->blocks()) {
3227 bool SeenDSStoreInCurrMBB = false;
3228 for (MachineInstr &MI : *MBB) {
3229 if (isVMEMOrFlatVMEM(MI)) {
3230 HasVMemLoad |= MI.mayLoad();
3231 HasVMemStore |= MI.mayStore();
3232 }
3233 if (mayStoreIncrementingDSCNT(MI))
3234 SeenDSStoreInCurrMBB = true;
3235       // Stores postdominated by a barrier will have a wait at the barrier
3236       // and thus need no wait at the loop header. A barrier found later in
3237       // the same MBB during the in-order traversal is used here as a cheaper
3238       // alternative to a full postdomination check.
3239 if (MI.getOpcode() == AMDGPU::S_BARRIER)
3240 SeenDSStoreInCurrMBB = false;
3241 for (const MachineOperand &Op : MI.all_uses()) {
3242 if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
3243 continue;
3244 // Vgpr use
3245 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
3246 // If we find a register that is loaded inside the loop, 1. and 2.
3247 // are invalidated.
3248 if (VgprDefVMEM.contains(RU))
3249 VMemInvalidated = true;
3250
3251 // Check for DS loads used inside the loop
3252 if (VgprDefDS.contains(RU))
3253 DSInvalidated = true;
3254
3255 // Early exit if both optimizations are invalidated
3256 if (VMemInvalidated && DSInvalidated)
3257 return Flags;
3258
3259 VgprUse.insert(RU);
3260 // Check if this register has a pending VMEM load from outside the
3261 // loop (value loaded outside and used inside).
3262 VMEMID ID = toVMEMID(RU);
3263 bool HasPendingVMEM =
3264 Brackets.getVMemScore(ID, LOAD_CNT) >
3265 Brackets.getScoreLB(LOAD_CNT) ||
3266 Brackets.getVMemScore(ID, SAMPLE_CNT) >
3267 Brackets.getScoreLB(SAMPLE_CNT) ||
3268 Brackets.getVMemScore(ID, BVH_CNT) > Brackets.getScoreLB(BVH_CNT);
3269 if (HasPendingVMEM)
3270 UsesVgprLoadedOutsideVMEM = true;
3271 // Check if loaded outside the loop via DS (not VMEM/FLAT).
3272 // Only consider it a DS load if there's no pending VMEM load for
3273 // this register, since FLAT can set both counters.
3274 if (!HasPendingVMEM &&
3275 Brackets.getVMemScore(ID, DS_CNT) > Brackets.getScoreLB(DS_CNT))
3276 UsesVgprLoadedOutsideDS = true;
3277 }
3278 }
3279
3280 // VMem load vgpr def
3281 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
3282 for (const MachineOperand &Op : MI.all_defs()) {
3283 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
3284 // If we find a register that is loaded inside the loop, 1. and 2.
3285 // are invalidated.
3286 if (VgprUse.contains(RU))
3287 VMemInvalidated = true;
3288 VgprDefVMEM.insert(RU);
3289 }
3290 }
3291 // Early exit if both optimizations are invalidated
3292 if (VMemInvalidated && DSInvalidated)
3293 return Flags;
3294 }
3295
3296 // DS read vgpr def
3297     // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RU).
3298 // If USE comes before DEF, it's the prefetch pattern (use value from
3299 // previous iteration, load for next iteration). We should still flush
3300 // in preheader so iteration 1 doesn't need to wait inside the loop.
3301 // Only invalidate when DEF comes before USE (same-iteration consumption,
3302 // checked above when processing uses).
3303 if (isDSRead(MI)) {
3304 for (const MachineOperand &Op : MI.all_defs()) {
3305 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
3306 VgprDefDS.insert(RU);
3307 }
3308 }
3309 }
3310 }
3311 // Accumulate unprotected DS stores from this MBB
3312 SeenDSStoreInLoop |= SeenDSStoreInCurrMBB;
3313 }
3314
3315 // VMEM flush decision
3316 if (!VMemInvalidated && UsesVgprLoadedOutsideVMEM &&
3317 ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3318 (HasVMemLoad && ST->hasVmemWriteVgprInOrder())))
3319 Flags.FlushVmCnt = true;
3320
3321   // DS flush decision: flush if the loop uses DS-loaded values from outside,
3322   // has no DS stores left unprotected by a barrier, and either has no DS
3323   // reads or only DS reads whose results are not used within the loop.
3324 // DSInvalidated is pre-set to true on non-GFX12+ targets where DS_CNT
3325 // is LGKM_CNT which also tracks FLAT/SMEM.
3326 if (!DSInvalidated && !SeenDSStoreInLoop && UsesVgprLoadedOutsideDS)
3327 Flags.FlushDsCnt = true;
3328
3329 return Flags;
3330}
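// As a worked example of situation 2 above (register names are arbitrary):
//
//   ; preheader
//   global_load_b32 v0, ...         ; leaves a pending LOAD_CNT score on v0
//   ; loop body
//   v_add_f32 v2, v0, v1            ; use of v0 -> UsesVgprLoadedOutsideVMEM
//   global_load_b32 v3, ...         ; loop VMEM def whose result is not read
//                                   ; inside the loop
//
// No VMEM def from inside the loop is read inside the loop, so VMemInvalidated
// stays false and FlushVmCnt is set (on targets where VMEM writes its VGPR
// results in order); the loadcnt/vmcnt wait is then paid once in the preheader
// instead of in every iteration.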
3331
3332bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3333 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3334 auto *PDT =
3335 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3336 AliasAnalysis *AA = nullptr;
3337 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3338 AA = &AAR->getAAResults();
3339
3340 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
3341}
3342
3343 PreservedAnalyses
3344 SIInsertWaitcntsPass::run(MachineFunction &MF,
3345                           MachineFunctionAnalysisManager &MFAM) {
3346   auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
3347   auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
3348   auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
3349                  .getManager()
3350                  .getCachedResult<AAManager>(MF.getFunction());
3351
3352   if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
3353     return PreservedAnalyses::all();
3354
3355   return getMachineFunctionPassPreservedAnalyses()
3356       .preserveSet<CFGAnalyses>()
3357       .preserve<AAManager>();
3358}
3359
3360bool SIInsertWaitcnts::run(MachineFunction &MF) {
3361 ST = &MF.getSubtarget<GCNSubtarget>();
3362 TII = ST->getInstrInfo();
3363 TRI = &TII->getRegisterInfo();
3364 MRI = &MF.getRegInfo();
3365   MFI = MF.getInfo<SIMachineFunctionInfo>();
3366
3367   AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
3368
3369 // Initialize hardware limits first, as they're needed by the generators.
3370 Limits = AMDGPU::HardwareLimits(IV);
3371
3372 if (ST->hasExtendedWaitCounts()) {
3373 IsExpertMode = ST->hasExpertSchedulingMode() &&
3374                    (ExpertSchedulingModeFlag.getNumOccurrences()
3375                         ? ExpertSchedulingModeFlag
3376                         : MF.getFunction()
3377 .getFnAttribute("amdgpu-expert-scheduling-mode")
3378 .getValueAsBool());
3379 MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
3380 if (!WCG)
3381 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, &Limits,
3382 IsExpertMode);
3383 } else {
3384 MaxCounter = NUM_NORMAL_INST_CNTS;
3385 if (!WCG)
3386 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(MF, NUM_NORMAL_INST_CNTS,
3387 &Limits);
3388 }
3389
3390 for (auto T : inst_counter_types())
3391 ForceEmitWaitcnt[T] = false;
3392
3393 SmemAccessCounter = eventCounter(WCG->getWaitEventMask(), SMEM_ACCESS);
3394
3395 BlockInfos.clear();
3396 bool Modified = false;
3397
3398 MachineBasicBlock &EntryBB = MF.front();
3399
3400 if (!MFI->isEntryFunction()) {
3401 // Wait for any outstanding memory operations that the input registers may
3402 // depend on. We can't track them and it's better to do the wait after the
3403 // costly call sequence.
3404
3405 // TODO: Could insert earlier and schedule more liberally with operations
3406 // that only use caller preserved registers.
3407     MachineBasicBlock::iterator I = EntryBB.begin();
3408     while (I != EntryBB.end() && I->isMetaInstruction())
3409 ++I;
3410
3411 if (ST->hasExtendedWaitCounts()) {
3412 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
3413 .addImm(0);
3414 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
3415 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
3416 continue;
3417
3418 if (!ST->hasImageInsts() &&
3419 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
3420 continue;
3421
3422 BuildMI(EntryBB, I, DebugLoc(),
3423 TII->get(instrsForExtendedCounterTypes[CT]))
3424 .addImm(0);
3425 }
3426 if (IsExpertMode) {
3427 unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, *ST);
3428         Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, 0, *ST);
3429         BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3430 .addImm(Enc);
3431 }
3432 } else {
3433 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
3434 }
3435
3436 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
3437 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3438 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3439
3440 Modified = true;
3441 }
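  // For illustration, on a GFX12-style target the prologue built above looks
  // roughly like the following (which waits actually appear depends on the
  // subtarget, e.g. image-less targets skip the sample/bvh waits):
  //
  //   s_wait_loadcnt_dscnt 0x0
  //   s_wait_expcnt 0x0
  //   s_wait_samplecnt 0x0
  //   s_wait_bvhcnt 0x0
  //   s_wait_kmcnt 0x0
  //
  // Pre-GFX12 targets get a single "s_waitcnt 0" instead.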
3442
3443 // Keep iterating over the blocks in reverse post order, inserting and
3444 // updating s_waitcnt where needed, until a fixed point is reached.
3445 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3446 BlockInfos.try_emplace(MBB);
3447
3448 std::unique_ptr<WaitcntBrackets> Brackets;
3449 bool Repeat;
3450 do {
3451 Repeat = false;
3452
3453 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3454 ++BII) {
3455 MachineBasicBlock *MBB = BII->first;
3456 BlockInfo &BI = BII->second;
3457 if (!BI.Dirty)
3458 continue;
3459
3460 if (BI.Incoming) {
3461 if (!Brackets)
3462 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3463 else
3464 *Brackets = *BI.Incoming;
3465 } else {
3466 if (!Brackets) {
3467 Brackets = std::make_unique<WaitcntBrackets>(this);
3468 } else {
3469 // Reinitialize in-place. N.B. do not do this by assigning from a
3470 // temporary because the WaitcntBrackets class is large and it could
3471 // cause this function to use an unreasonable amount of stack space.
3472 Brackets->~WaitcntBrackets();
3473 new (Brackets.get()) WaitcntBrackets(this);
3474 }
3475 }
3476
3477 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
3478 BI.Dirty = false;
3479
3480 if (Brackets->hasPendingEvent()) {
3481 BlockInfo *MoveBracketsToSucc = nullptr;
3482 for (MachineBasicBlock *Succ : MBB->successors()) {
3483 auto *SuccBII = BlockInfos.find(Succ);
3484 BlockInfo &SuccBI = SuccBII->second;
3485 if (!SuccBI.Incoming) {
3486 SuccBI.Dirty = true;
3487 if (SuccBII <= BII) {
3488 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
3489 Repeat = true;
3490 }
3491 if (!MoveBracketsToSucc) {
3492 MoveBracketsToSucc = &SuccBI;
3493 } else {
3494 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3495 }
3496 } else if (SuccBI.Incoming->merge(*Brackets)) {
3497 SuccBI.Dirty = true;
3498 if (SuccBII <= BII) {
3499 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
3500 Repeat = true;
3501 }
3502 }
3503 }
3504 if (MoveBracketsToSucc)
3505 MoveBracketsToSucc->Incoming = std::move(Brackets);
3506 }
3507 }
3508 } while (Repeat);
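  // Note on convergence: a successor that appears earlier in the reverse
  // post-order (i.e. one reached through a backedge) and whose incoming
  // bracket state changes is marked Dirty, which forces another pass of the
  // do/while above. For a simple preheader -> header <-> latch -> exit loop
  // this typically means the header is revisited once after the latch's state
  // has been merged into it, after which the states stop changing.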
3509
3510 if (ST->hasScalarStores()) {
3511 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3512 bool HaveScalarStores = false;
3513
3514 for (MachineBasicBlock &MBB : MF) {
3515 for (MachineInstr &MI : MBB) {
3516 if (!HaveScalarStores && TII->isScalarStore(MI))
3517 HaveScalarStores = true;
3518
3519 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
3520 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3521 EndPgmBlocks.push_back(&MBB);
3522 }
3523 }
3524
3525 if (HaveScalarStores) {
3526 // If scalar writes are used, the cache must be flushed or else the next
3527 // wave to reuse the same scratch memory can be clobbered.
3528 //
3529 // Insert s_dcache_wb at wave termination points if there were any scalar
3530 // stores, and only if the cache hasn't already been flushed. This could
3531 // be improved by looking across blocks for flushes in postdominating
3532 // blocks from the stores but an explicitly requested flush is probably
3533 // very rare.
3534 for (MachineBasicBlock *MBB : EndPgmBlocks) {
3535 bool SeenDCacheWB = false;
3536
3537 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
3538 I != E; ++I) {
3539 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
3540 SeenDCacheWB = true;
3541 else if (TII->isScalarStore(*I))
3542 SeenDCacheWB = false;
3543
3544 // FIXME: It would be better to insert this before a waitcnt if any.
3545 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
3546 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3547 !SeenDCacheWB) {
3548 Modified = true;
3549 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
3550 }
3551 }
3552 }
3553 }
3554 }
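  // For illustration (schematic), a wave-termination block rewritten by the
  // loop above ends up looking like:
  //
  //   s_store_dword s0, s[4:5], 0x0   ; scalar store seen earlier
  //   ...
  //   s_dcache_wb                     ; inserted before the endpgm
  //   s_endpgm
  //
  // If an s_dcache_wb already follows the last scalar store, SeenDCacheWB
  // suppresses the extra insertion.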
3555
3556 if (IsExpertMode) {
3557 // Enable expert scheduling on function entry. To satisfy ABI requirements
3558     // and to allow calls between functions with different expert scheduling
3559 // settings, disable it around calls and before returns.
3560
3561     MachineBasicBlock::iterator I = EntryBB.begin();
3562     while (I != EntryBB.end() && I->isMetaInstruction())
3563 ++I;
3564 setSchedulingMode(EntryBB, I, true);
3565
3566 for (MachineInstr *MI : CallInsts) {
3567 MachineBasicBlock &MBB = *MI->getParent();
3568 setSchedulingMode(MBB, MI, false);
3569 setSchedulingMode(MBB, std::next(MI->getIterator()), true);
3570 }
3571
3572 for (MachineInstr *MI : ReturnInsts)
3573 setSchedulingMode(*MI->getParent(), MI, false);
3574
3575 Modified = true;
3576 }
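  // The placement of the setSchedulingMode toggles keeps code outside this
  // function running with the default mode: enabled once after the entry
  // block's meta instructions, disabled before each call and re-enabled after
  // it, and disabled again before every return. Schematically (the actual
  // mode-switch instruction emitted by setSchedulingMode is not shown here):
  //
  //   <entry>                 ; mode on
  //   ...
  //   ; mode off
  //   s_swappc_b64 ...        ; call
  //   ; mode on
  //   ...
  //   ; mode off
  //   s_setpc_b64 ...         ; return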
3577
3578 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
3579 // This is done in different ways depending on how the VGPRs were allocated
3580 // (i.e. whether we're in dynamic VGPR mode or not).
3581   // Skip deallocation if the kernel is waveslot-limited rather than VGPR
3582   // limited: a short waveslot-limited kernel runs slower with the deallocation.
3583 if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
3584 for (auto [MI, _] : EndPgmInsts) {
3585 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3586 TII->get(AMDGPU::S_ALLOC_VGPR))
3587 .addImm(0);
3588 Modified = true;
3589 }
3590 } else if (!WCG->isOptNone() &&
3591 ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
3592 (MF.getFrameInfo().hasCalls() ||
3593 ST->getOccupancyWithNumVGPRs(
3594 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
3595 /*IsDynamicVGPR=*/false) <
3596                      AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
3597     for (auto [MI, Flag] : EndPgmInsts) {
3598 if (Flag) {
3599 if (ST->requiresNopBeforeDeallocVGPRs()) {
3600 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3601 TII->get(AMDGPU::S_NOP))
3602 .addImm(0);
3603 }
3604 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3605 TII->get(AMDGPU::S_SENDMSG))
3606             .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
3607         Modified = true;
3608 }
3609 }
3610 }
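  // For illustration (assembly spelling is schematic), the two deallocation
  // strategies above produce epilogues like the following.
  // Dynamic-VGPR mode:
  //   s_alloc_vgpr 0
  //   s_endpgm
  // GFX11+ default mode (the s_nop only where the subtarget requires it):
  //   s_nop 0
  //   s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  //   s_endpgm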
3611
3612 CallInsts.clear();
3613 ReturnInsts.clear();
3614 EndPgmInsts.clear();
3615 PreheadersToFlush.clear();
3616 SLoadAddresses.clear();
3617
3618 return Modified;
3619}