SIInsertWaitcnts.cpp
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39#include "llvm/IR/Dominators.h"
43
44using namespace llvm;
45
46#define DEBUG_TYPE "si-insert-waitcnts"
47
48DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
49 "Force emit s_waitcnt expcnt(0) instrs");
50DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
51 "Force emit s_waitcnt lgkmcnt(0) instrs");
52DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
53 "Force emit s_waitcnt vmcnt(0) instrs");
54
55static cl::opt<bool>
56 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
57 cl::desc("Force all waitcnt instrs to be emitted as "
58 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
59 cl::init(false), cl::Hidden);
60
62 "amdgpu-waitcnt-load-forcezero",
63 cl::desc("Force all waitcnt load counters to wait until 0"),
64 cl::init(false), cl::Hidden);
65
67 "amdgpu-expert-scheduling-mode",
68 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
69 cl::init(false), cl::Hidden);
70
71namespace {
72// Class of object that encapsulates the latest instruction counter score
73// associated with the operand. Used for determining whether an
74// s_waitcnt instruction needs to be emitted.
75
76enum InstCounterType {
77 LOAD_CNT = 0, // VMcnt prior to gfx12.
78 DS_CNT, // LGKMcnt prior to gfx12.
79 EXP_CNT, //
80 STORE_CNT, // VScnt in gfx10/gfx11.
81 NUM_NORMAL_INST_CNTS,
82 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
83 BVH_CNT, // gfx12+ only.
84 KM_CNT, // gfx12+ only.
85 X_CNT, // gfx1250.
86 NUM_EXTENDED_INST_CNTS,
87 VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only.
88 VM_VSRC, // gfx12+ expert mode only.
89 NUM_EXPERT_INST_CNTS,
90 NUM_INST_CNTS = NUM_EXPERT_INST_CNTS
91};
92} // namespace
93
94namespace llvm {
95template <> struct enum_iteration_traits<InstCounterType> {
96 static constexpr bool is_iterable = true;
97};
98} // namespace llvm
99
100namespace {
101// Return an iterator over all counters between LOAD_CNT (the first counter)
102// and \c MaxCounter (exclusive, default value yields an enumeration over
103// all counters).
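// For example, inst_counter_types(EXP_CNT) visits only LOAD_CNT and DS_CNT,
// while the default enumerates every counter.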
104auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
105 return enum_seq(LOAD_CNT, MaxCounter);
106}
107
108/// Integer IDs used to track vector memory locations we may have to wait on.
109/// Encoded as u16 chunks:
110///
111/// [0, REGUNITS_END ): MCRegUnit
112/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs
113///
114/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
115/// It gives (1 << 16) entries per category, which is more than enough
116/// for all register units. MCPhysReg is u16 so we don't even support >u16
117/// physical register numbers at this time, let alone >u16 register units.
118/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
119/// is enough for all register units.
120using VMEMID = uint32_t;
121
122enum : VMEMID {
123 TRACKINGID_RANGE_LEN = (1 << 16),
124
125 // Important: MCRegUnits must always be tracked starting from 0, as we
126 // need to be able to convert between a MCRegUnit and a VMEMID freely.
127 REGUNITS_BEGIN = 0,
128 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
129
130 // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
131 // entry, which is updated for all LDS DMA operations encountered.
132 // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
133 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
134 LDSDMA_BEGIN = REGUNITS_END,
135 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
136};
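// Illustration of the encoding above: with TRACKINGID_RANGE_LEN == (1 << 16),
// register unit 3 is tracked as VMEMID 3, the common LDS DMA entry is
// VMEMID 65536 (LDSDMA_BEGIN), and the first specific LDS DMA slot is
// VMEMID 65537 (LDSDMA_BEGIN + 1).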
137
138/// Convert a MCRegUnit to a VMEMID.
139static constexpr VMEMID toVMEMID(MCRegUnit RU) {
140 return static_cast<unsigned>(RU);
141}
142
143struct HardwareLimits {
144 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
145 unsigned ExpcntMax;
146 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
147 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
148 unsigned SamplecntMax; // gfx12+ only.
149 unsigned BvhcntMax; // gfx12+ only.
150 unsigned KmcntMax; // gfx12+ only.
151 unsigned XcntMax; // gfx1250.
152 unsigned VaVdstMax; // gfx12+ expert mode only.
153 unsigned VmVsrcMax; // gfx12+ expert mode only.
154};
155
156#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
157 DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \
158 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
159 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
160 DECL(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */ \
161 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
162 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
163 DECL(VMEM_GROUP) /* vmem group */ \
164 DECL(LDS_ACCESS) /* lds read & write */ \
165 DECL(GDS_ACCESS) /* gds read & write */ \
166 DECL(SQ_MESSAGE) /* send message */ \
167 DECL(SCC_WRITE) /* write to SCC from barrier */ \
168 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
169 DECL(SMEM_GROUP) /* scalar-memory group */ \
170 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
171 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
172 DECL(EXP_POS_ACCESS) /* write to export position */ \
173 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
174 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
175 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */ \
176 DECL(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */ \
177 DECL(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */ \
178 DECL(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */ \
179 DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \
180 DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \
181 DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \
182 DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */
183
184// clang-format off
185#define AMDGPU_EVENT_ENUM(Name) Name,
186enum WaitEventType {
187 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
188 NUM_WAIT_EVENTS
189};
190#undef AMDGPU_EVENT_ENUM
191
192#define AMDGPU_EVENT_NAME(Name) #Name,
193static constexpr StringLiteral WaitEventTypeName[] = {
194 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
195};
196#undef AMDGPU_EVENT_NAME
197// clang-format on
198
199// Enumerate different types of result-returning VMEM operations. Although
200// s_waitcnt orders them all with a single vmcnt counter, in the absence of
201// s_waitcnt only instructions of the same VmemType are guaranteed to write
202// their results in order -- so there is no need to insert an s_waitcnt between
203// two instructions of the same type that write the same vgpr.
204enum VmemType {
205 // BUF instructions and MIMG instructions without a sampler.
206 VMEM_NOSAMPLER,
207 // MIMG instructions with a sampler.
208 VMEM_SAMPLER,
209 // BVH instructions
210 VMEM_BVH,
211 NUM_VMEM_TYPES
212};
213
214// Maps values of InstCounterType to the instruction that waits on that
215// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
216// returns true, and does not cover VA_VDST or VM_VSRC.
217static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
218 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
219 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
220 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
221
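// Return true if Inst updates only the VM counters, i.e. it is a VMEM access
// other than a generic FLAT access (which may also update the LGKM/DS counter).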
222static bool updateVMCntOnly(const MachineInstr &Inst) {
223 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
224 SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst);
225}
226
227#ifndef NDEBUG
228static bool isNormalMode(InstCounterType MaxCounter) {
229 return MaxCounter == NUM_NORMAL_INST_CNTS;
230}
231#endif // NDEBUG
232
233VmemType getVmemType(const MachineInstr &Inst) {
234 assert(updateVMCntOnly(Inst));
235 if (!SIInstrInfo::isImage(Inst))
236 return VMEM_NOSAMPLER;
237 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
238 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
239 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
240
241 if (BaseInfo->BVH)
242 return VMEM_BVH;
243
244 // We have to make an additional check for isVSAMPLE here since some
245 // instructions don't have a sampler, but are still classified as sampler
246 // instructions for the purposes of e.g. waitcnt.
247 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
248 return VMEM_SAMPLER;
249
250 return VMEM_NOSAMPLER;
251}
252
253unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
254 switch (T) {
255 case LOAD_CNT:
256 return Wait.LoadCnt;
257 case EXP_CNT:
258 return Wait.ExpCnt;
259 case DS_CNT:
260 return Wait.DsCnt;
261 case STORE_CNT:
262 return Wait.StoreCnt;
263 case SAMPLE_CNT:
264 return Wait.SampleCnt;
265 case BVH_CNT:
266 return Wait.BvhCnt;
267 case KM_CNT:
268 return Wait.KmCnt;
269 case X_CNT:
270 return Wait.XCnt;
271 case VA_VDST:
272 return Wait.VaVdst;
273 case VM_VSRC:
274 return Wait.VmVsrc;
275 default:
276 llvm_unreachable("bad InstCounterType");
277 }
278}
279
280void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
281 unsigned &WC = getCounterRef(Wait, T);
282 WC = std::min(WC, Count);
283}
284
285void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
286 getCounterRef(Wait, T) = ~0u;
287}
288
289unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
290 return getCounterRef(Wait, T);
291}
292
293// Mapping from event to counter according to the table masks.
294InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
295 for (auto T : inst_counter_types()) {
296 if (masks[T] & (1 << E))
297 return T;
298 }
299 llvm_unreachable("event type has no associated counter");
300}
301
302class WaitcntBrackets;
303
304// This abstracts the logic for generating and updating S_WAIT* instructions
305// away from the analysis that determines where they are needed. This was
306// done because the set of counters and instructions for waiting on them
307// underwent a major shift with gfx12, sufficiently so that having this
308// abstraction keeps the main analysis logic simpler than it would
309// otherwise have to be.
310class WaitcntGenerator {
311protected:
312 const GCNSubtarget *ST = nullptr;
313 const SIInstrInfo *TII = nullptr;
314 AMDGPU::IsaVersion IV;
315 InstCounterType MaxCounter;
316 bool OptNone;
317
318public:
319 WaitcntGenerator() = default;
320 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
321 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
322 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
323 OptNone(MF.getFunction().hasOptNone() ||
324 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
325
326 // Return true if the current function should be compiled with no
327 // optimization.
328 bool isOptNone() const { return OptNone; }
329
330 // Edits an existing sequence of wait count instructions according
331 // to an incoming Waitcnt value, which is itself updated to reflect
332 // any new wait count instructions which may need to be generated by
333 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
334 // were made.
335 //
336 // This editing will usually just update operands, but it may also
337 // delete instructions if the incoming Wait value indicates they are not
338 // needed. It may also remove existing instructions for which a wait
339 // is needed if it can be determined that it is better to generate new
340 // instructions later, as can happen on gfx12.
341 virtual bool
342 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
343 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
344 MachineBasicBlock::instr_iterator It) const = 0;
345
346 // Transform a soft waitcnt into a normal one.
347 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
348
349 // Generates new wait count instructions according to the value of
350 // Wait, returning true if any new instructions were created.
351 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
352 MachineBasicBlock::instr_iterator It,
353 AMDGPU::Waitcnt Wait) = 0;
354
355 // Returns an array of bit masks which can be used to map values in
356 // WaitEventType to corresponding counter values in InstCounterType.
357 virtual const unsigned *getWaitEventMask() const = 0;
358
359 // Returns a new waitcnt with all counters except VScnt set to 0. If
360 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
361 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
362
363 virtual ~WaitcntGenerator() = default;
364
365 // Create a mask value from the initializer list of wait event types.
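 // For example, eventMask({VMEM_ACCESS, LDS_ACCESS}) is
 // (1 << VMEM_ACCESS) | (1 << LDS_ACCESS).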
366 static constexpr unsigned
367 eventMask(std::initializer_list<WaitEventType> Events) {
368 unsigned Mask = 0;
369 for (auto &E : Events)
370 Mask |= 1 << E;
371
372 return Mask;
373 }
374};
375
376class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
377public:
378 using WaitcntGenerator::WaitcntGenerator;
379
380 bool
381 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
382 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
383 MachineBasicBlock::instr_iterator It) const override;
384
385 bool createNewWaitcnt(MachineBasicBlock &Block,
386 MachineBasicBlock::instr_iterator It,
387 AMDGPU::Waitcnt Wait) override;
388
389 const unsigned *getWaitEventMask() const override {
390 assert(ST);
391
392 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
393 eventMask(
394 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
395 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
396 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
397 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
398 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
399 0,
400 0,
401 0,
402 0,
403 0,
404 0};
405
406 return WaitEventMaskForInstPreGFX12;
407 }
408
409 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
410};
411
412class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
413protected:
414 bool IsExpertMode;
415
416public:
417 WaitcntGeneratorGFX12Plus() = default;
418 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
419 InstCounterType MaxCounter, bool IsExpertMode)
420 : WaitcntGenerator(MF, MaxCounter), IsExpertMode(IsExpertMode) {}
421
422 bool
423 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
424 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
425 MachineBasicBlock::instr_iterator It) const override;
426
427 bool createNewWaitcnt(MachineBasicBlock &Block,
428 MachineBasicBlock::instr_iterator It,
429 AMDGPU::Waitcnt Wait) override;
430
431 const unsigned *getWaitEventMask() const override {
432 assert(ST);
433
434 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
435 eventMask({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
436 eventMask({LDS_ACCESS, GDS_ACCESS}),
437 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
438 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
439 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
440 eventMask({VMEM_SAMPLER_READ_ACCESS}),
441 eventMask({VMEM_BVH_READ_ACCESS}),
442 eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
443 eventMask({VMEM_GROUP, SMEM_GROUP}),
444 eventMask({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
445 VGPR_XDL_WRITE}),
446 eventMask({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
447
448 return WaitEventMaskForInstGFX12Plus;
449 }
450
451 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
452};
453
454class SIInsertWaitcnts {
455public:
456 const GCNSubtarget *ST;
457 const SIInstrInfo *TII = nullptr;
458 const SIRegisterInfo *TRI = nullptr;
459 const MachineRegisterInfo *MRI = nullptr;
460 InstCounterType SmemAccessCounter;
461 InstCounterType MaxCounter;
462 bool IsExpertMode = false;
463 const unsigned *WaitEventMaskForInst;
464
465private:
466 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
467 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
468 MachineLoopInfo *MLI;
469 MachinePostDominatorTree *PDT;
470 AliasAnalysis *AA = nullptr;
471
472 struct BlockInfo {
473 std::unique_ptr<WaitcntBrackets> Incoming;
474 bool Dirty = true;
475 };
476
477 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
478
479 bool ForceEmitWaitcnt[NUM_INST_CNTS];
480
481 // In any given run of this pass, WCG will point to one of these two
482 // generator objects, which must have been re-initialised before use
483 // with a value constructed for the current subtarget.
484 WaitcntGeneratorPreGFX12 WCGPreGFX12;
485 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
486
487 WaitcntGenerator *WCG = nullptr;
488
489 // Remember call and return instructions in the function.
490 DenseSet<MachineInstr *> CallInsts;
491 DenseSet<MachineInstr *> ReturnInsts;
492
493 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
494 // message.
495 DenseSet<MachineInstr *> ReleaseVGPRInsts;
496
497 HardwareLimits Limits;
498
499public:
500 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
501 AliasAnalysis *AA)
502 : MLI(MLI), PDT(PDT), AA(AA) {
503 (void)ForceExpCounter;
504 (void)ForceLgkmCounter;
505 (void)ForceVMCounter;
506 }
507
508 unsigned getWaitCountMax(InstCounterType T) const {
509 switch (T) {
510 case LOAD_CNT:
511 return Limits.LoadcntMax;
512 case DS_CNT:
513 return Limits.DscntMax;
514 case EXP_CNT:
515 return Limits.ExpcntMax;
516 case STORE_CNT:
517 return Limits.StorecntMax;
518 case SAMPLE_CNT:
519 return Limits.SamplecntMax;
520 case BVH_CNT:
521 return Limits.BvhcntMax;
522 case KM_CNT:
523 return Limits.KmcntMax;
524 case X_CNT:
525 return Limits.XcntMax;
526 case VA_VDST:
527 return Limits.VaVdstMax;
528 case VM_VSRC:
529 return Limits.VmVsrcMax;
530 default:
531 break;
532 }
533 return 0;
534 }
535
536 bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
537 bool isPreheaderToFlush(MachineBasicBlock &MBB,
538 const WaitcntBrackets &ScoreBrackets);
539 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
540 bool run(MachineFunction &MF);
541
542 void setForceEmitWaitcnt() {
543// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
544// For debug builds, get the debug counter info and adjust if need be
545#ifndef NDEBUG
546 if (DebugCounter::isCounterSet(ForceExpCounter) &&
547 DebugCounter::shouldExecute(ForceExpCounter)) {
548 ForceEmitWaitcnt[EXP_CNT] = true;
549 } else {
550 ForceEmitWaitcnt[EXP_CNT] = false;
551 }
552
553 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
554 DebugCounter::shouldExecute(ForceLgkmCounter)) {
555 ForceEmitWaitcnt[DS_CNT] = true;
556 ForceEmitWaitcnt[KM_CNT] = true;
557 } else {
558 ForceEmitWaitcnt[DS_CNT] = false;
559 ForceEmitWaitcnt[KM_CNT] = false;
560 }
561
562 if (DebugCounter::isCounterSet(ForceVMCounter) &&
563 DebugCounter::shouldExecute(ForceVMCounter)) {
564 ForceEmitWaitcnt[LOAD_CNT] = true;
565 ForceEmitWaitcnt[SAMPLE_CNT] = true;
566 ForceEmitWaitcnt[BVH_CNT] = true;
567 } else {
568 ForceEmitWaitcnt[LOAD_CNT] = false;
569 ForceEmitWaitcnt[SAMPLE_CNT] = false;
570 ForceEmitWaitcnt[BVH_CNT] = false;
571 }
572
573 ForceEmitWaitcnt[VA_VDST] = false;
574 ForceEmitWaitcnt[VM_VSRC] = false;
575#endif // NDEBUG
576 }
577
578 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
579 // instruction.
580 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
581 switch (Inst.getOpcode()) {
582 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
583 case AMDGPU::GLOBAL_INV:
584 return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
585 // VGPRs
586 case AMDGPU::GLOBAL_WB:
587 case AMDGPU::GLOBAL_WBINV:
588 return VMEM_WRITE_ACCESS; // tracked using storecnt
589 default:
590 break;
591 }
592
593 // Maps VMEM access types to their corresponding WaitEventType.
594 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
595 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
596
598 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
599 // these should use VM_CNT.
600 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
601 return VMEM_ACCESS;
602 if (Inst.mayStore() &&
603 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
604 if (TII->mayAccessScratch(Inst))
605 return SCRATCH_WRITE_ACCESS;
606 return VMEM_WRITE_ACCESS;
607 }
608 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
609 return VMEM_ACCESS;
610 return VmemReadMapping[getVmemType(Inst)];
611 }
612
613 std::optional<WaitEventType>
614 getExpertSchedulingEventType(const MachineInstr &Inst) const;
615
616 bool isVmemAccess(const MachineInstr &MI) const;
617 bool generateWaitcntInstBefore(MachineInstr &MI,
618 WaitcntBrackets &ScoreBrackets,
619 MachineInstr *OldWaitcntInstr,
620 bool FlushVmCnt);
621 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
622 MachineBasicBlock::instr_iterator It,
623 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
624 MachineInstr *OldWaitcntInstr);
625 void updateEventWaitcntAfter(MachineInstr &Inst,
626 WaitcntBrackets *ScoreBrackets);
627 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
628 MachineBasicBlock *Block) const;
629 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
630 WaitcntBrackets &ScoreBrackets);
631 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
632 WaitcntBrackets &ScoreBrackets);
633 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
634 bool ExpertMode) const;
635};
636
637// This object maintains the current score brackets of each wait counter, and
638// a per-register scoreboard for each wait counter.
639//
640// We also maintain the latest score for every event type that can change the
641// waitcnt, in order to know whether multiple types of events are within
642// the brackets. When multiple event types are pending in a bracket, the
643// wait count may be decremented out of order, so we need to emit an
644// "s_waitcnt 0" before use.
645class WaitcntBrackets {
646public:
647 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
648 assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
649 }
650
651#ifndef NDEBUG
652 ~WaitcntBrackets() {
653 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
654 for (auto &[ID, Val] : VMem) {
655 if (Val.empty())
656 ++NumUnusedVmem;
657 }
658 for (auto &[ID, Val] : SGPRs) {
659 if (Val.empty())
660 ++NumUnusedSGPRs;
661 }
662
663 if (NumUnusedVmem || NumUnusedSGPRs) {
664 errs() << "WaitcntBracket had unused entries at destruction time: "
665 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
666 << " SGPR unused entries\n";
667 std::abort();
668 }
669 }
670#endif
671
672 bool isSmemCounter(InstCounterType T) const {
673 return T == Context->SmemAccessCounter || T == X_CNT;
674 }
675
676 unsigned getSgprScoresIdx(InstCounterType T) const {
677 assert(isSmemCounter(T) && "Invalid SMEM counter");
678 return T == X_CNT ? 1 : 0;
679 }
680
681 unsigned getScoreLB(InstCounterType T) const {
682 assert(T < NUM_INST_CNTS);
683 return ScoreLBs[T];
684 }
685
686 unsigned getScoreUB(InstCounterType T) const {
687 assert(T < NUM_INST_CNTS);
688 return ScoreUBs[T];
689 }
690
691 unsigned getScoreRange(InstCounterType T) const {
692 return getScoreUB(T) - getScoreLB(T);
693 }
694
695 unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
696 auto It = SGPRs.find(RU);
697 return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;
698 }
699
700 unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
701 auto It = VMem.find(TID);
702 return It != VMem.end() ? It->second.Scores[T] : 0;
703 }
704
705 bool merge(const WaitcntBrackets &Other);
706
707 bool counterOutOfOrder(InstCounterType T) const;
708 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
709 simplifyWaitcnt(Wait, Wait);
710 }
711 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
712 AMDGPU::Waitcnt &UpdateWait) const;
713 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
714 bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) const;
715 bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) const;
716 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
717 AMDGPU::Waitcnt &UpdateWait) const;
718 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
719 AMDGPU::Waitcnt &UpdateWait) const;
720
721 void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
722 AMDGPU::Waitcnt &Wait) const;
723 void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
724 AMDGPU::Waitcnt &Wait) const;
725 void tryClearSCCWriteEvent(MachineInstr *Inst);
726
727 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
728 void applyWaitcnt(InstCounterType T, unsigned Count);
729 void updateByEvent(WaitEventType E, MachineInstr &MI);
730
731 unsigned hasPendingEvent() const { return PendingEvents; }
732 unsigned hasPendingEvent(WaitEventType E) const {
733 return PendingEvents & (1 << E);
734 }
735 unsigned hasPendingEvent(InstCounterType T) const {
736 unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
737 assert((HasPending != 0) == (getScoreRange(T) != 0));
738 return HasPending;
739 }
740
741 bool hasMixedPendingEvents(InstCounterType T) const {
742 unsigned Events = hasPendingEvent(T);
743 // Return true if more than one bit is set in Events.
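 // For example, Events == 0b0100 yields false and 0b0110 yields true.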
744 return Events & (Events - 1);
745 }
746
747 bool hasPendingFlat() const {
748 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
749 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
750 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
751 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
752 }
753
754 void setPendingFlat() {
755 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
756 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
757 }
758
759 bool hasPendingGDS() const {
760 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
761 }
762
763 unsigned getPendingGDSWait() const {
764 return std::min(getScoreUB(DS_CNT) - LastGDS,
765 Context->getWaitCountMax(DS_CNT) - 1);
766 }
767
768 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
769
770 // Return true if there might be pending writes to any register unit of Reg
771 // by VMEM instructions with types different from V.
772 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
773 for (MCRegUnit RU : regunits(Reg)) {
774 auto It = VMem.find(toVMEMID(RU));
775 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
776 return true;
777 }
778 return false;
779 }
780
781 void clearVgprVmemTypes(MCPhysReg Reg) {
782 for (MCRegUnit RU : regunits(Reg)) {
783 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
784 It->second.VMEMTypes = 0;
785 if (It->second.empty())
786 VMem.erase(It);
787 }
788 }
789 }
790
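 // Conservatively assume that a call or return may leave the maximum number of
 // stores outstanding: saturate the STORE_CNT score range and mark all
 // STORE_CNT events as pending.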
791 void setStateOnFunctionEntryOrReturn() {
792 setScoreUB(STORE_CNT,
793 getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
794 PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
795 }
796
797 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
798 return LDSDMAStores;
799 }
800
801 bool hasPointSampleAccel(const MachineInstr &MI) const;
802 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
803 MCPhysReg RU) const;
804
805 void print(raw_ostream &) const;
806 void dump() const { print(dbgs()); }
807
808 // Free up memory by removing empty entries from the DenseMaps that track
809 // event scores.
810 void purgeEmptyTrackingData();
811
812private:
813 struct MergeInfo {
814 unsigned OldLB;
815 unsigned OtherLB;
816 unsigned MyShift;
817 unsigned OtherShift;
818 };
819
820 void determineWaitForScore(InstCounterType T, unsigned Score,
821 AMDGPU::Waitcnt &Wait) const;
822
823 static bool mergeScore(const MergeInfo &M, unsigned &Score,
824 unsigned OtherScore);
825
826 iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
827 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
828 if (!Context->TRI->isInAllocatableClass(Reg))
829 return {{}, {}};
830 const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
831 unsigned Size = Context->TRI->getRegSizeInBits(*RC);
832 if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
833 Reg = Context->TRI->get32BitRegister(Reg);
834 return Context->TRI->regunits(Reg);
835 }
836
837 void setScoreLB(InstCounterType T, unsigned Val) {
838 assert(T < NUM_INST_CNTS);
839 ScoreLBs[T] = Val;
840 }
841
842 void setScoreUB(InstCounterType T, unsigned Val) {
843 assert(T < NUM_INST_CNTS);
844 ScoreUBs[T] = Val;
845
846 if (T != EXP_CNT)
847 return;
848
849 if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
850 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
851 }
852
853 void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
854 const SIRegisterInfo *TRI = Context->TRI;
855 if (Reg == AMDGPU::SCC) {
856 SCCScore = Val;
857 } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
858 for (MCRegUnit RU : regunits(Reg))
859 VMem[toVMEMID(RU)].Scores[T] = Val;
860 } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
861 auto STy = getSgprScoresIdx(T);
862 for (MCRegUnit RU : regunits(Reg))
863 SGPRs[RU].Scores[STy] = Val;
864 } else {
865 llvm_unreachable("Register cannot be tracked/unknown register!");
866 }
867 }
868
869 void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
870 VMem[TID].Scores[T] = Val;
871 }
872
873 void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
874 unsigned Val);
875
876 const SIInsertWaitcnts *Context;
877
878 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
879 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
880 unsigned PendingEvents = 0;
881 // Remember the last flat memory operation.
882 unsigned LastFlat[NUM_INST_CNTS] = {0};
883 // Remember the last GDS operation.
884 unsigned LastGDS = 0;
885
886 // The score tracking logic is fragmented as follows:
887 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
888 // - SGPRs: SGPR RegUnits
889 // - SCC: Non-allocatable and not general purpose: not a SGPR.
890 //
891 // For the VMem case, if the key is within the range of LDS DMA IDs,
892 // then the corresponding index into the `LDSDMAStores` vector below is:
893 // Key - LDSDMA_BEGIN - 1
894 // This is because LDSDMA_BEGIN is a generic entry and does not have an
895 // associated MachineInstr.
896 //
897 // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
898
899 struct VMEMInfo {
900 // Scores for all instruction counters.
901 std::array<unsigned, NUM_INST_CNTS> Scores = {0};
902 // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
903 unsigned VMEMTypes = 0;
904
905 bool empty() const {
906 return all_of(Scores, [](unsigned K) { return K == 0; }) && !VMEMTypes;
907 }
908 };
909
910 struct SGPRInfo {
911 // Wait cnt scores for every sgpr; only DS_CNT (corresponding to LGKMcnt
912 // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250), are relevant.
913 // Row 0 holds the score for either DS_CNT or KM_CNT and row 1 keeps
914 // the X_CNT score.
915 std::array<unsigned, 2> Scores = {0};
916
917 bool empty() const { return !Scores[0] && !Scores[1]; }
918 };
919
920 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
921 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
922
923 // Reg score for SCC.
924 unsigned SCCScore = 0;
925 // The unique instruction that has an SCC write pending, if there is one.
926 const MachineInstr *PendingSCCWrite = nullptr;
927
928 // Store representative LDS DMA operations. The only useful info here is
929 // alias info. One store is kept per unique AAInfo.
930 SmallVector<const MachineInstr *> LDSDMAStores;
931};
932
933class SIInsertWaitcntsLegacy : public MachineFunctionPass {
934public:
935 static char ID;
936 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
937
938 bool runOnMachineFunction(MachineFunction &MF) override;
939
940 StringRef getPassName() const override {
941 return "SI insert wait instructions";
942 }
943
944 void getAnalysisUsage(AnalysisUsage &AU) const override {
945 AU.setPreservesCFG();
946 AU.addRequired<MachineLoopInfoWrapperPass>();
947 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
948 AU.addUsedIfAvailable<AAResultsWrapperPass>();
949 AU.addPreserved<AAResultsWrapperPass>();
950 MachineFunctionPass::getAnalysisUsage(AU);
951 }
952};
953
954} // end anonymous namespace
955
956void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
957 InstCounterType CntTy, unsigned Score) {
958 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
959}
960
961// Return true if the subtarget is one that enables Point Sample Acceleration
962// and the MachineInstr passed in is one to which it might be applied (the
963// hardware makes this decision based on several factors, but we can't determine
964// this at compile time, so we have to assume it might be applied if the
965// instruction supports it).
966bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
967 if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
968 return false;
969
970 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
971 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
972 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
973 return BaseInfo->PointSampleAccel;
974}
975
976// Return true if the subtarget enables Point Sample Acceleration, the supplied
977// MachineInstr is one to which it might be applied, and the supplied register
978// has outstanding writes of vmem-types different from VMEM_NOSAMPLER
979// (this is the type that a point sample accelerated instruction effectively
980// becomes).
981bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
982 MCPhysReg Reg) const {
983 if (!hasPointSampleAccel(MI))
984 return false;
985
986 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
987}
988
989void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
990 InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
991 assert(T < Context->MaxCounter);
992
993 unsigned UB = getScoreUB(T);
994 unsigned CurrScore = UB + 1;
995 if (CurrScore == 0)
996 report_fatal_error("InsertWaitcnt score wraparound");
997 // PendingEvents and ScoreUB need to be updated regardless of whether this
998 // event changes the score of a register or not.
999 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
1000 PendingEvents |= 1 << E;
1001 setScoreUB(T, CurrScore);
1002
1003 const SIRegisterInfo *TRI = Context->TRI;
1004 const MachineRegisterInfo *MRI = Context->MRI;
1005 const SIInstrInfo *TII = Context->TII;
1006
1007 if (T == EXP_CNT) {
1008 // Put score on the source vgprs. If this is a store, just use those
1009 // specific register(s).
1010 if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
1011 // All GDS operations must protect their address register (same as
1012 // export.)
1013 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
1014 setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);
1015
1016 if (Inst.mayStore()) {
1017 if (const auto *Data0 =
1018 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
1019 setScoreByOperand(*Data0, EXP_CNT, CurrScore);
1020 if (const auto *Data1 =
1021 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
1022 setScoreByOperand(*Data1, EXP_CNT, CurrScore);
1023 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
1024 Inst.getOpcode() != AMDGPU::DS_APPEND &&
1025 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
1026 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1027 for (const MachineOperand &Op : Inst.all_uses()) {
1028 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1029 setScoreByOperand(Op, EXP_CNT, CurrScore);
1030 }
1031 }
1032 } else if (TII->isFLAT(Inst)) {
1033 if (Inst.mayStore()) {
1034 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1035 EXP_CNT, CurrScore);
1036 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1037 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1038 EXP_CNT, CurrScore);
1039 }
1040 } else if (TII->isMIMG(Inst)) {
1041 if (Inst.mayStore()) {
1042 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
1043 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1044 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1045 EXP_CNT, CurrScore);
1046 }
1047 } else if (TII->isMTBUF(Inst)) {
1048 if (Inst.mayStore())
1049 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
1050 } else if (TII->isMUBUF(Inst)) {
1051 if (Inst.mayStore()) {
1052 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
1053 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1054 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1055 EXP_CNT, CurrScore);
1056 }
1057 } else if (TII->isLDSDIR(Inst)) {
1058 // LDSDIR instructions attach the score to the destination.
1059 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
1060 EXP_CNT, CurrScore);
1061 } else {
1062 if (TII->isEXP(Inst)) {
1063 // For export the destination registers are really temps that
1064 // can be used as the actual source after export patching, so
1065 // we need to treat them like sources and set the EXP_CNT
1066 // score.
1067 for (MachineOperand &DefMO : Inst.all_defs()) {
1068 if (TRI->isVGPR(*MRI, DefMO.getReg())) {
1069 setScoreByOperand(DefMO, EXP_CNT, CurrScore);
1070 }
1071 }
1072 }
1073 for (const MachineOperand &Op : Inst.all_uses()) {
1074 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1075 setScoreByOperand(Op, EXP_CNT, CurrScore);
1076 }
1077 }
1078 } else if (T == X_CNT) {
1079 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1080 if (PendingEvents & (1 << OtherEvent)) {
1081 // Hardware inserts an implicit xcnt between interleaved
1082 // SMEM and VMEM operations. So there will never be
1083 // outstanding address translations for both SMEM and
1084 // VMEM at the same time.
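 // Hence all earlier events on this counter can be treated as complete:
 // raise the lower bound so that only the current event remains outstanding,
 // and drop the other group's pending bit.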
1085 setScoreLB(T, getScoreUB(T) - 1);
1086 PendingEvents &= ~(1 << OtherEvent);
1087 }
1088 for (const MachineOperand &Op : Inst.all_uses())
1089 setScoreByOperand(Op, T, CurrScore);
1090 } else if (T == VA_VDST || T == VM_VSRC) {
1091 // Match the score to the VGPR destination or source registers as
1092 // appropriate
1093 for (const MachineOperand &Op : Inst.operands()) {
1094 if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
1095 (T == VM_VSRC && Op.isDef()))
1096 continue;
1097 if (TRI->isVectorRegister(*Context->MRI, Op.getReg()))
1098 setScoreByOperand(Op, T, CurrScore);
1099 }
1100 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1101 // Match the score to the destination registers.
1102 //
1103 // Check only explicit operands. Stores, especially spill stores, include
1104 // implicit uses and defs of their super registers which would create an
1105 // artificial dependency, while these are there only for register liveness
1106 // accounting purposes.
1107 //
1108 // Special cases where implicit register defs exists, such as M0 or VCC,
1109 // but none with memory instructions.
1110 for (const MachineOperand &Op : Inst.defs()) {
1111 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1112 if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper
1113 continue;
1114 if (updateVMCntOnly(Inst)) {
1115 // updateVMCntOnly should only leave us with VGPRs:
1116 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1117 // defs. That's required for a sane index into `VMEMTypes` below.
1118 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
1119 VmemType V = getVmemType(Inst);
1120 unsigned char TypesMask = 1 << V;
1121 // If instruction can have Point Sample Accel applied, we have to flag
1122 // this with another potential dependency
1123 if (hasPointSampleAccel(Inst))
1124 TypesMask |= 1 << VMEM_NOSAMPLER;
1125 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1126 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1127 }
1128 }
1129 setScoreByOperand(Op, T, CurrScore);
1130 }
1131 if (Inst.mayStore() &&
1132 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
1133 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS
1134 // they write can be accessed. A load from LDS to VMEM does not need a wait.
1135 //
1136 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1137 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1138 // store. The "Slot" is the index into LDSDMAStores + 1.
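 // For example, Slot == 2 refers to LDSDMAStores[1] and its score is kept
 // under VMEMID LDSDMA_BEGIN + 2.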
1139 unsigned Slot = 0;
1140 for (const auto *MemOp : Inst.memoperands()) {
1141 if (!MemOp->isStore() ||
1142 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1143 continue;
1144 // Comparing just AA info does not guarantee memoperands are equal
1145 // in general, but this is so for LDS DMA in practice.
1146 auto AAI = MemOp->getAAInfo();
1147 // Alias scope information gives a way to definitively identify an
1148 // original memory object, and in practice it is produced by the module
1149 // LDS lowering pass. If there is no scope available we will not be able
1150 // to disambiguate LDS aliasing, as after the module lowering all LDS
1151 // is squashed into a single big object.
1152 if (!AAI || !AAI.Scope)
1153 break;
1154 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1155 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1156 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1157 Slot = I + 1;
1158 break;
1159 }
1160 }
1161 }
1162 if (Slot)
1163 break;
1164 // The slot may not be valid because it can be >= NUM_LDSDMA which
1165 // means the scoreboard cannot track it. We still want to preserve the
1166 // MI in order to check alias information, though.
1167 LDSDMAStores.push_back(&Inst);
1168 Slot = LDSDMAStores.size();
1169 break;
1170 }
1171 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1172 if (Slot && Slot < NUM_LDSDMA)
1173 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1174 }
1175
1177 setRegScore(AMDGPU::SCC, T, CurrScore);
1178 PendingSCCWrite = &Inst;
1179 }
1180 }
1181}
1182
1183void WaitcntBrackets::print(raw_ostream &OS) const {
1184 const GCNSubtarget *ST = Context->ST;
1185
1186 OS << '\n';
1187 for (auto T : inst_counter_types(Context->MaxCounter)) {
1188 unsigned SR = getScoreRange(T);
1189
1190 switch (T) {
1191 case LOAD_CNT:
1192 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1193 << SR << "):";
1194 break;
1195 case DS_CNT:
1196 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1197 << SR << "):";
1198 break;
1199 case EXP_CNT:
1200 OS << " EXP_CNT(" << SR << "):";
1201 break;
1202 case STORE_CNT:
1203 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1204 << SR << "):";
1205 break;
1206 case SAMPLE_CNT:
1207 OS << " SAMPLE_CNT(" << SR << "):";
1208 break;
1209 case BVH_CNT:
1210 OS << " BVH_CNT(" << SR << "):";
1211 break;
1212 case KM_CNT:
1213 OS << " KM_CNT(" << SR << "):";
1214 break;
1215 case X_CNT:
1216 OS << " X_CNT(" << SR << "):";
1217 break;
1218 case VA_VDST:
1219 OS << " VA_VDST(" << SR << "): ";
1220 break;
1221 case VM_VSRC:
1222 OS << " VM_VSRC(" << SR << "): ";
1223 break;
1224 default:
1225 OS << " UNKNOWN(" << SR << "):";
1226 break;
1227 }
1228
1229 if (SR != 0) {
1230 // Print vgpr scores.
1231 unsigned LB = getScoreLB(T);
1232
1233 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1234 sort(SortedVMEMIDs);
1235
1236 for (auto ID : SortedVMEMIDs) {
1237 unsigned RegScore = VMem.at(ID).Scores[T];
1238 if (RegScore <= LB)
1239 continue;
1240 unsigned RelScore = RegScore - LB - 1;
1241 if (ID < REGUNITS_END) {
1242 OS << ' ' << RelScore << ":vRU" << ID;
1243 } else {
1244 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1245 "Unhandled/unexpected ID value!");
1246 OS << ' ' << RelScore << ":LDSDMA" << ID;
1247 }
1248 }
1249
1250 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1251 if (isSmemCounter(T)) {
1252 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1253 sort(SortedSMEMIDs);
1254 for (auto ID : SortedSMEMIDs) {
1255 unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)];
1256 if (RegScore <= LB)
1257 continue;
1258 unsigned RelScore = RegScore - LB - 1;
1259 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1260 }
1261 }
1262
1263 if (T == KM_CNT && SCCScore > 0)
1264 OS << ' ' << SCCScore << ":scc";
1265 }
1266 OS << '\n';
1267 }
1268
1269 OS << "Pending Events: ";
1270 if (hasPendingEvent()) {
1271 ListSeparator LS;
1272 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1273 if (hasPendingEvent((WaitEventType)I)) {
1274 OS << LS << WaitEventTypeName[I];
1275 }
1276 }
1277 } else {
1278 OS << "none";
1279 }
1280 OS << '\n';
1281
1282 OS << '\n';
1283}
1284
1285/// Simplify \p UpdateWait by removing waits that are redundant based on the
1286/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1287void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1288 AMDGPU::Waitcnt &UpdateWait) const {
1289 simplifyWaitcnt(LOAD_CNT, UpdateWait.LoadCnt);
1290 simplifyWaitcnt(EXP_CNT, UpdateWait.ExpCnt);
1291 simplifyWaitcnt(DS_CNT, UpdateWait.DsCnt);
1292 simplifyWaitcnt(STORE_CNT, UpdateWait.StoreCnt);
1293 simplifyWaitcnt(SAMPLE_CNT, UpdateWait.SampleCnt);
1294 simplifyWaitcnt(BVH_CNT, UpdateWait.BvhCnt);
1295 simplifyWaitcnt(KM_CNT, UpdateWait.KmCnt);
1296 simplifyXcnt(CheckWait, UpdateWait);
1297 simplifyWaitcnt(VA_VDST, UpdateWait.VaVdst);
1298 simplifyVmVsrc(CheckWait, UpdateWait);
1299}
1300
1301void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1302 unsigned &Count) const {
1303 // The number of outstanding events for this type, T, can be calculated
1304 // as (UB - LB). If the current Count is greater than or equal to the number
1305 // of outstanding events, then the wait for this counter is redundant.
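 // For example, with three outstanding events on LOAD_CNT, a wait of
 // loadcnt(3) or more is a no-op, so Count is reset to ~0u ("no wait").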
1306 if (Count >= getScoreRange(T))
1307 Count = ~0u;
1308}
1309
1310void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1311 AMDGPU::Waitcnt &UpdateWait) const {
1312 // Waiting for some counters implies waiting for VM_VSRC, since an
1313 // instruction that decrements a counter on completion would have
1314 // decremented VM_VSRC once its VGPR operands had been read.
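 // For example, if the instruction already waits for loadcnt(0), every
 // outstanding load has completed and has therefore read its VGPR sources,
 // so a separate VM_VSRC wait can be dropped.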
1315 if (CheckWait.VmVsrc >=
1316 std::min({CheckWait.LoadCnt, CheckWait.StoreCnt, CheckWait.SampleCnt,
1317 CheckWait.BvhCnt, CheckWait.DsCnt}))
1318 UpdateWait.VmVsrc = ~0u;
1319 simplifyWaitcnt(VM_VSRC, UpdateWait.VmVsrc);
1320}
1321
1322void WaitcntBrackets::purgeEmptyTrackingData() {
1323 for (auto &[K, V] : make_early_inc_range(VMem)) {
1324 if (V.empty())
1325 VMem.erase(K);
1326 }
1327 for (auto &[K, V] : make_early_inc_range(SGPRs)) {
1328 if (V.empty())
1329 SGPRs.erase(K);
1330 }
1331}
1332
1333void WaitcntBrackets::determineWaitForScore(InstCounterType T,
1334 unsigned ScoreToWait,
1335 AMDGPU::Waitcnt &Wait) const {
1336 const unsigned LB = getScoreLB(T);
1337 const unsigned UB = getScoreUB(T);
1338
1339 // If the score falls within the bracket, we need a waitcnt.
1340 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1341 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1342 !Context->ST->hasFlatLgkmVMemCountInOrder()) {
1343 // If there is a pending FLAT operation, and this is a VMem or LGKM
1344 // waitcnt and the target can report early completion, then we need
1345 // to force a waitcnt 0.
1346 addWait(Wait, T, 0);
1347 } else if (counterOutOfOrder(T)) {
1348 // The counter can get decremented out-of-order when there
1349 // are multiple event types in the bracket, so emit an s_wait
1350 // with a conservative value of 0 for the counter.
1351 addWait(Wait, T, 0);
1352 } else {
1353 // If a counter has been maxed out avoid overflow by waiting for
1354 // MAX(CounterType) - 1 instead.
1355 unsigned NeededWait =
1356 std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
1357 addWait(Wait, T, NeededWait);
1358 }
1359 }
1360}
1361
1362void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
1363 AMDGPU::Waitcnt &Wait) const {
1364 if (Reg == AMDGPU::SCC) {
1365 determineWaitForScore(T, SCCScore, Wait);
1366 } else {
1367 bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg);
1368 for (MCRegUnit RU : regunits(Reg))
1369 determineWaitForScore(
1370 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1371 Wait);
1372 }
1373}
1374
1375void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
1376 AMDGPU::Waitcnt &Wait) const {
1377 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1378 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1379}
1380
1381void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1382 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1383 // SCC has landed
1384 if (PendingSCCWrite &&
1385 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1386 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1387 unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
1388 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1389 if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
1390 SCC_WRITE_PendingEvent) {
1391 setScoreLB(KM_CNT, getScoreUB(KM_CNT));
1392 }
1393
1394 PendingEvents &= ~SCC_WRITE_PendingEvent;
1395 PendingSCCWrite = nullptr;
1396 }
1397}
1398
1399void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1400 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1401 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1402 applyWaitcnt(DS_CNT, Wait.DsCnt);
1403 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1404 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1405 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1406 applyWaitcnt(KM_CNT, Wait.KmCnt);
1407 applyWaitcnt(X_CNT, Wait.XCnt);
1408 applyWaitcnt(VA_VDST, Wait.VaVdst);
1409 applyWaitcnt(VM_VSRC, Wait.VmVsrc);
1410}
1411
1412void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1413 const unsigned UB = getScoreUB(T);
1414 if (Count >= UB)
1415 return;
1416 if (Count != 0) {
1417 if (counterOutOfOrder(T))
1418 return;
1419 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1420 } else {
1421 setScoreLB(T, UB);
1422 PendingEvents &= ~Context->WaitEventMaskForInst[T];
1423 }
1424
1425 if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1426 if (!hasMixedPendingEvents(X_CNT))
1427 applyWaitcnt(X_CNT, 0);
1428 else
1429 PendingEvents &= ~(1 << SMEM_GROUP);
1430 }
1431 if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1432 !hasPendingEvent(STORE_CNT)) {
1433 if (!hasMixedPendingEvents(X_CNT))
1434 applyWaitcnt(X_CNT, Count);
1435 else if (Count == 0)
1436 PendingEvents &= ~(1 << VMEM_GROUP);
1437 }
1438}
1439
1440bool WaitcntBrackets::hasRedundantXCntWithKmCnt(
1441 const AMDGPU::Waitcnt &Wait) const {
1442 // A wait on XCNT is redundant if we are already waiting for an SMEM load to
1443 // complete. SMEM can return out of order, so only omit the XCNT wait if we
1444 // are waiting till zero.
1445 return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
1446}
1447
1448bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(
1449 const AMDGPU::Waitcnt &Wait) const {
1450 // If we have a pending store we cannot optimize XCnt because we do not wait
1451 // for stores. VMEM loads return in order, so if we only have loads XCnt is
1452 // decremented to the same number as LOADCnt.
1453 return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1454 !hasPendingEvent(STORE_CNT);
1455}
1456
1457void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1458 AMDGPU::Waitcnt &UpdateWait) const {
1459 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1460 // optimizations. On entry to a block with multiple predecessors, there may
1461 // be pending SMEM and VMEM events active at the same time.
1462 // In such cases, only clear one active event at a time.
1463 // TODO: Revisit xcnt optimizations for gfx1250.
1464 if (hasRedundantXCntWithKmCnt(CheckWait))
1465 UpdateWait.XCnt = ~0u;
1466 if (canOptimizeXCntWithLoadCnt(CheckWait) &&
1467 CheckWait.XCnt >= CheckWait.LoadCnt)
1468 UpdateWait.XCnt = ~0u;
1469 simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
1470}
1471
1472// Where there are multiple types of event in the bracket of a counter,
1473// the decrement may go out of order.
1474bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1475 // Scalar memory reads can always go out of order.
1476 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1477 (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1478 return true;
1479
1480 // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
1481 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
1482 // out-of-order completion.
1483 if (T == LOAD_CNT) {
1484 unsigned Events = hasPendingEvent(T);
1485 // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
1486 // events
1487 Events &= ~(1 << GLOBAL_INV_ACCESS);
1488 // Return true only if there are still multiple event types after removing
1489 // GLOBAL_INV
1490 return Events & (Events - 1);
1491 }
1492
1493 return hasMixedPendingEvents(T);
1494}
1495
1496INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1497 false, false)
1500INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1501 false, false)
1502
1503char SIInsertWaitcntsLegacy::ID = 0;
1504
1505char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1506
1507FunctionPass *llvm::createSIInsertWaitcntsPass() {
1508 return new SIInsertWaitcntsLegacy();
1509}
1510
1511static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1512 unsigned NewEnc) {
1513 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1514 assert(OpIdx >= 0);
1515
1516 MachineOperand &MO = MI.getOperand(OpIdx);
1517
1518 if (NewEnc == MO.getImm())
1519 return false;
1520
1521 MO.setImm(NewEnc);
1522 return true;
1523}
1524
1525/// Determine if \p Opcode is a gfx12+ single-counter S_WAIT_*CNT instruction,
1526/// and if so, which counter it is waiting on.
1527static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1528 switch (Opcode) {
1529 case AMDGPU::S_WAIT_LOADCNT:
1530 return LOAD_CNT;
1531 case AMDGPU::S_WAIT_EXPCNT:
1532 return EXP_CNT;
1533 case AMDGPU::S_WAIT_STORECNT:
1534 return STORE_CNT;
1535 case AMDGPU::S_WAIT_SAMPLECNT:
1536 return SAMPLE_CNT;
1537 case AMDGPU::S_WAIT_BVHCNT:
1538 return BVH_CNT;
1539 case AMDGPU::S_WAIT_DSCNT:
1540 return DS_CNT;
1541 case AMDGPU::S_WAIT_KMCNT:
1542 return KM_CNT;
1543 case AMDGPU::S_WAIT_XCNT:
1544 return X_CNT;
1545 default:
1546 return {};
1547 }
1548}
1549
1550bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1551 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1552 if (Opcode == Waitcnt->getOpcode())
1553 return false;
1554
1555 Waitcnt->setDesc(TII->get(Opcode));
1556 return true;
1557}
1558
1559/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1560/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1561/// from \p Wait that were added by previous passes. Currently this pass
1562/// conservatively assumes that these preexisting waits are required for
1563/// correctness.
1564bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1565 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1566 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1567 assert(ST);
1568 assert(isNormalMode(MaxCounter));
1569
1570 bool Modified = false;
1571 MachineInstr *WaitcntInstr = nullptr;
1572 MachineInstr *WaitcntVsCntInstr = nullptr;
1573
1574 LLVM_DEBUG({
1575 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1576 if (It == OldWaitcntInstr.getParent()->instr_end())
1577 dbgs() << "end of block\n";
1578 else
1579 dbgs() << *It;
1580 });
1581
1582 for (auto &II :
1583 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1584 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1585 if (II.isMetaInstruction()) {
1586 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1587 continue;
1588 }
1589
1590 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1591 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1592
1593 // Update required wait count. If this is a soft waitcnt (= it was added
1594 // by an earlier pass), it may be entirely removed.
1595 if (Opcode == AMDGPU::S_WAITCNT) {
1596 unsigned IEnc = II.getOperand(0).getImm();
1597 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1598 if (TrySimplify)
1599 ScoreBrackets.simplifyWaitcnt(OldWait);
1600 Wait = Wait.combined(OldWait);
1601
1602 // Merge consecutive waitcnt of the same type by erasing multiples.
1603 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1604 II.eraseFromParent();
1605 Modified = true;
1606 } else
1607 WaitcntInstr = &II;
1608 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1609 assert(ST->hasVMemToLDSLoad());
1610 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1611 << "Before: " << Wait << '\n';);
1612 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
1613 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1614
1615 // It is possible (but unlikely) that this is the only wait instruction,
1616 // in which case, we exit this loop without a WaitcntInstr to consume
1617 // `Wait`. But that works because `Wait` was passed in by reference, and
1618 // the callee eventually calls createNewWaitcnt on it. We test this
1619 // possibility in an articial MIR test since such a situation cannot be
1620 // recreated by running the memory legalizer.
1621 II.eraseFromParent();
1622 } else {
1623 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1624 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1625
1626 unsigned OldVSCnt =
1627 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1628 if (TrySimplify)
1629 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1630 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1631
1632 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1633 II.eraseFromParent();
1634 Modified = true;
1635 } else
1636 WaitcntVsCntInstr = &II;
1637 }
1638 }
1639
1640 if (WaitcntInstr) {
1641 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1642 AMDGPU::encodeWaitcnt(IV, Wait));
1643 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1644
1645 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1646 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1647 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1648 Wait.LoadCnt = ~0u;
1649 Wait.ExpCnt = ~0u;
1650 Wait.DsCnt = ~0u;
1651
1652 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1653 ? dbgs()
1654 << "applied pre-existing waitcnt\n"
1655 << "New Instr at block end: " << *WaitcntInstr << '\n'
1656 : dbgs() << "applied pre-existing waitcnt\n"
1657 << "Old Instr: " << *It
1658 << "New Instr: " << *WaitcntInstr << '\n');
1659 }
1660
1661 if (WaitcntVsCntInstr) {
1662 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1663 AMDGPU::OpName::simm16, Wait.StoreCnt);
1664 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1665
1666 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1667 Wait.StoreCnt = ~0u;
1668
1669 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1670 ? dbgs() << "applied pre-existing waitcnt\n"
1671 << "New Instr at block end: " << *WaitcntVsCntInstr
1672 << '\n'
1673 : dbgs() << "applied pre-existing waitcnt\n"
1674 << "Old Instr: " << *It
1675 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1676 }
1677
1678 return Modified;
1679}
1680
1681/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1682/// required counters in \p Wait
1683bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1684 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1685 AMDGPU::Waitcnt Wait) {
1686 assert(ST);
1687 assert(isNormalMode(MaxCounter));
1688
1689 bool Modified = false;
1690 const DebugLoc &DL = Block.findDebugLoc(It);
1691
1692 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1693 // single instruction while VScnt has its own instruction.
1694 if (Wait.hasWaitExceptStoreCnt()) {
1695 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1696 [[maybe_unused]] auto SWaitInst =
1697 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1698 Modified = true;
1699
1700 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1701 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1702 dbgs() << "New Instr: " << *SWaitInst << '\n');
1703 }
1704
1705 if (Wait.hasWaitStoreCnt()) {
1706 assert(ST->hasVscnt());
1707
1708 [[maybe_unused]] auto SWaitInst =
1709 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1710 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1711 .addImm(Wait.StoreCnt);
1712 Modified = true;
1713
1714 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1715 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1716 dbgs() << "New Instr: " << *SWaitInst << '\n');
1717 }
1718
1719 return Modified;
1720}
1721
1722AMDGPU::Waitcnt
1723WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1724 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1725}
1726
1727AMDGPU::Waitcnt
1728WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1729 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
1730 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1731 ~0u /* XCNT */, ExpertVal, ExpertVal);
1732}
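
// Illustrative sketch, not part of the pass itself: in this file a count of
// ~0u means "no wait needed" on that counter, and combining two waits takes
// the per-counter minimum, since a smaller count is the stricter
// requirement. A toy two-counter model of that convention (the real
// AMDGPU::Waitcnt has more fields):
namespace waitcnt_examples {
struct ToyWait {
  unsigned LoadCnt = ~0u; // ~0u == no wait required on this counter
  unsigned DsCnt = ~0u;
  constexpr ToyWait combined(const ToyWait &Other) const {
    // Element-wise minimum: the stricter of the two requirements wins.
    return {LoadCnt < Other.LoadCnt ? LoadCnt : Other.LoadCnt,
            DsCnt < Other.DsCnt ? DsCnt : Other.DsCnt};
  }
};
static_assert(ToyWait{0, ~0u}.combined(ToyWait{~0u, 3}).LoadCnt == 0,
              "loadcnt(0) is kept");
static_assert(ToyWait{0, ~0u}.combined(ToyWait{~0u, 3}).DsCnt == 3,
              "dscnt(3) is kept");
} // namespace waitcnt_examples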
1733
1734/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1735/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1736/// were added by previous passes. Currently this pass conservatively
1737/// assumes that these preexisting waits are required for correctness.
1738bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1739 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1740 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1741 assert(ST);
1742 assert(!isNormalMode(MaxCounter));
1743
1744 bool Modified = false;
1745 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1746 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1747 MachineInstr *WaitcntDepctrInstr = nullptr;
1748 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1749
1750 LLVM_DEBUG({
1751 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1752 if (It == OldWaitcntInstr.getParent()->instr_end())
1753 dbgs() << "end of block\n";
1754 else
1755 dbgs() << *It;
1756 });
1757
1758 // Accumulate waits that should not be simplified.
1759 AMDGPU::Waitcnt RequiredWait;
1760
1761 for (auto &II :
1762 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1763 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1764 if (II.isMetaInstruction()) {
1765 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1766 continue;
1767 }
1768
1769 MachineInstr **UpdatableInstr;
1770
1771 // Update required wait count. If this is a soft waitcnt (= it was added
1772 // by an earlier pass), it may be entirely removed.
1773
1774 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1775 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1776
1777 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1778 // attempt to do more than that either.
1779 if (Opcode == AMDGPU::S_WAITCNT)
1780 continue;
1781
1782 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1783 unsigned OldEnc =
1784 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1785 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1786 if (TrySimplify)
1787 Wait = Wait.combined(OldWait);
1788 else
1789 RequiredWait = RequiredWait.combined(OldWait);
1790 UpdatableInstr = &CombinedLoadDsCntInstr;
1791 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1792 unsigned OldEnc =
1793 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1794 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1795 if (TrySimplify)
1796 Wait = Wait.combined(OldWait);
1797 else
1798 RequiredWait = RequiredWait.combined(OldWait);
1799 UpdatableInstr = &CombinedStoreDsCntInstr;
1800 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1801 unsigned OldEnc =
1802 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1803 AMDGPU::Waitcnt OldWait;
1804 OldWait.VaVdst = AMDGPU::DepCtr::decodeFieldVaVdst(OldEnc);
1805 OldWait.VmVsrc = AMDGPU::DepCtr::decodeFieldVmVsrc(OldEnc);
1806 if (TrySimplify)
1807 ScoreBrackets.simplifyWaitcnt(OldWait);
1808 Wait = Wait.combined(OldWait);
1809 UpdatableInstr = &WaitcntDepctrInstr;
1810 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1811 // Architectures higher than GFX10 do not have direct loads to
1812 // LDS, so no work required here yet.
1813 II.eraseFromParent();
1814 continue;
1815 } else {
1816 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1817 assert(CT.has_value());
1818 unsigned OldCnt =
1819 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1820 if (TrySimplify)
1821 addWait(Wait, CT.value(), OldCnt);
1822 else
1823 addWait(RequiredWait, CT.value(), OldCnt);
1824 UpdatableInstr = &WaitInstrs[CT.value()];
1825 }
1826
1827 // Merge consecutive waitcnt of the same type by erasing multiples.
1828 if (!*UpdatableInstr) {
1829 *UpdatableInstr = &II;
1830 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1831 // S_WAITCNT_DEPCTR requires special care. Don't remove a
1832 // duplicate if it is waiting on things other than VA_VDST or
1833 // VM_VSRC. If that is the case, just make sure the VA_VDST and
1834 // VM_VSRC subfields of the operand are set to the "no wait"
1835 // values.
1836
1837 unsigned Enc = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1838 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
1839 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
1840
1841 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST)) {
1842 Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
1843 Modified |= promoteSoftWaitCnt(&II);
1844 } else {
1845 II.eraseFromParent();
1846 Modified = true;
1847 }
1848 } else {
1849 II.eraseFromParent();
1850 Modified = true;
1851 }
1852 }
1853
1854 ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
1855 Wait = Wait.combined(RequiredWait);
1856
1857 if (CombinedLoadDsCntInstr) {
1858 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1859 // to be waited for. Otherwise, let the instruction be deleted so
1860 // the appropriate single counter wait instruction can be inserted
1861 // instead, when new S_WAIT_*CNT instructions are inserted by
1862 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1863 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1864 // the loop below that deals with single counter instructions.
1865 //
1866 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
1867 // instructions that have decremented LOAD_CNT or DS_CNT on completion
1868 // will have needed to wait for their register sources to be available
1869 // first.
1870 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1871 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1872 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1873 AMDGPU::OpName::simm16, NewEnc);
1874 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1875 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1876 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1877 Wait.LoadCnt = ~0u;
1878 Wait.DsCnt = ~0u;
1879
1880 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1881 ? dbgs() << "applied pre-existing waitcnt\n"
1882 << "New Instr at block end: "
1883 << *CombinedLoadDsCntInstr << '\n'
1884 : dbgs() << "applied pre-existing waitcnt\n"
1885 << "Old Instr: " << *It << "New Instr: "
1886 << *CombinedLoadDsCntInstr << '\n');
1887 } else {
1888 CombinedLoadDsCntInstr->eraseFromParent();
1889 Modified = true;
1890 }
1891 }
1892
1893 if (CombinedStoreDsCntInstr) {
1894 // Similarly for S_WAIT_STORECNT_DSCNT.
1895 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1896 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1897 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1898 AMDGPU::OpName::simm16, NewEnc);
1899 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1900 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1901 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1902 Wait.StoreCnt = ~0u;
1903 Wait.DsCnt = ~0u;
1904
1905 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1906 ? dbgs() << "applied pre-existing waitcnt\n"
1907 << "New Instr at block end: "
1908 << *CombinedStoreDsCntInstr << '\n'
1909 : dbgs() << "applied pre-existing waitcnt\n"
1910 << "Old Instr: " << *It << "New Instr: "
1911 << *CombinedStoreDsCntInstr << '\n');
1912 } else {
1913 CombinedStoreDsCntInstr->eraseFromParent();
1914 Modified = true;
1915 }
1916 }
1917
1918 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1919 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1920 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1921 // instructions so that createNewWaitcnt() will create new combined
1922 // instructions to replace them.
1923
1924 if (Wait.DsCnt != ~0u) {
1925 // This is a vector of addresses in WaitInstrs pointing to instructions
1926 // that should be removed if they are present.
1927 SmallVector<MachineInstr **, 2> WaitsToErase;
1928
1929 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1930 // both) need to be waited for, ensure that there are no existing
1931 // individual wait count instructions for these.
1932
1933 if (Wait.LoadCnt != ~0u) {
1934 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1935 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1936 } else if (Wait.StoreCnt != ~0u) {
1937 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1938 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1939 }
1940
1941 for (MachineInstr **WI : WaitsToErase) {
1942 if (!*WI)
1943 continue;
1944
1945 (*WI)->eraseFromParent();
1946 *WI = nullptr;
1947 Modified = true;
1948 }
1949 }
1950
1951 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1952 if (!WaitInstrs[CT])
1953 continue;
1954
1955 unsigned NewCnt = getWait(Wait, CT);
1956 if (NewCnt != ~0u) {
1957 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1958 AMDGPU::OpName::simm16, NewCnt);
1959 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1960
1961 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1962 setNoWait(Wait, CT);
1963
1964 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1965 ? dbgs() << "applied pre-existing waitcnt\n"
1966 << "New Instr at block end: " << *WaitInstrs[CT]
1967 << '\n'
1968 : dbgs() << "applied pre-existing waitcnt\n"
1969 << "Old Instr: " << *It
1970 << "New Instr: " << *WaitInstrs[CT] << '\n');
1971 } else {
1972 WaitInstrs[CT]->eraseFromParent();
1973 Modified = true;
1974 }
1975 }
1976
1977 if (WaitcntDepctrInstr) {
1978 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
1979 // subfields with the new required values.
1980 unsigned Enc =
1981 TII->getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
1982 ->getImm();
1983 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, Wait.VmVsrc);
1984 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
1985
1986 ScoreBrackets.applyWaitcnt(VA_VDST, Wait.VaVdst);
1987 ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.VmVsrc);
1988 Wait.VaVdst = ~0u;
1989 Wait.VmVsrc = ~0u;
1990
1991 // If that new encoded Depctr immediate would actually still wait
1992 // for anything, update the instruction's operand. Otherwise it can
1993 // just be deleted.
1994 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST)) {
1995 Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
1996 AMDGPU::OpName::simm16, Enc);
1997 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1998 ? dbgs() << "applyPreexistingWaitcnt\n"
1999 << "New Instr at block end: "
2000 << *WaitcntDepctrInstr << '\n'
2001 : dbgs() << "applyPreexistingWaitcnt\n"
2002 << "Old Instr: " << *It
2003 << "New Instr: " << *WaitcntDepctrInstr << '\n');
2004 } else {
2005 WaitcntDepctrInstr->eraseFromParent();
2006 Modified = true;
2007 }
2008 }
2009
2010 return Modified;
2011}
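
// Illustrative sketch, not part of the pass itself: the S_WAITCNT_DEPCTR
// handling above rewrites only the VA_VDST and VM_VSRC subfields of a packed
// immediate and deletes the instruction when the result equals the default
// "wait for nothing" encoding. A toy packed-field example with a
// hypothetical layout (not the real DEPCTR bit layout):
namespace waitcnt_examples {
constexpr unsigned setField(unsigned Enc, unsigned Shift, unsigned Width,
                            unsigned Value) {
  const unsigned Mask = ((1u << Width) - 1u) << Shift;
  return (Enc & ~Mask) | ((Value << Shift) & Mask);
}
// Start from an encoding that also waits on an unrelated subfield (low
// bits), then force the two fields of interest to their "no wait" maxima.
// The unrelated wait survives, so the instruction would be kept, not erased.
static_assert(setField(setField(0x013u, /*Shift=*/4, /*Width=*/4, 0xFu),
                       /*Shift=*/8, /*Width=*/2, 0x3u) == 0x3F3u,
              "unrelated low bits are preserved");
} // namespace waitcnt_examples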
2012
2013/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
2014bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2015 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2016 AMDGPU::Waitcnt Wait) {
2017 assert(ST);
2018 assert(!isNormalMode(MaxCounter));
2019
2020 bool Modified = false;
2021 const DebugLoc &DL = Block.findDebugLoc(It);
2022
2023 // Check for opportunities to use combined wait instructions.
2024 if (Wait.DsCnt != ~0u) {
2025 MachineInstr *SWaitInst = nullptr;
2026
2027 if (Wait.LoadCnt != ~0u) {
2028 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2029
2030 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2031 .addImm(Enc);
2032
2033 Wait.LoadCnt = ~0u;
2034 Wait.DsCnt = ~0u;
2035 } else if (Wait.StoreCnt != ~0u) {
2036 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2037
2038 SWaitInst =
2039 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
2040 .addImm(Enc);
2041
2042 Wait.StoreCnt = ~0u;
2043 Wait.DsCnt = ~0u;
2044 }
2045
2046 if (SWaitInst) {
2047 Modified = true;
2048
2049 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2050 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2051 dbgs() << "New Instr: " << *SWaitInst << '\n');
2052 }
2053 }
2054
2055 // Generate an instruction for any remaining counter that needs
2056 // waiting for.
2057
2058 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2059 unsigned Count = getWait(Wait, CT);
2060 if (Count == ~0u)
2061 continue;
2062
2063 [[maybe_unused]] auto SWaitInst =
2064 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
2065 .addImm(Count);
2066
2067 Modified = true;
2068
2069 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2070 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2071 dbgs() << "New Instr: " << *SWaitInst << '\n');
2072 }
2073
2074 if (Wait.hasWaitDepctr()) {
2075 assert(IsExpertMode);
2076 unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.VmVsrc, *ST);
2077 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst);
2078
2079 [[maybe_unused]] auto SWaitInst =
2080 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
2081
2082 Modified = true;
2083
2084 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2085 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2086 dbgs() << "New Instr: " << *SWaitInst << '\n');
2087 }
2088
2089 return Modified;
2090}
2091
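// Illustrative sketch, not part of the pass itself: a toy model of the
// "score bracket" idea described below. Each outstanding operation tracked
// by a counter gets a monotonically increasing score; while the bracket is
// [LB, UB], a value produced by an operation with score S can (roughly) be
// consumed safely once at most UB - S operations of that type remain
// outstanding, because operations of one type complete in order.
namespace waitcnt_examples {
constexpr unsigned neededCount(unsigned ScoreUB, unsigned ProducerScore) {
  // Number of same-type operations issued after the producer; waiting until
  // only that many remain outstanding guarantees the producer has completed.
  return ScoreUB - ProducerScore;
}
static_assert(neededCount(/*ScoreUB=*/5, /*ProducerScore=*/5) == 0,
              "newest in-flight op: needs cnt(0)");
static_assert(neededCount(/*ScoreUB=*/5, /*ProducerScore=*/3) == 2,
              "two newer ops may still be in flight: cnt(2) suffices");
} // namespace waitcnt_examples
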
2092/// Generate s_waitcnt instruction to be placed before cur_Inst.
2093/// Instructions of a given type are returned in order,
2094/// but instructions of different types can complete out of order.
2095/// We rely on this in-order completion
2096/// and simply assign a score to the memory access instructions.
2097/// We keep track of the active "score bracket" to determine
2098/// if an access of a memory read requires an s_waitcnt
2099/// and if so what the value of each counter is.
2100/// The "score bracket" is bound by the lower bound and upper bound
2101/// scores (*_score_LB and *_score_ub respectively).
2102/// If FlushVmCnt is true, we want to generate an s_waitcnt to
2103/// flush the vmcnt counter here.
2104bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
2105 WaitcntBrackets &ScoreBrackets,
2106 MachineInstr *OldWaitcntInstr,
2107 bool FlushVmCnt) {
2108 setForceEmitWaitcnt();
2109
2110 assert(!MI.isMetaInstruction());
2111
2112 AMDGPU::Waitcnt Wait;
2113 const unsigned Opc = MI.getOpcode();
2114
2115 // FIXME: This should have already been handled by the memory legalizer.
2116 // Removing this currently doesn't affect any lit tests, but we need to
2117 // verify that nothing was relying on this. The number of buffer invalidates
2118 // being handled here should not be expanded.
2119 if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
2120 Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
2121 Opc == AMDGPU::BUFFER_GL1_INV) {
2122 Wait.LoadCnt = 0;
2123 }
2124
2125 // All waits must be resolved at call return.
2126 // NOTE: this could be improved with knowledge of all call sites or
2127 // with knowledge of the called routines.
2128 if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
2129 Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
2130 Opc == AMDGPU::S_SETPC_B64_return) {
2131 ReturnInsts.insert(&MI);
2132 AMDGPU::Waitcnt AllZeroWait =
2133 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2134 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2135 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2136 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2137 // no need to wait for it at function boundaries.
2138 if (ST->hasExtendedWaitCounts() &&
2139 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2140 AllZeroWait.LoadCnt = ~0u;
2141 Wait = Wait.combined(AllZeroWait);
2142 }
2143 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2144 // Technically the hardware will do this on its own if we don't, but that
2145 // might cost extra cycles compared to doing it explicitly.
2146 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2147 // have to wait for outstanding VMEM stores. In this case it can be useful to
2148 // send a message to explicitly release all VGPRs before the stores have
2149 // completed, but it is only safe to do this if there are no outstanding
2150 // scratch stores.
2151 else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
2152 if (!WCG->isOptNone() &&
2153 (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
2154 (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
2155 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
2156 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
2157 ReleaseVGPRInsts.insert(&MI);
2158 }
2159 // Resolve vm waits before gs-done.
2160 else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
2161 ST->hasLegacyGeometry() &&
2162 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2163 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
2164 Wait.LoadCnt = 0;
2165 }
2166
2167 // Export & GDS instructions do not read the EXEC mask until after the export
2168 // is granted (which can occur well after the instruction is issued).
2169 // The shader program must flush all EXP operations on the export-count
2170 // before overwriting the EXEC mask.
2171 else {
2172 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
2173 // Export and GDS are tracked individually, either may trigger a waitcnt
2174 // for EXEC.
2175 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2176 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2177 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2178 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2179 Wait.ExpCnt = 0;
2180 }
2181 }
2182
2183 // Wait for any pending GDS instruction to complete before any
2184 // "Always GDS" instruction.
2185 if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2186 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
2187
2188 if (MI.isCall()) {
2189 // The function is going to insert a wait on everything in its prolog.
2190 // This still needs to be careful if the call target is a load (e.g. a GOT
2191 // load). We also need to check WAW dependency with saved PC.
2192 CallInsts.insert(&MI);
2193 Wait = AMDGPU::Waitcnt();
2194
2195 const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI);
2196 if (CallAddrOp.isReg()) {
2197 ScoreBrackets.determineWaitForPhysReg(
2198 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);
2199
2200 if (const auto *RtnAddrOp =
2201 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
2202 ScoreBrackets.determineWaitForPhysReg(
2203 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
2204 }
2205 }
2206 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2207 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2208 } else {
2209 // FIXME: Should not be relying on memoperands.
2210 // Look at the source operands of every instruction to see if
2211 // any of them results from a previous memory operation that affects
2212 // its current usage. If so, an s_waitcnt instruction needs to be
2213 // emitted.
2214 // If the source operand was defined by a load, add the s_waitcnt
2215 // instruction.
2216 //
2217 // Two cases are handled for destination operands:
2218 // 1) If the destination operand was defined by a load, add the s_waitcnt
2219 // instruction to guarantee the right WAW order.
2220 // 2) If the destination operand was used by a recent export/store
2221 // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
2222
2223 for (const MachineMemOperand *Memop : MI.memoperands()) {
2224 const Value *Ptr = Memop->getValue();
2225 if (Memop->isStore()) {
2226 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2227 addWait(Wait, SmemAccessCounter, 0);
2228 if (PDT->dominates(MI.getParent(), It->second))
2229 SLoadAddresses.erase(It);
2230 }
2231 }
2232 unsigned AS = Memop->getAddrSpace();
2233 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
2234 continue;
2235 // No need to wait before load from VMEM to LDS.
2236 if (TII->mayWriteLDSThroughDMA(MI))
2237 continue;
2238
2239 // LOAD_CNT is only relevant to vgpr or LDS.
2240 unsigned TID = LDSDMA_BEGIN;
2241 if (Ptr && Memop->getAAInfo()) {
2242 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2243 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2244 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2245 if ((I + 1) >= NUM_LDSDMA) {
2246 // We didn't have enough slot to track this LDS DMA store, it
2247 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2248 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2249 break;
2250 }
2251
2252 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
2253 }
2254 }
2255 } else {
2256 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2257 }
2258 if (Memop->isStore()) {
2259 ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
2260 }
2261 }
2262
2263 // Loop over use and def operands.
2264 for (const MachineOperand &Op : MI.operands()) {
2265 if (!Op.isReg())
2266 continue;
2267
2268 // If the instruction does not read tied source, skip the operand.
2269 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
2270 continue;
2271
2272 MCPhysReg Reg = Op.getReg().asMCReg();
2273
2274 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
2275 if (IsVGPR) {
2276 // Implicit VGPR defs and uses are never a part of the memory
2277 // instructions description and usually present to account for
2278 // super-register liveness.
2279 // TODO: Most of the other instructions also have implicit uses
2280 // for the liveness accounting only.
2281 if (Op.isImplicit() && MI.mayLoadOrStore())
2282 continue;
2283
2284 ScoreBrackets.determineWaitForPhysReg(VA_VDST, Reg, Wait);
2285 if (Op.isDef())
2286 ScoreBrackets.determineWaitForPhysReg(VM_VSRC, Reg, Wait);
2287 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2288 // previous write and this write are the same type of VMEM
2289 // instruction, in which case they are (in some architectures)
2290 // guaranteed to write their results in order anyway.
2291 // Additionally check instructions where Point Sample Acceleration
2292 // might be applied.
2293 if (Op.isUse() || !updateVMCntOnly(MI) ||
2294 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2295 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2296 !ST->hasVmemWriteVgprInOrder()) {
2297 ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
2298 ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
2299 ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
2300 ScoreBrackets.clearVgprVmemTypes(Reg);
2301 }
2302
2303 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2304 ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
2305 }
2306 ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
2307 } else if (Op.getReg() == AMDGPU::SCC) {
2308 ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
2309 } else {
2310 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
2311 }
2312
2313 if (ST->hasWaitXCnt() && Op.isDef())
2314 ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
2315 }
2316 }
2317 }
2318
2319 // Ensure safety against exceptions from outstanding memory operations while
2320 // waiting for a barrier:
2321 //
2322 // * Some subtargets safely handle backing off the barrier in hardware
2323 // when an exception occurs.
2324 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2325 // there can be no outstanding memory operations during the wait.
2326 // * Subtargets with split barriers don't need to back off the barrier; it
2327 // is up to the trap handler to preserve the user barrier state correctly.
2328 //
2329 // In all other cases, ensure safety by ensuring that there are no outstanding
2330 // memory operations.
2331 if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
2332 !ST->supportsBackOffBarrier()) {
2333 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2334 }
2335
2336 // TODO: Remove this work-around, enable the assert for Bug 457939
2337 // after fixing the scheduler. Also, the Shader Compiler code is
2338 // independent of target.
2339 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
2340 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2341 Wait.DsCnt = 0;
2342 }
2343
2344 // Verify that the wait is actually needed.
2345 ScoreBrackets.simplifyWaitcnt(Wait);
2346
2347 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2348 // waits on VA_VDST if the instruction it would precede is not a VALU
2349 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2350 // expert scheduling mode.
2351 if (TII->isVALU(MI))
2352 Wait.VaVdst = ~0u;
2353
2354 // Since the translation for VMEM addresses occur in-order, we can apply the
2355 // XCnt if the current instruction is of VMEM type and has a memory
2356 // dependency with another VMEM instruction in flight.
2357 if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
2358 ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
2359 Wait.XCnt = ~0u;
2360 }
2361
2362 // When forcing emit, skip terminators: emitting a waitcnt between the
2363 // terminators of the MBB would break the block's terminator sequence.
2364 if (ForceEmitZeroFlag && !MI.isTerminator())
2365 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2366
2367 if (ForceEmitWaitcnt[LOAD_CNT])
2368 Wait.LoadCnt = 0;
2369 if (ForceEmitWaitcnt[EXP_CNT])
2370 Wait.ExpCnt = 0;
2371 if (ForceEmitWaitcnt[DS_CNT])
2372 Wait.DsCnt = 0;
2373 if (ForceEmitWaitcnt[SAMPLE_CNT])
2374 Wait.SampleCnt = 0;
2375 if (ForceEmitWaitcnt[BVH_CNT])
2376 Wait.BvhCnt = 0;
2377 if (ForceEmitWaitcnt[KM_CNT])
2378 Wait.KmCnt = 0;
2379 if (ForceEmitWaitcnt[X_CNT])
2380 Wait.XCnt = 0;
2381 // Only force emit VA_VDST and VM_VSRC if expert mode is enabled.
2382 if (IsExpertMode) {
2383 if (ForceEmitWaitcnt[VA_VDST])
2384 Wait.VaVdst = 0;
2385 if (ForceEmitWaitcnt[VM_VSRC])
2386 Wait.VmVsrc = 0;
2387 }
2388
2389 if (FlushVmCnt) {
2390 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2391 Wait.LoadCnt = 0;
2392 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2393 Wait.SampleCnt = 0;
2394 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2395 Wait.BvhCnt = 0;
2396 }
2397
2398 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
2399 Wait.LoadCnt = 0;
2400
2401 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2402 OldWaitcntInstr);
2403}
2404
2405bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2406 MachineBasicBlock::instr_iterator It,
2407 MachineBasicBlock &Block,
2408 WaitcntBrackets &ScoreBrackets,
2409 MachineInstr *OldWaitcntInstr) {
2410 bool Modified = false;
2411
2412 if (OldWaitcntInstr)
2413 // Try to merge the required wait with preexisting waitcnt instructions.
2414 // Also erase redundant waitcnt.
2415 Modified =
2416 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2417
2418 // Any counts that could have been applied to any existing waitcnt
2419 // instructions will have been done so, now deal with any remaining.
2420 ScoreBrackets.applyWaitcnt(Wait);
2421
2422 // ExpCnt can be merged into VINTERP.
2423 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
2424 SIInstrInfo::isVINTERP(*It)) {
2425 MachineOperand *WaitExp =
2426 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2427 if (Wait.ExpCnt < WaitExp->getImm()) {
2428 WaitExp->setImm(Wait.ExpCnt);
2429 Modified = true;
2430 }
2431 Wait.ExpCnt = ~0u;
2432
2433 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2434 << "Update Instr: " << *It);
2435 }
2436
2437 if (WCG->createNewWaitcnt(Block, It, Wait))
2438 Modified = true;
2439
2440 return Modified;
2441}
2442
2443std::optional<WaitEventType>
2444SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
2445 if (TII->isVALU(Inst)) {
2446 // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
2447 // out-of-order with respect to each other, so each of these classes
2448 // has its own event.
2449
2450 if (TII->isXDL(Inst))
2451 return VGPR_XDL_WRITE;
2452
2453 if (TII->isTRANS(Inst))
2454 return VGPR_TRANS_WRITE;
2455
2457 return VGPR_DPMACC_WRITE;
2458
2459 return VGPR_CSMACC_WRITE;
2460 }
2461
2462 // FLAT and LDS instructions may read their VGPR sources out-of-order
2463 // with respect to each other and all other VMEM instructions, so
2464 // each of these also has a separate event.
2465
2466 if (TII->isFLAT(Inst))
2467 return VGPR_FLAT_READ;
2468
2469 if (TII->isDS(Inst))
2470 return VGPR_LDS_READ;
2471
2472 if (TII->isVMEM(Inst) || TII->isVIMAGE(Inst) || TII->isVSAMPLE(Inst))
2473 return VGPR_VMEM_READ;
2474
2475 // Otherwise, no hazard.
2476
2477 return {};
2478}
2479
2480bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2481 return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
2482 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2483}
2484
2485// Return true if the next instruction is S_ENDPGM, following fallthrough
2486// blocks if necessary.
2487bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2488 MachineBasicBlock *Block) const {
2489 auto BlockEnd = Block->getParent()->end();
2490 auto BlockIter = Block->getIterator();
2491
2492 while (true) {
2493 if (It.isEnd()) {
2494 if (++BlockIter != BlockEnd) {
2495 It = BlockIter->instr_begin();
2496 continue;
2497 }
2498
2499 return false;
2500 }
2501
2502 if (!It->isMetaInstruction())
2503 break;
2504
2505 It++;
2506 }
2507
2508 assert(!It.isEnd());
2509
2510 return It->getOpcode() == AMDGPU::S_ENDPGM;
2511}
2512
2513// Add a wait after an instruction if architecture requirements mandate one.
2514bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2515 MachineBasicBlock &Block,
2516 WaitcntBrackets &ScoreBrackets) {
2517 AMDGPU::Waitcnt Wait;
2518 bool NeedsEndPGMCheck = false;
2519
2520 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2521 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2522 !SIInstrInfo::isAtomicRet(Inst));
2523
2524 if (TII->isAlwaysGDS(Inst.getOpcode())) {
2525 Wait.DsCnt = 0;
2526 NeedsEndPGMCheck = true;
2527 }
2528
2529 ScoreBrackets.simplifyWaitcnt(Wait);
2530
2531 auto SuccessorIt = std::next(Inst.getIterator());
2532 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2533 /*OldWaitcntInstr=*/nullptr);
2534
2535 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2536 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
2537 .addImm(0);
2538 }
2539
2540 return Result;
2541}
2542
2543void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2544 WaitcntBrackets *ScoreBrackets) {
2545 // Now look at the instruction opcode. If it is a memory access
2546 // instruction, update the upper-bound of the appropriate counter's
2547 // bracket and the destination operand scores.
2548 // For architectures with X_CNT, mark the source address operands
2549 // with the appropriate counter values.
2550 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2551
2552 bool IsVMEMAccess = false;
2553 bool IsSMEMAccess = false;
2554
2555 if (IsExpertMode) {
2556 if (const auto ET = getExpertSchedulingEventType(Inst))
2557 ScoreBrackets->updateByEvent(*ET, Inst);
2558 }
2559
2560 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2561 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2562 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2563 ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
2564 ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
2565 ScoreBrackets->setPendingGDS();
2566 } else {
2567 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2568 }
2569 } else if (TII->isFLAT(Inst)) {
2571 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2572 return;
2573 }
2574
2575 assert(Inst.mayLoadOrStore());
2576
2577 int FlatASCount = 0;
2578
2579 if (TII->mayAccessVMEMThroughFlat(Inst)) {
2580 ++FlatASCount;
2581 IsVMEMAccess = true;
2582 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2583 }
2584
2585 if (TII->mayAccessLDSThroughFlat(Inst)) {
2586 ++FlatASCount;
2587 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2588 }
2589
2590 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2591 // pointers. They have two operands, one accessing global memory and one
2592 // accessing LDS, which at this point makes them look like they use a flat
2593 // pointer. Filter them out, and for the rest generate a dependency on flat
2594 // pointers so that both VM and LGKM counters are flushed.
2595 if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1)
2596 ScoreBrackets->setPendingFlat();
2597 } else if (SIInstrInfo::isVMEM(Inst) &&
2598 !AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2599 IsVMEMAccess = true;
2600 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2601
2602 if (ST->vmemWriteNeedsExpWaitcnt() &&
2603 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2604 ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
2605 }
2606 } else if (TII->isSMRD(Inst)) {
2607 IsSMEMAccess = true;
2608 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2609 } else if (Inst.isCall()) {
2610 // Act as a wait on everything
2611 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2612 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2613 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2614 ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
2615 } else if (TII->isVINTERP(Inst)) {
2616 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2617 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2618 } else if (SIInstrInfo::isEXP(Inst)) {
2619 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2620 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2621 ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
2622 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2623 ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
2624 else
2625 ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
2626 } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
2627 ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
2628 } else {
2629 switch (Inst.getOpcode()) {
2630 case AMDGPU::S_SENDMSG:
2631 case AMDGPU::S_SENDMSG_RTN_B32:
2632 case AMDGPU::S_SENDMSG_RTN_B64:
2633 case AMDGPU::S_SENDMSGHALT:
2634 ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
2635 break;
2636 case AMDGPU::S_MEMTIME:
2637 case AMDGPU::S_MEMREALTIME:
2638 case AMDGPU::S_GET_BARRIER_STATE_M0:
2639 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2640 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2641 break;
2642 }
2643 }
2644
2645 if (!ST->hasWaitXCnt())
2646 return;
2647
2648 if (IsVMEMAccess)
2649 ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);
2650
2651 if (IsSMEMAccess)
2652 ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
2653}
2654
2655bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2656 unsigned OtherScore) {
2657 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2658 unsigned OtherShifted =
2659 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2660 Score = std::max(MyShifted, OtherShifted);
2661 return OtherShifted > MyShifted;
2662}
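
// Illustrative sketch, not part of the pass itself: a standalone replay of
// the rebasing arithmetic in mergeScore() with concrete numbers. Scores at
// or below a bracket's lower bound are already satisfied and collapse to 0;
// live scores are shifted so that both brackets share the merged upper bound.
namespace waitcnt_examples {
constexpr unsigned rebase(unsigned Score, unsigned OldLB, unsigned Shift) {
  return Score <= OldLB ? 0 : Score + Shift;
}
// Suppose this block has [LB=4, UB=9] (5 pending) and the other block has
// [LB=2, UB=10] (8 pending). The merged UB is 4 + max(5, 8) = 12, so
// MyShift = 12 - 9 = 3 and OtherShift = 12 - 10 = 2.
static_assert(rebase(7, 4, 3) == 10, "live score from this block");
static_assert(rebase(9, 2, 2) == 11, "live score from the other block");
static_assert(rebase(3, 4, 3) == 0, "already-satisfied score collapses to 0");
} // namespace waitcnt_examples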
2663
2664/// Merge the pending events and associated score brackets of \p Other into
2665/// this bracket's status.
2666///
2667/// Returns whether the merge resulted in a change that requires tighter waits
2668/// (i.e. the merged brackets strictly dominate the original brackets).
2669bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2670 bool StrictDom = false;
2671
2672 // Check if "other" has keys we don't have, and create default entries for
2673 // those. If they remain empty after merging, we will clean it up after.
2674 for (auto K : Other.VMem.keys())
2675 VMem.try_emplace(K);
2676 for (auto K : Other.SGPRs.keys())
2677 SGPRs.try_emplace(K);
2678
2679 for (auto T : inst_counter_types(Context->MaxCounter)) {
2680 // Merge event flags for this counter
2681 const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
2682 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2683 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2684 if (OtherEvents & ~OldEvents)
2685 StrictDom = true;
2686 PendingEvents |= OtherEvents;
2687
2688 // Merge scores for this counter
2689 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2690 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2691 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2692 if (NewUB < ScoreLBs[T])
2693 report_fatal_error("waitcnt score overflow");
2694
2695 MergeInfo M;
2696 M.OldLB = ScoreLBs[T];
2697 M.OtherLB = Other.ScoreLBs[T];
2698 M.MyShift = NewUB - ScoreUBs[T];
2699 M.OtherShift = NewUB - Other.ScoreUBs[T];
2700
2701 ScoreUBs[T] = NewUB;
2702
2703 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2704
2705 if (T == DS_CNT)
2706 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2707
2708 if (T == KM_CNT) {
2709 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
2710 if (Other.hasPendingEvent(SCC_WRITE)) {
2711 unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
2712 if (!OldEventsHasSCCWrite) {
2713 PendingSCCWrite = Other.PendingSCCWrite;
2714 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
2715 PendingSCCWrite = nullptr;
2716 }
2717 }
2718 }
2719
2720 for (auto &[RegID, Info] : VMem)
2721 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
2722
2723 if (isSmemCounter(T)) {
2724 unsigned Idx = getSgprScoresIdx(T);
2725 for (auto &[RegID, Info] : SGPRs) {
2726 auto It = Other.SGPRs.find(RegID);
2727 unsigned OtherScore =
2728 (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
2729 StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore);
2730 }
2731 }
2732 }
2733
2734 for (auto &[TID, Info] : VMem) {
2735 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
2736 unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
2737 StrictDom |= NewVmemTypes != Info.VMEMTypes;
2738 Info.VMEMTypes = NewVmemTypes;
2739 }
2740 }
2741
2742 purgeEmptyTrackingData();
2743 return StrictDom;
2744}
2745
2746static bool isWaitInstr(MachineInstr &Inst) {
2747 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2748 return Opcode == AMDGPU::S_WAITCNT ||
2749 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2750 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2751 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2752 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2753 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2754 counterTypeForInstr(Opcode).has_value();
2755}
2756
2757void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
2758 MachineBasicBlock::iterator I,
2759 bool ExpertMode) const {
2760 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
2762 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32))
2763 .addImm(ExpertMode ? 2 : 0)
2764 .addImm(EncodedReg);
2765}
2766
2767// Generate s_waitcnt instructions where needed.
2768bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2769 MachineBasicBlock &Block,
2770 WaitcntBrackets &ScoreBrackets) {
2771 bool Modified = false;
2772
2773 LLVM_DEBUG({
2774 dbgs() << "*** Begin Block: ";
2775 Block.printName(dbgs());
2776 ScoreBrackets.dump();
2777 });
2778
2779 // Track the correctness of vccz through this basic block. There are two
2780 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2781 // ST->partialVCCWritesUpdateVCCZ().
2782 bool VCCZCorrect = true;
2783 if (ST->hasReadVCCZBug()) {
2784 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2785 // to vcc and then issued an smem load.
2786 VCCZCorrect = false;
2787 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2788 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2789 // to vcc_lo or vcc_hi.
2790 VCCZCorrect = false;
2791 }
2792
2793 // Walk over the instructions.
2794 MachineInstr *OldWaitcntInstr = nullptr;
2795
2796 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2797 E = Block.instr_end();
2798 Iter != E;) {
2799 MachineInstr &Inst = *Iter;
2800 if (Inst.isMetaInstruction()) {
2801 ++Iter;
2802 continue;
2803 }
2804
2805 // Track pre-existing waitcnts that were added in earlier iterations or by
2806 // the memory legalizer.
2807 if (isWaitInstr(Inst) ||
2808 (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
2809 if (!OldWaitcntInstr)
2810 OldWaitcntInstr = &Inst;
2811 ++Iter;
2812 continue;
2813 }
2814
2815 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2816 isPreheaderToFlush(Block, ScoreBrackets);
2817
2818 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2819 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2820 FlushVmCnt);
2821 OldWaitcntInstr = nullptr;
2822
2823 // Restore vccz if it's not known to be correct already.
2824 bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst);
2825
2826 // Don't examine operands unless we need to track vccz correctness.
2827 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2828 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2829 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2830 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2831 if (!ST->partialVCCWritesUpdateVCCZ())
2832 VCCZCorrect = false;
2833 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2834 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2835 // vccz bit, so when we detect that an instruction may read from a
2836 // corrupt vccz bit, we need to:
2837 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2838 // operations to complete.
2839 // 2. Restore the correct value of vccz by writing the current value
2840 // of vcc back to vcc.
2841 if (ST->hasReadVCCZBug() &&
2842 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2843 // Writes to vcc while there's an outstanding smem read may get
2844 // clobbered as soon as any read completes.
2845 VCCZCorrect = false;
2846 } else {
2847 // Writes to vcc will fix any incorrect value in vccz.
2848 VCCZCorrect = true;
2849 }
2850 }
2851 }
2852
2853 if (TII->isSMRD(Inst)) {
2854 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2855 // No need to handle invariant loads when avoiding WAR conflicts, as
2856 // there cannot be a vector store to the same memory location.
2857 if (!Memop->isInvariant()) {
2858 const Value *Ptr = Memop->getValue();
2859 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2860 }
2861 }
2862 if (ST->hasReadVCCZBug()) {
2863 // This smem read could complete and clobber vccz at any time.
2864 VCCZCorrect = false;
2865 }
2866 }
2867
2868 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2869
2870 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
2871
2872 LLVM_DEBUG({
2873 Inst.print(dbgs());
2874 ScoreBrackets.dump();
2875 });
2876
2877 // TODO: Remove this work-around after fixing the scheduler and enable the
2878 // assert above.
2879 if (RestoreVCCZ) {
2880 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2881 // bit is updated, so we can restore the bit by reading the value of
2882 // vcc and then writing it back to the register.
2883 BuildMI(Block, Inst, Inst.getDebugLoc(),
2884 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2885 TRI->getVCC())
2886 .addReg(TRI->getVCC());
2887 VCCZCorrect = true;
2888 Modified = true;
2889 }
2890
2891 ++Iter;
2892 }
2893
2894 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2895 // needed.
2896 AMDGPU::Waitcnt Wait;
2897 if (Block.getFirstTerminator() == Block.end() &&
2898 isPreheaderToFlush(Block, ScoreBrackets)) {
2899 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2900 Wait.LoadCnt = 0;
2901 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2902 Wait.SampleCnt = 0;
2903 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2904 Wait.BvhCnt = 0;
2905 }
2906
2907 // Combine or remove any redundant waitcnts at the end of the block.
2908 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2909 OldWaitcntInstr);
2910
2911 LLVM_DEBUG({
2912 dbgs() << "*** End Block: ";
2913 Block.printName(dbgs());
2914 ScoreBrackets.dump();
2915 });
2916
2917 return Modified;
2918}
2919
2920// Return true if the given machine basic block is a preheader of a loop in
2921// which we want to flush the vmcnt counter, and false otherwise.
2922bool SIInsertWaitcnts::isPreheaderToFlush(
2923 MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
2924 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2925 if (!IsInserted)
2926 return Iterator->second;
2927
2928 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2929 if (!Succ)
2930 return false;
2931
2932 MachineLoop *Loop = MLI->getLoopFor(Succ);
2933 if (!Loop)
2934 return false;
2935
2936 if (Loop->getLoopPreheader() == &MBB &&
2937 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2938 Iterator->second = true;
2939 return true;
2940 }
2941
2942 return false;
2943}
2944
2945bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2946 if (SIInstrInfo::isFLAT(MI))
2947 return TII->mayAccessVMEMThroughFlat(MI);
2948 return SIInstrInfo::isVMEM(MI);
2949}
2950
2951// Return true if it is better to flush the vmcnt counter in the preheader of
2952// the given loop. We currently decide to flush in two situations:
2953// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2954// vgpr containing a value that is loaded outside of the loop. (Only on
2955// targets with no vscnt counter).
2956// 2. The loop contains vmem load(s), but the loaded values are not used in the
2957// loop, and at least one use of a vgpr containing a value that is loaded
2958// outside of the loop.
2959bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2960 const WaitcntBrackets &Brackets) {
2961 bool HasVMemLoad = false;
2962 bool HasVMemStore = false;
2963 bool UsesVgprLoadedOutside = false;
2964 DenseSet<MCRegUnit> VgprUse;
2965 DenseSet<MCRegUnit> VgprDef;
2966
2967 for (MachineBasicBlock *MBB : ML->blocks()) {
2968 for (MachineInstr &MI : *MBB) {
2969 if (isVMEMOrFlatVMEM(MI)) {
2970 HasVMemLoad |= MI.mayLoad();
2971 HasVMemStore |= MI.mayStore();
2972 }
2973
2974 for (const MachineOperand &Op : MI.all_uses()) {
2975 if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
2976 continue;
2977 // Vgpr use
2978 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
2979 // If we find a register that is loaded inside the loop, 1. and 2.
2980 // are invalidated and we can exit.
2981 if (VgprDef.contains(RU))
2982 return false;
2983 VgprUse.insert(RU);
2984 // If at least one of Op's registers is in the score brackets, the
2985 // value is likely loaded outside of the loop.
2986 VMEMID ID = toVMEMID(RU);
2987 if (Brackets.getVMemScore(ID, LOAD_CNT) >
2988 Brackets.getScoreLB(LOAD_CNT) ||
2989 Brackets.getVMemScore(ID, SAMPLE_CNT) >
2990 Brackets.getScoreLB(SAMPLE_CNT) ||
2991 Brackets.getVMemScore(ID, BVH_CNT) >
2992 Brackets.getScoreLB(BVH_CNT)) {
2993 UsesVgprLoadedOutside = true;
2994 break;
2995 }
2996 }
2997 }
2998
2999 // VMem load vgpr def
3000 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
3001 for (const MachineOperand &Op : MI.all_defs()) {
3002 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
3003 // If we find a register that is loaded inside the loop, 1. and 2.
3004 // are invalidated and we can exit.
3005 if (VgprUse.contains(RU))
3006 return false;
3007 VgprDef.insert(RU);
3008 }
3009 }
3010 }
3011 }
3012 }
3013 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
3014 return true;
3015 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
3016}
3017
3018bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3019 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3020 auto *PDT =
3021 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3022 AliasAnalysis *AA = nullptr;
3023 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3024 AA = &AAR->getAAResults();
3025
3026 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
3027}
3028
3029PreservedAnalyses
3030SIInsertWaitcntsPass::run(MachineFunction &MF,
3031 MachineFunctionAnalysisManager &MFAM) {
3032 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
3033 auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
3034 auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
3035 .getManager()
3036 .getCachedResult<AAManager>(MF.getFunction());
3037
3038 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
3039 return PreservedAnalyses::all();
3040
3041 return getMachineFunctionPassPreservedAnalyses()
3042 .preserveSet<CFGAnalyses>()
3043 .preserve<AAManager>();
3044}
3045
3046bool SIInsertWaitcnts::run(MachineFunction &MF) {
3047 ST = &MF.getSubtarget<GCNSubtarget>();
3048 TII = ST->getInstrInfo();
3049 TRI = &TII->getRegisterInfo();
3050 MRI = &MF.getRegInfo();
3052
3054
3055 if (ST->hasExtendedWaitCounts()) {
3056 IsExpertMode = ST->hasExpertSchedulingMode() &&
3057 (ExpertSchedulingModeFlag.getNumOccurrences()
3058 ? ExpertSchedulingModeFlag
3059 : MF.getFunction()
3060 .getFnAttribute("amdgpu-expert-scheduling-mode")
3061 .getValueAsBool());
3062 MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
3063 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter, IsExpertMode);
3064 WCG = &WCGGFX12Plus;
3065 } else {
3066 MaxCounter = NUM_NORMAL_INST_CNTS;
3067 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, MaxCounter);
3068 WCG = &WCGPreGFX12;
3069 }
3070
3071 for (auto T : inst_counter_types())
3072 ForceEmitWaitcnt[T] = false;
3073
3074 WaitEventMaskForInst = WCG->getWaitEventMask();
3075
3076 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
3077
3078 if (ST->hasExtendedWaitCounts()) {
3079 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
3080 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
3081 } else {
3082 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
3083 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
3084 }
3085 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
3086 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
3087 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
3088 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
3089 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
3090 Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
3091 Limits.VaVdstMax = AMDGPU::DepCtr::getVaVdstBitMask();
3092 Limits.VmVsrcMax = AMDGPU::DepCtr::getVmVsrcBitMask();
3093
3094 BlockInfos.clear();
3095 bool Modified = false;
3096
3097 MachineBasicBlock &EntryBB = MF.front();
3098
3099 if (!MFI->isEntryFunction()) {
3100 // Wait for any outstanding memory operations that the input registers may
3101 // depend on. We can't track them and it's better to do the wait after the
3102 // costly call sequence.
3103
3104 // TODO: Could insert earlier and schedule more liberally with operations
3105 // that only use caller preserved registers.
3106 MachineBasicBlock::iterator I = EntryBB.begin();
3107 while (I != EntryBB.end() && I->isMetaInstruction())
3108 ++I;
3109
3110 if (ST->hasExtendedWaitCounts()) {
3111 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
3112 .addImm(0);
3113 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
3114 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
3115 continue;
3116
3117 if (!ST->hasImageInsts() &&
3118 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
3119 continue;
3120
3121 BuildMI(EntryBB, I, DebugLoc(),
3122 TII->get(instrsForExtendedCounterTypes[CT]))
3123 .addImm(0);
3124 }
3125 if (IsExpertMode) {
3126 unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, *ST);
3127 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, 0);
3128 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3129 .addImm(Enc);
3130 }
3131 } else {
3132 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
3133 }
3134
3135 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
3136 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3137 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3138
3139 Modified = true;
3140 }
3141
3142 // Keep iterating over the blocks in reverse post order, inserting and
3143 // updating s_waitcnt where needed, until a fix point is reached.
3144 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3145 BlockInfos.try_emplace(MBB);
3146
3147 std::unique_ptr<WaitcntBrackets> Brackets;
3148 bool Repeat;
3149 do {
3150 Repeat = false;
3151
3152 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3153 ++BII) {
3154 MachineBasicBlock *MBB = BII->first;
3155 BlockInfo &BI = BII->second;
3156 if (!BI.Dirty)
3157 continue;
3158
3159 if (BI.Incoming) {
3160 if (!Brackets)
3161 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3162 else
3163 *Brackets = *BI.Incoming;
3164 } else {
3165 if (!Brackets)
3166 Brackets = std::make_unique<WaitcntBrackets>(this);
3167 else
3168 *Brackets = WaitcntBrackets(this);
3169 }
3170
3171 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
3172 BI.Dirty = false;
3173
3174 if (Brackets->hasPendingEvent()) {
3175 BlockInfo *MoveBracketsToSucc = nullptr;
3176 for (MachineBasicBlock *Succ : MBB->successors()) {
3177 auto *SuccBII = BlockInfos.find(Succ);
3178 BlockInfo &SuccBI = SuccBII->second;
3179 if (!SuccBI.Incoming) {
3180 SuccBI.Dirty = true;
3181 if (SuccBII <= BII) {
3182 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
3183 Repeat = true;
3184 }
3185 if (!MoveBracketsToSucc) {
3186 MoveBracketsToSucc = &SuccBI;
3187 } else {
3188 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3189 }
3190 } else if (SuccBI.Incoming->merge(*Brackets)) {
3191 SuccBI.Dirty = true;
3192 if (SuccBII <= BII) {
3193 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
3194 Repeat = true;
3195 }
3196 }
3197 }
3198 if (MoveBracketsToSucc)
3199 MoveBracketsToSucc->Incoming = std::move(Brackets);
3200 }
3201 }
3202 } while (Repeat);
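  // Worked example of the fix point above: for a CFG of the form
  //
  //   bb.0 -> bb.1 (loop header) -> bb.2 (latch) -> bb.1 (backedge)
  //                                              \-> bb.3 (exit)
  //
  // the blocks are visited in reverse post order: bb.0, bb.1, bb.2, bb.3.
  // While processing bb.2, its successor bb.1 appears earlier in BlockInfos
  // (SuccBII <= BII), so if merging bb.2's outgoing bracket state into bb.1's
  // incoming state changes it, bb.1 is marked dirty and Repeat forces another
  // pass. Iteration stops once no merge along a backedge changes any block's
  // incoming state.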
3203
3204 if (ST->hasScalarStores()) {
3205 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3206 bool HaveScalarStores = false;
3207
3208 for (MachineBasicBlock &MBB : MF) {
3209 for (MachineInstr &MI : MBB) {
3210 if (!HaveScalarStores && TII->isScalarStore(MI))
3211 HaveScalarStores = true;
3212
3213 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
3214 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3215 EndPgmBlocks.push_back(&MBB);
3216 }
3217 }
3218
3219 if (HaveScalarStores) {
3220 // If scalar writes are used, the cache must be flushed or else the next
3221 // wave to reuse the same scratch memory can be clobbered.
3222 //
3223 // Insert s_dcache_wb at wave termination points if there were any scalar
3224 // stores, and only if the cache hasn't already been flushed. This could
3225 // be improved by looking across blocks for flushes in postdominating
3226 // blocks from the stores but an explicitly requested flush is probably
3227 // very rare.
3228 for (MachineBasicBlock *MBB : EndPgmBlocks) {
3229 bool SeenDCacheWB = false;
3230
3231 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
3232 I != E; ++I) {
3233 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
3234 SeenDCacheWB = true;
3235 else if (TII->isScalarStore(*I))
3236 SeenDCacheWB = false;
3237
3238 // FIXME: It would be better to insert this before a waitcnt if any.
3239 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
3240 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3241 !SeenDCacheWB) {
3242 Modified = true;
3243 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
3244 }
3245 }
3246 }
3247 }
3248 }
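  // Sketch of the resulting epilogue when a scalar store was seen and no
  // flush already covers it on the path to the wave terminator (registers
  // chosen for illustration only):
  //
  //   s_store_dword s4, s[0:1], 0x0
  //   ...
  //   s_dcache_wb          ; inserted by the loop above
  //   s_endpgm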
3249
3250 if (IsExpertMode) {
3251 // Enable expert scheduling on function entry. To satisfy ABI requirements
3252  // and to allow calls between functions with different expert scheduling
3253 // settings, disable it around calls and before returns.
3254
3255    MachineBasicBlock::iterator I = EntryBB.begin();
3256    while (I != EntryBB.end() && I->isMetaInstruction())
3257 ++I;
3258 setSchedulingMode(EntryBB, I, true);
3259
3260 for (MachineInstr *MI : CallInsts) {
3261 MachineBasicBlock &MBB = *MI->getParent();
3262 setSchedulingMode(MBB, MI, false);
3263 setSchedulingMode(MBB, std::next(MI->getIterator()), true);
3264 }
3265
3266 for (MachineInstr *MI : ReturnInsts)
3267 setSchedulingMode(*MI->getParent(), MI, false);
3268
3269 Modified = true;
3270 }
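  // Placement sketch for the expert scheduling toggles inserted above (the
  // actual mode-switch instruction emitted by setSchedulingMode depends on
  // the subtarget, so it is shown in brackets):
  //
  //   function entry:  [enable expert scheduling]
  //   call sites:      [disable expert scheduling]
  //                    s_swappc_b64 ...
  //                    [enable expert scheduling]
  //   returns:         [disable expert scheduling]
  //                    s_setpc_b64 ...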
3271
3272 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
3273 // This is done in different ways depending on how the VGPRs were allocated
3274 // (i.e. whether we're in dynamic VGPR mode or not).
3275  // Skip the deallocation if the kernel is waveslot limited rather than VGPR
3276  // limited: a short waveslot-limited kernel runs slower with the deallocation.
3277 if (MFI->isDynamicVGPREnabled()) {
3278 for (MachineInstr *MI : ReleaseVGPRInsts) {
3279 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3280 TII->get(AMDGPU::S_ALLOC_VGPR))
3281 .addImm(0);
3282 Modified = true;
3283 }
3284 } else {
3285 if (!ReleaseVGPRInsts.empty() &&
3286 (MF.getFrameInfo().hasCalls() ||
3287 ST->getOccupancyWithNumVGPRs(
3288 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
3289 /*IsDynamicVGPR=*/false) <
3290             AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
3291      for (MachineInstr *MI : ReleaseVGPRInsts) {
3292 if (ST->requiresNopBeforeDeallocVGPRs()) {
3293 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3294 TII->get(AMDGPU::S_NOP))
3295 .addImm(0);
3296 }
3297 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3298 TII->get(AMDGPU::S_SENDMSG))
3299            .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
3300        Modified = true;
3301 }
3302 }
3303 }
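  // Sketch of the deallocation sequences built above. In dynamic VGPR mode
  // each marked terminator becomes:
  //
  //   s_alloc_vgpr 0
  //   s_endpgm
  //
  // otherwise, on subtargets that also require the leading NOP:
  //
  //   s_nop 0
  //   s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  //   s_endpgm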
3304
3305 CallInsts.clear();
3306 ReturnInsts.clear();
3307 ReleaseVGPRInsts.clear();
3308 PreheadersToFlush.clear();
3309 SLoadAddresses.clear();
3310
3311 return Modified;
3312}