1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
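// For illustration (a hypothetical pre-gfx12 snippet, not taken from a test):
// the pass guards a use of an outstanding VMEM load result like so:
//
//   global_load_dword v1, v[2:3], off   ; asynchronous VMEM read
//   s_waitcnt vmcnt(0)                  ; inserted by this pass
//   v_add_u32 v4, v1, v5                ; first use of the loaded value
//
// The analysis below tracks, per counter and per register, how many such
// operations are still in flight so that the emitted wait is no stricter
// than necessary.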
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39#include "llvm/IR/Dominators.h"
43
44using namespace llvm;
45
46#define DEBUG_TYPE "si-insert-waitcnts"
47
48DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
49 "Force emit s_waitcnt expcnt(0) instrs");
50DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
51 "Force emit s_waitcnt lgkmcnt(0) instrs");
52DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
53 "Force emit s_waitcnt vmcnt(0) instrs");
54
55static cl::opt<bool>
56 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
57 cl::desc("Force all waitcnt instrs to be emitted as "
58 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
59 cl::init(false), cl::Hidden);
60
62 "amdgpu-waitcnt-load-forcezero",
63 cl::desc("Force all waitcnt load counters to wait until 0"),
64 cl::init(false), cl::Hidden);
65
66namespace {
67// Class of object that encapsulates the latest instruction counter score
68// associated with an operand. Used to determine whether an
69// s_waitcnt instruction needs to be emitted.
70
71enum InstCounterType {
72 LOAD_CNT = 0, // VMcnt prior to gfx12.
73 DS_CNT, // LGKMcnt prior to gfx12.
74 EXP_CNT, //
75 STORE_CNT, // VScnt in gfx10/gfx11.
76 NUM_NORMAL_INST_CNTS,
77 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
78 BVH_CNT, // gfx12+ only.
79 KM_CNT, // gfx12+ only.
80 X_CNT, // gfx1250.
81 NUM_EXTENDED_INST_CNTS,
82 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
83};
84} // namespace
85
86namespace llvm {
87template <> struct enum_iteration_traits<InstCounterType> {
88 static constexpr bool is_iterable = true;
89};
90} // namespace llvm
91
92namespace {
93// Return an iterator over all counters between LOAD_CNT (the first counter)
94// and \c MaxCounter (exclusive, default value yields an enumeration over
95// all counters).
96auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
97 return enum_seq(LOAD_CNT, MaxCounter);
98}
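// Usage sketch (mirroring loops later in this file): restricting the range to
// NUM_NORMAL_INST_CNTS visits only the pre-gfx12 counters.
//
//   for (auto T : inst_counter_types(NUM_NORMAL_INST_CNTS)) {
//     // visits LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT in declaration order
//   }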
99
100using RegInterval = std::pair<int, int>;
101
102struct HardwareLimits {
103 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
104 unsigned ExpcntMax;
105 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
106 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
107 unsigned SamplecntMax; // gfx12+ only.
108 unsigned BvhcntMax; // gfx12+ only.
109 unsigned KmcntMax; // gfx12+ only.
110 unsigned XcntMax; // gfx1250.
111};
112
113#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
114 DECL(VMEM_ACCESS) /* vmem read & write */ \
115 DECL(VMEM_READ_ACCESS) /* vmem read */ \
116 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
117 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
118 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
119 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
120 DECL(VMEM_GROUP) /* vmem group */ \
121 DECL(LDS_ACCESS) /* lds read & write */ \
122 DECL(GDS_ACCESS) /* gds read & write */ \
123 DECL(SQ_MESSAGE) /* send message */ \
124 DECL(SCC_WRITE) /* write to SCC from barrier */ \
125 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
126 DECL(SMEM_GROUP) /* scalar-memory group */ \
127 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
128 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
129 DECL(EXP_POS_ACCESS) /* write to export position */ \
130 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
131 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
132 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */
133
134// clang-format off
135#define AMDGPU_EVENT_ENUM(Name) Name,
136enum WaitEventType {
137 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
138 NUM_WAIT_EVENTS
139};
140#undef AMDGPU_EVENT_ENUM
141
142#define AMDGPU_EVENT_NAME(Name) #Name,
143static constexpr StringLiteral WaitEventTypeName[] = {
144 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
145};
146#undef AMDGPU_EVENT_NAME
147// clang-format on
148
149// The mapping is:
150// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
151// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
152// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
153// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC
154// We reserve a fixed number of VGPR slots in the scoring tables for
155// special tokens like SCMEM_LDS (needed for buffer load to LDS).
156enum RegisterMapping {
157 SQ_MAX_PGM_VGPRS = 2048, // Maximum programmable VGPRs across all targets.
158 AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets.
159 SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets.
160 // Artificial register slots to track LDS writes into specific LDS locations
161 // if a location is known. When slots are exhausted or the location is
162 // unknown, use the first slot. The first slot is also always updated in
163 // addition to known location's slot to properly generate waits if dependent
164 // instruction's location is unknown.
165 FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
166 NUM_LDS_VGPRS = 9, // One more than the stores we track.
167 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
168 NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
169 // Remaining non-allocatable registers
170 SCC = NUM_ALL_ALLOCATABLE
171};
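// A worked example of the resulting flat index space (values derived from the
// constants above):
//   v0  -> slots 0..1       (VGPRs are tracked in 16-bit halves; see
//                            getRegInterval below)
//   LDS -> slots 2048..2056 (FIRST_LDS_VGPR .. FIRST_LDS_VGPR + 8)
//   s0  -> slot 2057        (NUM_ALL_VGPRS + 0)
//   SCC -> slot 2185        (NUM_ALL_ALLOCATABLE)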
172
173// Enumerate different types of result-returning VMEM operations. Although
174// s_waitcnt orders them all with a single vmcnt counter, in the absence of
175// s_waitcnt only instructions of the same VmemType are guaranteed to write
176// their results in order -- so there is no need to insert an s_waitcnt between
177// two instructions of the same type that write the same vgpr.
178enum VmemType {
179 // BUF instructions and MIMG instructions without a sampler.
180 VMEM_NOSAMPLER,
181 // MIMG instructions with a sampler.
182 VMEM_SAMPLER,
183 // BVH instructions
184 VMEM_BVH,
185 NUM_VMEM_TYPES
186};
187
188// Maps values of InstCounterType to the instruction that waits on that
189// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
190// returns true.
191static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
192 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
193 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
194 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
195
196static bool updateVMCntOnly(const MachineInstr &Inst) {
197 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
198 SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst);
199}
200
201#ifndef NDEBUG
202static bool isNormalMode(InstCounterType MaxCounter) {
203 return MaxCounter == NUM_NORMAL_INST_CNTS;
204}
205#endif // NDEBUG
206
207VmemType getVmemType(const MachineInstr &Inst) {
208 assert(updateVMCntOnly(Inst));
209 if (!SIInstrInfo::isImage(Inst))
210 return VMEM_NOSAMPLER;
211 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
212 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
213 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
214
215 if (BaseInfo->BVH)
216 return VMEM_BVH;
217
218 // We have to make an additional check for isVSAMPLE here since some
219 // instructions don't have a sampler, but are still classified as sampler
220 // instructions for the purposes of e.g. waitcnt.
221 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
222 return VMEM_SAMPLER;
223
224 return VMEM_NOSAMPLER;
225}
226
227unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
228 switch (T) {
229 case LOAD_CNT:
230 return Wait.LoadCnt;
231 case EXP_CNT:
232 return Wait.ExpCnt;
233 case DS_CNT:
234 return Wait.DsCnt;
235 case STORE_CNT:
236 return Wait.StoreCnt;
237 case SAMPLE_CNT:
238 return Wait.SampleCnt;
239 case BVH_CNT:
240 return Wait.BvhCnt;
241 case KM_CNT:
242 return Wait.KmCnt;
243 case X_CNT:
244 return Wait.XCnt;
245 default:
246 llvm_unreachable("bad InstCounterType");
247 }
248}
249
250void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
251 unsigned &WC = getCounterRef(Wait, T);
252 WC = std::min(WC, Count);
253}
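// Note that addWait keeps the *strictest* requirement, since a smaller wait
// count is a stronger wait. For example, addWait(W, LOAD_CNT, 3) followed by
// addWait(W, LOAD_CNT, 0) leaves W.LoadCnt == 0.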
254
255void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
256 getCounterRef(Wait, T) = ~0u;
257}
258
259unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
260 return getCounterRef(Wait, T);
261}
262
263// Mapping from event to counter according to the table masks.
264InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
265 for (auto T : inst_counter_types()) {
266 if (masks[T] & (1 << E))
267 return T;
268 }
269 llvm_unreachable("event type has no associated counter");
270}
271
272class WaitcntBrackets;
273
274// This abstracts the logic for generating and updating S_WAIT* instructions
275// away from the analysis that determines where they are needed. This was
276// done because the set of counters and instructions for waiting on them
277// underwent a major shift with gfx12, sufficiently so that having this
278// abstraction allows the main analysis logic to be simpler than it would
279// otherwise have had to become.
280class WaitcntGenerator {
281protected:
282 const GCNSubtarget *ST = nullptr;
283 const SIInstrInfo *TII = nullptr;
284 AMDGPU::IsaVersion IV;
285 InstCounterType MaxCounter;
286 bool OptNone;
287
288public:
289 WaitcntGenerator() = default;
290 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
291 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
292 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
293 OptNone(MF.getFunction().hasOptNone() ||
294 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
295
296 // Return true if the current function should be compiled with no
297 // optimization.
298 bool isOptNone() const { return OptNone; }
299
300 // Edits an existing sequence of wait count instructions according
301 // to an incoming Waitcnt value, which is itself updated to reflect
302 // any new wait count instructions which may need to be generated by
303 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
304 // were made.
305 //
306 // This editing will usually merely update operands, but it may also
307 // delete instructions if the incoming Wait value indicates they are not
308 // needed. It may also remove existing instructions for which a wait
309 // is needed if it can be determined that it is better to generate new
310 // instructions later, as can happen on gfx12.
311 virtual bool
312 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
313 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
314 MachineBasicBlock::instr_iterator It) const = 0;
315
316 // Transform a soft waitcnt into a normal one.
317 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
318
319 // Generates new wait count instructions according to the value of
320 // Wait, returning true if any new instructions were created.
321 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
322 MachineBasicBlock::instr_iterator It,
323 AMDGPU::Waitcnt Wait) = 0;
324
325 // Returns an array of bit masks which can be used to map values in
326 // WaitEventType to corresponding counter values in InstCounterType.
327 virtual const unsigned *getWaitEventMask() const = 0;
328
329 // Returns a new waitcnt with all counters except VScnt set to 0. If
330 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
331 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
332
333 virtual ~WaitcntGenerator() = default;
334
335 // Create a mask value from the initializer list of wait event types.
336 static constexpr unsigned
337 eventMask(std::initializer_list<WaitEventType> Events) {
338 unsigned Mask = 0;
339 for (auto &E : Events)
340 Mask |= 1 << E;
341
342 return Mask;
343 }
344};
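// Sketch of how the derived generators use eventMask() (see the
// getWaitEventMask() overrides below): each InstCounterType gets one bit mask
// over WaitEventType, e.g.
//
//   eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS})
//       == (1u << VMEM_WRITE_ACCESS) | (1u << SCRATCH_WRITE_ACCESS)
//
// which lets eventCounter() recover the counter for an event by testing a
// single bit.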
345
346class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
347public:
348 WaitcntGeneratorPreGFX12() = default;
349 WaitcntGeneratorPreGFX12(const MachineFunction &MF)
350 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
351
352 bool
353 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
354 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
355 MachineBasicBlock::instr_iterator It) const override;
356
357 bool createNewWaitcnt(MachineBasicBlock &Block,
358 MachineBasicBlock::instr_iterator It,
359 AMDGPU::Waitcnt Wait) override;
360
361 const unsigned *getWaitEventMask() const override {
362 assert(ST);
363
364 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
365 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
366 VMEM_BVH_READ_ACCESS}),
367 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
368 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
369 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
370 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
371 0,
372 0,
373 0,
374 0};
375
376 return WaitEventMaskForInstPreGFX12;
377 }
378
379 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
380};
381
382class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
383public:
384 WaitcntGeneratorGFX12Plus() = default;
385 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
386 InstCounterType MaxCounter)
387 : WaitcntGenerator(MF, MaxCounter) {}
388
389 bool
390 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
391 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
392 MachineBasicBlock::instr_iterator It) const override;
393
394 bool createNewWaitcnt(MachineBasicBlock &Block,
395 MachineBasicBlock::instr_iterator It,
396 AMDGPU::Waitcnt Wait) override;
397
398 const unsigned *getWaitEventMask() const override {
399 assert(ST);
400
401 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
402 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
403 eventMask({LDS_ACCESS, GDS_ACCESS}),
404 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
405 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
406 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
407 eventMask({VMEM_SAMPLER_READ_ACCESS}),
408 eventMask({VMEM_BVH_READ_ACCESS}),
409 eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
410 eventMask({VMEM_GROUP, SMEM_GROUP})};
411
412 return WaitEventMaskForInstGFX12Plus;
413 }
414
415 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
416};
417
418class SIInsertWaitcnts {
419public:
420 const GCNSubtarget *ST;
421 const SIInstrInfo *TII = nullptr;
422 const SIRegisterInfo *TRI = nullptr;
423 const MachineRegisterInfo *MRI = nullptr;
424 InstCounterType SmemAccessCounter;
425 InstCounterType MaxCounter;
426 const unsigned *WaitEventMaskForInst;
427
428private:
429 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
430 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
431 MachineLoopInfo *MLI;
432 MachinePostDominatorTree *PDT;
433 AliasAnalysis *AA = nullptr;
434
435 struct BlockInfo {
436 std::unique_ptr<WaitcntBrackets> Incoming;
437 bool Dirty = true;
438 };
439
440 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
441
442 bool ForceEmitWaitcnt[NUM_INST_CNTS];
443
444 // In any given run of this pass, WCG will point to one of these two
445 // generator objects, which must have been re-initialised before use
446 // from a value made using a subtarget constructor.
447 WaitcntGeneratorPreGFX12 WCGPreGFX12;
448 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
449
450 WaitcntGenerator *WCG = nullptr;
451
452 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
453 // message.
454 DenseSet<MachineInstr *> ReleaseVGPRInsts;
455
456 HardwareLimits Limits;
457
458public:
459 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
460 AliasAnalysis *AA)
461 : MLI(MLI), PDT(PDT), AA(AA) {
462 (void)ForceExpCounter;
463 (void)ForceLgkmCounter;
464 (void)ForceVMCounter;
465 }
466
467 unsigned getWaitCountMax(InstCounterType T) const {
468 switch (T) {
469 case LOAD_CNT:
470 return Limits.LoadcntMax;
471 case DS_CNT:
472 return Limits.DscntMax;
473 case EXP_CNT:
474 return Limits.ExpcntMax;
475 case STORE_CNT:
476 return Limits.StorecntMax;
477 case SAMPLE_CNT:
478 return Limits.SamplecntMax;
479 case BVH_CNT:
480 return Limits.BvhcntMax;
481 case KM_CNT:
482 return Limits.KmcntMax;
483 case X_CNT:
484 return Limits.XcntMax;
485 default:
486 break;
487 }
488 return 0;
489 }
490
491 bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
492 bool isPreheaderToFlush(MachineBasicBlock &MBB,
493 const WaitcntBrackets &ScoreBrackets);
494 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
495 bool run(MachineFunction &MF);
496
497 void setForceEmitWaitcnt() {
498// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
499// For debug builds, get the debug counter info and adjust if need be
500#ifndef NDEBUG
501 if (DebugCounter::isCounterSet(ForceExpCounter) &&
502 DebugCounter::shouldExecute(ForceExpCounter)) {
503 ForceEmitWaitcnt[EXP_CNT] = true;
504 } else {
505 ForceEmitWaitcnt[EXP_CNT] = false;
506 }
507
508 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
509 DebugCounter::shouldExecute(ForceLgkmCounter)) {
510 ForceEmitWaitcnt[DS_CNT] = true;
511 ForceEmitWaitcnt[KM_CNT] = true;
512 } else {
513 ForceEmitWaitcnt[DS_CNT] = false;
514 ForceEmitWaitcnt[KM_CNT] = false;
515 }
516
517 if (DebugCounter::isCounterSet(ForceVMCounter) &&
518 DebugCounter::shouldExecute(ForceVMCounter)) {
519 ForceEmitWaitcnt[LOAD_CNT] = true;
520 ForceEmitWaitcnt[SAMPLE_CNT] = true;
521 ForceEmitWaitcnt[BVH_CNT] = true;
522 } else {
523 ForceEmitWaitcnt[LOAD_CNT] = false;
524 ForceEmitWaitcnt[SAMPLE_CNT] = false;
525 ForceEmitWaitcnt[BVH_CNT] = false;
526 }
527#endif // NDEBUG
528 }
529
530 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
531 // instruction.
532 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
533 switch (Inst.getOpcode()) {
534 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
535 case AMDGPU::GLOBAL_INV:
536 return VMEM_READ_ACCESS; // tracked using loadcnt
537 case AMDGPU::GLOBAL_WB:
538 case AMDGPU::GLOBAL_WBINV:
539 return VMEM_WRITE_ACCESS; // tracked using storecnt
540 default:
541 break;
542 }
543
544 // Maps VMEM access types to their corresponding WaitEventType.
545 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
546 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
547
549 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
550 // these should use VM_CNT.
551 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
552 return VMEM_ACCESS;
553 if (Inst.mayStore() &&
554 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
555 // FLAT and SCRATCH instructions may access scratch. Other VMEM
556 // instructions do not.
557 if (TII->mayAccessScratchThroughFlat(Inst))
558 return SCRATCH_WRITE_ACCESS;
559 return VMEM_WRITE_ACCESS;
560 }
561 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
562 return VMEM_READ_ACCESS;
563 return VmemReadMapping[getVmemType(Inst)];
564 }
565
566 bool isVmemAccess(const MachineInstr &MI) const;
567 bool generateWaitcntInstBefore(MachineInstr &MI,
568 WaitcntBrackets &ScoreBrackets,
569 MachineInstr *OldWaitcntInstr,
570 bool FlushVmCnt);
571 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
572 MachineBasicBlock::instr_iterator It,
573 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
574 MachineInstr *OldWaitcntInstr);
575 void updateEventWaitcntAfter(MachineInstr &Inst,
576 WaitcntBrackets *ScoreBrackets);
577 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
578 MachineBasicBlock *Block) const;
579 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
580 WaitcntBrackets &ScoreBrackets);
581 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
582 WaitcntBrackets &ScoreBrackets);
583};
584
585// This object maintains the current score brackets of each wait counter, and
586// a per-register scoreboard for each wait counter.
587//
588// We also maintain the latest score for every event type that can change the
589// waitcnt in order to know if there are multiple types of events within
590// the brackets. When multiple types of event happen within the brackets,
591// the wait count may be decremented out of order, so we need to put in an
592// "s_waitcnt 0" before use.
593class WaitcntBrackets {
594public:
595 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {}
596
597 bool isSmemCounter(InstCounterType T) const {
598 return T == Context->SmemAccessCounter || T == X_CNT;
599 }
600
601 unsigned getSgprScoresIdx(InstCounterType T) const {
602 assert(isSmemCounter(T) && "Invalid SMEM counter");
603 return T == X_CNT ? 1 : 0;
604 }
605
606 unsigned getScoreLB(InstCounterType T) const {
607 assert(T < NUM_INST_CNTS);
608 return ScoreLBs[T];
609 }
610
611 unsigned getScoreUB(InstCounterType T) const {
612 assert(T < NUM_INST_CNTS);
613 return ScoreUBs[T];
614 }
615
616 unsigned getScoreRange(InstCounterType T) const {
617 return getScoreUB(T) - getScoreLB(T);
618 }
619
620 unsigned getRegScore(int GprNo, InstCounterType T) const {
621 if (GprNo < NUM_ALL_VGPRS)
622 return VgprScores[T][GprNo];
623
624 if (GprNo < NUM_ALL_ALLOCATABLE)
625 return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
626
627 assert(GprNo == SCC);
628 return SCCScore;
629 }
630
631 bool merge(const WaitcntBrackets &Other);
632
633 RegInterval getRegInterval(const MachineInstr *MI,
634 const MachineOperand &Op) const;
635
636 bool counterOutOfOrder(InstCounterType T) const;
637 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait);
638 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
639 bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
640 bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
641 void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait);
642
643 void determineWait(InstCounterType T, RegInterval Interval,
644 AMDGPU::Waitcnt &Wait) const;
645 void determineWait(InstCounterType T, int RegNo,
646 AMDGPU::Waitcnt &Wait) const {
647 determineWait(T, {RegNo, RegNo + 1}, Wait);
648 }
649 void tryClearSCCWriteEvent(MachineInstr *Inst);
650
651 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
652 void applyWaitcnt(InstCounterType T, unsigned Count);
653 void updateByEvent(WaitEventType E, MachineInstr &MI);
654
655 unsigned hasPendingEvent() const { return PendingEvents; }
656 unsigned hasPendingEvent(WaitEventType E) const {
657 return PendingEvents & (1 << E);
658 }
659 unsigned hasPendingEvent(InstCounterType T) const {
660 unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
661 assert((HasPending != 0) == (getScoreRange(T) != 0));
662 return HasPending;
663 }
664
665 bool hasMixedPendingEvents(InstCounterType T) const {
666 unsigned Events = hasPendingEvent(T);
667 // Return true if more than one bit is set in Events.
668 return Events & (Events - 1);
669 }
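// The bit trick above works because clearing the lowest set bit leaves a
// nonzero value exactly when more than one bit was set, e.g.
//   0b0110 & 0b0101 == 0b0100 (mixed events), 0b0100 & 0b0011 == 0 (single).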
670
671 bool hasPendingFlat() const {
672 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
673 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
674 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
675 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
676 }
677
678 void setPendingFlat() {
679 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
680 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
681 }
682
683 bool hasPendingGDS() const {
684 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
685 }
686
687 unsigned getPendingGDSWait() const {
688 return std::min(getScoreUB(DS_CNT) - LastGDS,
689 Context->getWaitCountMax(DS_CNT) - 1);
690 }
691
692 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
693
694 // Return true if there might be pending writes to the vgpr-interval by VMEM
695 // instructions with types different from V.
696 bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
697 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
698 assert(RegNo < NUM_ALL_VGPRS);
699 if (VgprVmemTypes[RegNo] & ~(1 << V))
700 return true;
701 }
702 return false;
703 }
704
705 void clearVgprVmemTypes(RegInterval Interval) {
706 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
707 assert(RegNo < NUM_ALL_VGPRS);
708 VgprVmemTypes[RegNo] = 0;
709 }
710 }
711
712 void setStateOnFunctionEntryOrReturn() {
713 setScoreUB(STORE_CNT,
714 getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
715 PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
716 }
717
718 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
719 return LDSDMAStores;
720 }
721
722 bool hasPointSampleAccel(const MachineInstr &MI) const;
723 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
724 RegInterval Interval) const;
725
726 void print(raw_ostream &) const;
727 void dump() const { print(dbgs()); }
728
729private:
730 struct MergeInfo {
731 unsigned OldLB;
732 unsigned OtherLB;
733 unsigned MyShift;
734 unsigned OtherShift;
735 };
736 static bool mergeScore(const MergeInfo &M, unsigned &Score,
737 unsigned OtherScore);
738
739 void setScoreLB(InstCounterType T, unsigned Val) {
740 assert(T < NUM_INST_CNTS);
741 ScoreLBs[T] = Val;
742 }
743
744 void setScoreUB(InstCounterType T, unsigned Val) {
745 assert(T < NUM_INST_CNTS);
746 ScoreUBs[T] = Val;
747
748 if (T != EXP_CNT)
749 return;
750
751 if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
752 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
753 }
754
755 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
756 setScoreByInterval({GprNo, GprNo + 1}, T, Val);
757 }
758
759 void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
760 unsigned Score);
761
762 void setScoreByOperand(const MachineInstr *MI, const MachineOperand &Op,
763 InstCounterType CntTy, unsigned Val);
764
765 const SIInsertWaitcnts *Context;
766
767 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
768 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
769 unsigned PendingEvents = 0;
770 // Remember the last flat memory operation.
771 unsigned LastFlat[NUM_INST_CNTS] = {0};
772 // Remember the last GDS operation.
773 unsigned LastGDS = 0;
774 // wait_cnt scores for every vgpr.
775 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
776 int VgprUB = -1;
777 int SgprUB = -1;
778 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
779 // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
780 // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
781 // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
782 // X_CNT score.
783 unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
784 // Reg score for SCC.
785 unsigned SCCScore = 0;
786 // The unique instruction that has an SCC write pending, if there is one.
787 const MachineInstr *PendingSCCWrite = nullptr;
788 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
789 // write to each vgpr.
790 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
791 // Store representative LDS DMA operations. The only useful info here is
792 // alias info. One store is kept per unique AAInfo.
793 SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
794};
795
796class SIInsertWaitcntsLegacy : public MachineFunctionPass {
797public:
798 static char ID;
799 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
800
801 bool runOnMachineFunction(MachineFunction &MF) override;
802
803 StringRef getPassName() const override {
804 return "SI insert wait instructions";
805 }
806
807 void getAnalysisUsage(AnalysisUsage &AU) const override {
808 AU.setPreservesCFG();
809 AU.addRequired<MachineLoopInfoWrapperPass>();
810 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
811 AU.addUsedIfAvailable<AAResultsWrapperPass>();
812 AU.addPreserved<AAResultsWrapperPass>();
813 MachineFunctionPass::getAnalysisUsage(AU);
814 }
815};
816
817} // end anonymous namespace
818
819RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
820 const MachineOperand &Op) const {
821 if (Op.getReg() == AMDGPU::SCC)
822 return {SCC, SCC + 1};
823
824 const SIRegisterInfo *TRI = Context->TRI;
825 const MachineRegisterInfo *MRI = Context->MRI;
826
827 if (!TRI->isInAllocatableClass(Op.getReg()))
828 return {-1, -1};
829
830 // A use via a PW operand does not need a waitcnt.
831 // A partial write is not a WAW.
832 assert(!Op.getSubReg() || !Op.isUndef());
833
834 RegInterval Result;
835
836 MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST);
837 unsigned RegIdx = TRI->getHWRegIndex(MCReg);
838
839 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
840 unsigned Size = TRI->getRegSizeInBits(*RC);
841
842 // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits
843 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
844 unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0);
845 assert(!Context->ST->hasMAIInsts() || Reg < AGPR_OFFSET);
846 Result.first = Reg;
847 if (TRI->isAGPR(*MRI, Op.getReg()))
848 Result.first += AGPR_OFFSET;
849 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
850 assert(Size % 16 == 0);
851 Result.second = Result.first + (Size / 16);
852
853 if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) {
854 // Regardless of which lo16/hi16 is used, consider the full 32-bit
855 // register used.
856 if (AMDGPU::isHi16Reg(MCReg, *TRI))
857 Result.first -= 1;
858 else
859 Result.second += 1;
860 }
861 } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
862 // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
863 // sources like SRC_PRIVATE_BASE.
864 Result.first = RegIdx + NUM_ALL_VGPRS;
865 Result.second = Result.first + divideCeil(Size, 32);
866 } else {
867 return {-1, -1};
868 }
869
870 return Result;
871}
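// A worked example for getRegInterval(), assuming a 64-bit VGPR pair such as
// v[4:5] on a target without the D16-writes-32-bit-VGPR behaviour:
//   RegIdx = 4, Size = 64
//   Result.first  = 4 << 1 = 8
//   Result.second = 8 + 64 / 16 = 12
// i.e. the pair occupies the four 16-bit tracking slots 8..11. An SGPR pair
// such as s[2:3] maps to {NUM_ALL_VGPRS + 2, NUM_ALL_VGPRS + 4} instead.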
872
873void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
874 InstCounterType CntTy,
875 unsigned Score) {
876 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
877 if (RegNo < NUM_ALL_VGPRS) {
878 VgprUB = std::max(VgprUB, RegNo);
879 VgprScores[CntTy][RegNo] = Score;
880 } else if (RegNo < NUM_ALL_ALLOCATABLE) {
881 SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
882 SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
883 } else {
884 assert(RegNo == SCC);
885 SCCScore = Score;
886 }
887 }
888}
889
890void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
891 const MachineOperand &Op,
892 InstCounterType CntTy, unsigned Score) {
893 RegInterval Interval = getRegInterval(MI, Op);
894 setScoreByInterval(Interval, CntTy, Score);
895}
896
897// Return true if the subtarget is one that enables Point Sample Acceleration
898// and the MachineInstr passed in is one to which it might be applied (the
899// hardware makes this decision based on several factors, but we can't determine
900// this at compile time, so we have to assume it might be applied if the
901// instruction supports it).
902bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
903 if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
904 return false;
905
906 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
907 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
908 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
909 return BaseInfo->PointSampleAccel;
910}
911
912// Return true if the subtarget enables Point Sample Acceleration, the supplied
913// MachineInstr is one to which it might be applied and the supplied interval is
914// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
915// (this is the type that a point sample accelerated instruction effectively
916// becomes)
917bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
918 const MachineInstr &MI, RegInterval Interval) const {
919 if (!hasPointSampleAccel(MI))
920 return false;
921
922 return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
923}
924
925void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
926 InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
927
928 unsigned UB = getScoreUB(T);
929 unsigned CurrScore = UB + 1;
930 if (CurrScore == 0)
931 report_fatal_error("InsertWaitcnt score wraparound");
932 // PendingEvents and ScoreUB need to be updated regardless of whether this
933 // event changes the score of a register or not. Examples include vm_cnt for
934 // a buffer store or lgkm_cnt for a send-message.
935 PendingEvents |= 1 << E;
936 setScoreUB(T, CurrScore);
937
938 const SIRegisterInfo *TRI = Context->TRI;
939 const MachineRegisterInfo *MRI = Context->MRI;
940 const SIInstrInfo *TII = Context->TII;
941
942 if (T == EXP_CNT) {
943 // Put score on the source vgprs. If this is a store, just use those
944 // specific register(s).
945 if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
946 // All GDS operations must protect their address register (same as
947 // export.)
948 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
949 setScoreByOperand(&Inst, *AddrOp, EXP_CNT, CurrScore);
950
951 if (Inst.mayStore()) {
952 if (const auto *Data0 =
953 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
954 setScoreByOperand(&Inst, *Data0, EXP_CNT, CurrScore);
955 if (const auto *Data1 =
956 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
957 setScoreByOperand(&Inst, *Data1, EXP_CNT, CurrScore);
958 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
959 Inst.getOpcode() != AMDGPU::DS_APPEND &&
960 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
961 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
962 for (const MachineOperand &Op : Inst.all_uses()) {
963 if (TRI->isVectorRegister(*MRI, Op.getReg()))
964 setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore);
965 }
966 }
967 } else if (TII->isFLAT(Inst)) {
968 if (Inst.mayStore()) {
969 setScoreByOperand(&Inst,
970 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
971 EXP_CNT, CurrScore);
972 } else if (SIInstrInfo::isAtomicRet(Inst)) {
973 setScoreByOperand(&Inst,
974 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
975 EXP_CNT, CurrScore);
976 }
977 } else if (TII->isMIMG(Inst)) {
978 if (Inst.mayStore()) {
979 setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
980 } else if (SIInstrInfo::isAtomicRet(Inst)) {
981 setScoreByOperand(&Inst,
982 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
983 EXP_CNT, CurrScore);
984 }
985 } else if (TII->isMTBUF(Inst)) {
986 if (Inst.mayStore())
987 setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
988 } else if (TII->isMUBUF(Inst)) {
989 if (Inst.mayStore()) {
990 setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
991 } else if (SIInstrInfo::isAtomicRet(Inst)) {
992 setScoreByOperand(&Inst,
993 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
994 EXP_CNT, CurrScore);
995 }
996 } else if (TII->isLDSDIR(Inst)) {
997 // LDSDIR instructions attach the score to the destination.
998 setScoreByOperand(&Inst,
999 *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
1000 EXP_CNT, CurrScore);
1001 } else {
1002 if (TII->isEXP(Inst)) {
1003 // For export the destination registers are really temps that
1004 // can be used as the actual source after export patching, so
1005 // we need to treat them like sources and set the EXP_CNT
1006 // score.
1007 for (MachineOperand &DefMO : Inst.all_defs()) {
1008 if (TRI->isVGPR(*MRI, DefMO.getReg())) {
1009 setScoreByOperand(&Inst, DefMO, EXP_CNT, CurrScore);
1010 }
1011 }
1012 }
1013 for (const MachineOperand &Op : Inst.all_uses()) {
1014 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1015 setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore);
1016 }
1017 }
1018 } else if (T == X_CNT) {
1019 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1020 if (PendingEvents & (1 << OtherEvent)) {
1021 // Hardware inserts an implicit xcnt between interleaved
1022 // SMEM and VMEM operations. So there will never be
1023 // outstanding address translations for both SMEM and
1024 // VMEM at the same time.
1025 setScoreLB(T, getScoreUB(T) - 1);
1026 PendingEvents &= ~(1 << OtherEvent);
1027 }
1028 for (const MachineOperand &Op : Inst.all_uses())
1029 setScoreByOperand(&Inst, Op, T, CurrScore);
1030 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1031 // Match the score to the destination registers.
1032 //
1033 // Check only explicit operands. Stores, especially spill stores, include
1034 // implicit uses and defs of their super registers which would create an
1035 // artificial dependency, while these are there only for register liveness
1036 // accounting purposes.
1037 //
1038 // Special cases where implicit register defs exists, such as M0 or VCC,
1039 // but none with memory instructions.
1040 for (const MachineOperand &Op : Inst.defs()) {
1041 RegInterval Interval = getRegInterval(&Inst, Op);
1042 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1043 if (Interval.first >= NUM_ALL_VGPRS)
1044 continue;
1045 if (updateVMCntOnly(Inst)) {
1046 // updateVMCntOnly should only leave us with VGPRs
1047 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1048 // defs. That's required for a sane index into `VgprVmemTypes` below.
1049 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
1050 VmemType V = getVmemType(Inst);
1051 unsigned char TypesMask = 1 << V;
1052 // If instruction can have Point Sample Accel applied, we have to flag
1053 // this with another potential dependency
1054 if (hasPointSampleAccel(Inst))
1055 TypesMask |= 1 << VMEM_NOSAMPLER;
1056 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
1057 VgprVmemTypes[RegNo] |= TypesMask;
1058 }
1059 }
1060 setScoreByInterval(Interval, T, CurrScore);
1061 }
1062 if (Inst.mayStore() &&
1063 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
1064 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the data
1065 // written to LDS can be accessed. A load from LDS to VMEM does not need a wait.
1066 unsigned Slot = 0;
1067 for (const auto *MemOp : Inst.memoperands()) {
1068 if (!MemOp->isStore() ||
1069 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1070 continue;
1071 // Comparing just AA info does not guarantee memoperands are equal
1072 // in general, but this is so for LDS DMA in practice.
1073 auto AAI = MemOp->getAAInfo();
1074 // Alias scope information gives a way to definitely identify an
1075 // original memory object and is, in practice, produced by the module LDS
1076 // lowering pass. If there is no scope available we will not be able
1077 // to disambiguate LDS aliasing as after the module lowering all LDS
1078 // is squashed into a single big object. Do not attempt to use one of
1079 // the limited LDSDMAStores for something we will not be able to use
1080 // anyway.
1081 if (!AAI || !AAI.Scope)
1082 break;
1083 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1084 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1085 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1086 Slot = I + 1;
1087 break;
1088 }
1089 }
1090 }
1091 if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1)
1092 break;
1093 LDSDMAStores.push_back(&Inst);
1094 Slot = LDSDMAStores.size();
1095 break;
1096 }
1097 setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);
1098 if (Slot)
1099 setRegScore(FIRST_LDS_VGPR, T, CurrScore);
1100 }
1101
1102 if (E == SCC_WRITE) {
1103 setRegScore(SCC, T, CurrScore);
1104 PendingSCCWrite = &Inst;
1105 }
1106 }
1107}
1108
1109void WaitcntBrackets::print(raw_ostream &OS) const {
1110 const GCNSubtarget *ST = Context->ST;
1111
1112 OS << '\n';
1113 for (auto T : inst_counter_types(Context->MaxCounter)) {
1114 unsigned SR = getScoreRange(T);
1115
1116 switch (T) {
1117 case LOAD_CNT:
1118 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1119 << SR << "): ";
1120 break;
1121 case DS_CNT:
1122 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1123 << SR << "): ";
1124 break;
1125 case EXP_CNT:
1126 OS << " EXP_CNT(" << SR << "): ";
1127 break;
1128 case STORE_CNT:
1129 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1130 << SR << "): ";
1131 break;
1132 case SAMPLE_CNT:
1133 OS << " SAMPLE_CNT(" << SR << "): ";
1134 break;
1135 case BVH_CNT:
1136 OS << " BVH_CNT(" << SR << "): ";
1137 break;
1138 case KM_CNT:
1139 OS << " KM_CNT(" << SR << "): ";
1140 break;
1141 case X_CNT:
1142 OS << " X_CNT(" << SR << "): ";
1143 break;
1144 default:
1145 OS << " UNKNOWN(" << SR << "): ";
1146 break;
1147 }
1148
1149 if (SR != 0) {
1150 // Print vgpr scores.
1151 unsigned LB = getScoreLB(T);
1152
1153 for (int J = 0; J <= VgprUB; J++) {
1154 unsigned RegScore = getRegScore(J, T);
1155 if (RegScore <= LB)
1156 continue;
1157 unsigned RelScore = RegScore - LB - 1;
1158 if (J < FIRST_LDS_VGPR) {
1159 OS << RelScore << ":v" << J << " ";
1160 } else {
1161 OS << RelScore << ":ds ";
1162 }
1163 }
1164 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1165 if (isSmemCounter(T)) {
1166 for (int J = 0; J <= SgprUB; J++) {
1167 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
1168 if (RegScore <= LB)
1169 continue;
1170 unsigned RelScore = RegScore - LB - 1;
1171 OS << RelScore << ":s" << J << " ";
1172 }
1173 }
1174 if (T == KM_CNT && SCCScore > 0)
1175 OS << SCCScore << ":scc ";
1176 }
1177 OS << '\n';
1178 }
1179
1180 OS << "Pending Events: ";
1181 if (hasPendingEvent()) {
1182 ListSeparator LS;
1183 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1184 if (hasPendingEvent((WaitEventType)I)) {
1185 OS << LS << WaitEventTypeName[I];
1186 }
1187 }
1188 } else {
1189 OS << "none";
1190 }
1191 OS << '\n';
1192
1193 OS << '\n';
1194}
1195
1196/// Simplify the waitcnt by removing counts that are already known to be
1197/// satisfied, so that callers can tell whether a waitcnt instruction is
1198/// needed at all.
1198void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
1199 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1200 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1201 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1202 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1203 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1204 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1205 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1206 simplifyXcnt(Wait, Wait);
1207}
1208
1209void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1210 unsigned &Count) const {
1211 // The number of outstanding events for this type, T, can be calculated
1212 // as (UB - LB). If the current Count is greater than or equal to the number
1213 // of outstanding events, then the wait for this counter is redundant.
1214 if (Count >= getScoreRange(T))
1215 Count = ~0u;
1216}
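// A worked example: if three events of type T are outstanding (UB - LB == 3),
// a requested wait of cnt(3) or looser is already satisfied and is dropped
// (Count becomes ~0u), while a requested cnt(2) is kept because one more
// event must retire first.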
1217
1218void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
1219 AMDGPU::Waitcnt &Wait) const {
1220 const unsigned LB = getScoreLB(T);
1221 const unsigned UB = getScoreUB(T);
1222 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1223 unsigned ScoreToWait = getRegScore(RegNo, T);
1224
1225 // If the score of src_operand falls within the bracket, we need an
1226 // s_waitcnt instruction.
1227 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1228 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1229 !Context->ST->hasFlatLgkmVMemCountInOrder()) {
1230 // If there is a pending FLAT operation, and this is a VMem or LGKM
1231 // waitcnt and the target can report early completion, then we need
1232 // to force a waitcnt 0.
1233 addWait(Wait, T, 0);
1234 } else if (counterOutOfOrder(T)) {
1235 // The counter can get decremented out of order when there are
1236 // multiple event types in the bracket, so also emit an s_wait
1237 // with a conservative value of 0 for this counter.
1238 addWait(Wait, T, 0);
1239 } else {
1240 // If a counter has been maxed out avoid overflow by waiting for
1241 // MAX(CounterType) - 1 instead.
1242 unsigned NeededWait =
1243 std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
1244 addWait(Wait, T, NeededWait);
1245 }
1246 }
1247 }
1248}
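// A worked example for the in-order case above: with LB = 4, UB = 10 and a
// register score of 7, NeededWait = UB - ScoreToWait = 3, i.e. waiting until
// at most three operations of this type remain guarantees that the operation
// which produced score 7 has completed (subject to the MAX - 1 clamp).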
1249
1250void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1251 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1252 // SCC has landed
1253 if (PendingSCCWrite &&
1254 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1255 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1256 unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
1257 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1258 if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
1259 SCC_WRITE_PendingEvent) {
1260 setScoreLB(KM_CNT, getScoreUB(KM_CNT));
1261 }
1262
1263 PendingEvents &= ~SCC_WRITE_PendingEvent;
1264 PendingSCCWrite = nullptr;
1265 }
1266}
1267
1268void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1269 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1270 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1271 applyWaitcnt(DS_CNT, Wait.DsCnt);
1272 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1273 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1274 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1275 applyWaitcnt(KM_CNT, Wait.KmCnt);
1276 applyWaitcnt(X_CNT, Wait.XCnt);
1277}
1278
1279void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1280 const unsigned UB = getScoreUB(T);
1281 if (Count >= UB)
1282 return;
1283 if (Count != 0) {
1284 if (counterOutOfOrder(T))
1285 return;
1286 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1287 } else {
1288 setScoreLB(T, UB);
1289 PendingEvents &= ~Context->WaitEventMaskForInst[T];
1290 }
1291}
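// A worked example: with LB = 3 and UB = 10, applyWaitcnt(T, 2) raises the
// lower bound to max(3, 10 - 2) = 8, recording that at most two operations of
// this type can still be outstanding, while applyWaitcnt(T, 0) sets LB = UB
// and clears all pending events for T. A nonzero count is ignored when the
// counter may complete out of order.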
1292
1293bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
1294 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1295 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1296 // zero.
1297 return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
1298}
1299
1300bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
1301 // If we have a pending store we cannot optimize XCnt because we do not wait
1302 // for stores. VMEM loads return in order, so if we only have loads XCnt is
1303 // decremented to the same number as LOADCnt.
1304 return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1305 !hasPendingEvent(STORE_CNT);
1306}
1307
1308void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait,
1309 AMDGPU::Waitcnt &UpdateWait) {
1310 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1311 // optimizations. On entry to a block with multiple predecessors, there may
1312 // be pending SMEM and VMEM events active at the same time.
1313 // In such cases, only clear one active event at a time.
1314 // TODO: Revisit xcnt optimizations for gfx1250.
1315 if (hasRedundantXCntWithKmCnt(CheckWait)) {
1316 if (!hasMixedPendingEvents(X_CNT)) {
1317 applyWaitcnt(X_CNT, 0);
1318 } else {
1319 PendingEvents &= ~(1 << SMEM_GROUP);
1320 }
1321 } else if (canOptimizeXCntWithLoadCnt(CheckWait)) {
1322 if (!hasMixedPendingEvents(X_CNT)) {
1323 applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt));
1324 } else if (CheckWait.LoadCnt == 0) {
1325 PendingEvents &= ~(1 << VMEM_GROUP);
1326 }
1327 }
1328 simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
1329}
1330
1331// Where there are multiple types of event in the bracket of a counter,
1332// the decrement may go out of order.
1333bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1334 // Scalar memory reads can always complete out of order.
1335 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1336 (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1337 return true;
1338 return hasMixedPendingEvents(T);
1339}
1340
1341INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1342 false, false)
1343INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
1344INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1345INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1346 false, false)
1347
1348char SIInsertWaitcntsLegacy::ID = 0;
1349
1350char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1351
1352FunctionPass *llvm::createSIInsertWaitcntsPass() {
1353 return new SIInsertWaitcntsLegacy();
1354}
1355
1356static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1357 unsigned NewEnc) {
1358 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1359 assert(OpIdx >= 0);
1360
1361 MachineOperand &MO = MI.getOperand(OpIdx);
1362
1363 if (NewEnc == MO.getImm())
1364 return false;
1365
1366 MO.setImm(NewEnc);
1367 return true;
1368}
1369
1370/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1371/// and if so, which counter it is waiting on.
1372static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1373 switch (Opcode) {
1374 case AMDGPU::S_WAIT_LOADCNT:
1375 return LOAD_CNT;
1376 case AMDGPU::S_WAIT_EXPCNT:
1377 return EXP_CNT;
1378 case AMDGPU::S_WAIT_STORECNT:
1379 return STORE_CNT;
1380 case AMDGPU::S_WAIT_SAMPLECNT:
1381 return SAMPLE_CNT;
1382 case AMDGPU::S_WAIT_BVHCNT:
1383 return BVH_CNT;
1384 case AMDGPU::S_WAIT_DSCNT:
1385 return DS_CNT;
1386 case AMDGPU::S_WAIT_KMCNT:
1387 return KM_CNT;
1388 case AMDGPU::S_WAIT_XCNT:
1389 return X_CNT;
1390 default:
1391 return {};
1392 }
1393}
1394
1395bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1396 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1397 if (Opcode == Waitcnt->getOpcode())
1398 return false;
1399
1400 Waitcnt->setDesc(TII->get(Opcode));
1401 return true;
1402}
1403
1404/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1405/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1406/// from \p Wait that were added by previous passes. Currently this pass
1407/// conservatively assumes that these preexisting waits are required for
1408/// correctness.
1409bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1410 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1411 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1412 assert(ST);
1413 assert(isNormalMode(MaxCounter));
1414
1415 bool Modified = false;
1416 MachineInstr *WaitcntInstr = nullptr;
1417 MachineInstr *WaitcntVsCntInstr = nullptr;
1418
1419 LLVM_DEBUG({
1420 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1421 if (It == OldWaitcntInstr.getParent()->instr_end())
1422 dbgs() << "end of block\n";
1423 else
1424 dbgs() << *It;
1425 });
1426
1427 for (auto &II :
1428 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1429 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1430 if (II.isMetaInstruction()) {
1431 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1432 continue;
1433 }
1434
1435 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1436 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1437
1438 // Update required wait count. If this is a soft waitcnt (= it was added
1439 // by an earlier pass), it may be entirely removed.
1440 if (Opcode == AMDGPU::S_WAITCNT) {
1441 unsigned IEnc = II.getOperand(0).getImm();
1442 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1443 if (TrySimplify)
1444 ScoreBrackets.simplifyWaitcnt(OldWait);
1445 Wait = Wait.combined(OldWait);
1446
1447 // Merge consecutive waitcnt of the same type by erasing multiples.
1448 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1449 II.eraseFromParent();
1450 Modified = true;
1451 } else
1452 WaitcntInstr = &II;
1453 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1454 assert(ST->hasVMemToLDSLoad());
1455 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1456 << "Before: " << Wait.LoadCnt << '\n';);
1457 ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
1458 LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
1459
1460 // It is possible (but unlikely) that this is the only wait instruction,
1461 // in which case, we exit this loop without a WaitcntInstr to consume
1462 // `Wait`. But that works because `Wait` was passed in by reference, and
1463 // the callee eventually calls createNewWaitcnt on it. We test this
1464 // possibility in an artificial MIR test since such a situation cannot be
1465 // recreated by running the memory legalizer.
1466 II.eraseFromParent();
1467 } else {
1468 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1469 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1470
1471 unsigned OldVSCnt =
1472 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1473 if (TrySimplify)
1474 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1475 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1476
1477 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1478 II.eraseFromParent();
1479 Modified = true;
1480 } else
1481 WaitcntVsCntInstr = &II;
1482 }
1483 }
1484
1485 if (WaitcntInstr) {
1486 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1487 AMDGPU::encodeWaitcnt(IV, Wait));
1488 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1489
1490 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1491 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1492 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1493 Wait.LoadCnt = ~0u;
1494 Wait.ExpCnt = ~0u;
1495 Wait.DsCnt = ~0u;
1496
1497 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1498 ? dbgs()
1499 << "applied pre-existing waitcnt\n"
1500 << "New Instr at block end: " << *WaitcntInstr << '\n'
1501 : dbgs() << "applied pre-existing waitcnt\n"
1502 << "Old Instr: " << *It
1503 << "New Instr: " << *WaitcntInstr << '\n');
1504 }
1505
1506 if (WaitcntVsCntInstr) {
1507 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1508 AMDGPU::OpName::simm16, Wait.StoreCnt);
1509 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1510
1511 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1512 Wait.StoreCnt = ~0u;
1513
1514 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1515 ? dbgs() << "applied pre-existing waitcnt\n"
1516 << "New Instr at block end: " << *WaitcntVsCntInstr
1517 << '\n'
1518 : dbgs() << "applied pre-existing waitcnt\n"
1519 << "Old Instr: " << *It
1520 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1521 }
1522
1523 return Modified;
1524}
1525
1526/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1527/// required counters in \p Wait
1528bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1529 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1530 AMDGPU::Waitcnt Wait) {
1531 assert(ST);
1532 assert(isNormalMode(MaxCounter));
1533
1534 bool Modified = false;
1535 const DebugLoc &DL = Block.findDebugLoc(It);
1536
1537 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1538 // single instruction while VScnt has its own instruction.
1539 if (Wait.hasWaitExceptStoreCnt()) {
1540 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1541 [[maybe_unused]] auto SWaitInst =
1542 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1543 Modified = true;
1544
1545 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1546 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1547 dbgs() << "New Instr: " << *SWaitInst << '\n');
1548 }
1549
1550 if (Wait.hasWaitStoreCnt()) {
1551 assert(ST->hasVscnt());
1552
1553 [[maybe_unused]] auto SWaitInst =
1554 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1555 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1556 .addImm(Wait.StoreCnt);
1557 Modified = true;
1558
1559 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1560 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1561 dbgs() << "New Instr: " << *SWaitInst << '\n');
1562 }
1563
1564 return Modified;
1565}
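// For illustration (hypothetical values, not from a test): a request of
// Wait = {LoadCnt = 0, ExpCnt = ~0u, DsCnt = 0, StoreCnt = 1} makes the code
// above emit
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0x1
//
// where the first immediate comes from AMDGPU::encodeWaitcnt().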
1566
1567AMDGPU::Waitcnt
1568WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1569 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1570}
1571
1572AMDGPU::Waitcnt
1573WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1574 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1575 ~0u /* XCNT */);
1576}
1577
1578/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1579/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1580/// were added by previous passes. Currently this pass conservatively
1581/// assumes that these preexisting waits are required for correctness.
1582bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1583 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1584 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1585 assert(ST);
1586 assert(!isNormalMode(MaxCounter));
1587
1588 bool Modified = false;
1589 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1590 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1591 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1592
1593 LLVM_DEBUG({
1594 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1595 if (It == OldWaitcntInstr.getParent()->instr_end())
1596 dbgs() << "end of block\n";
1597 else
1598 dbgs() << *It;
1599 });
1600
1601 for (auto &II :
1602 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1603 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1604 if (II.isMetaInstruction()) {
1605 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1606 continue;
1607 }
1608
1609 MachineInstr **UpdatableInstr;
1610
1611 // Update required wait count. If this is a soft waitcnt (= it was added
1612 // by an earlier pass), it may be entirely removed.
1613
1614 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1615 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1616
1617 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1618 // attempt to do more than that either.
1619 if (Opcode == AMDGPU::S_WAITCNT)
1620 continue;
1621
1622 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1623 unsigned OldEnc =
1624 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1625 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1626 if (TrySimplify)
1627 ScoreBrackets.simplifyWaitcnt(OldWait);
1628 Wait = Wait.combined(OldWait);
1629 UpdatableInstr = &CombinedLoadDsCntInstr;
1630 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1631 unsigned OldEnc =
1632 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1633 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1634 if (TrySimplify)
1635 ScoreBrackets.simplifyWaitcnt(OldWait);
1636 Wait = Wait.combined(OldWait);
1637 UpdatableInstr = &CombinedStoreDsCntInstr;
1638 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1639 // Architectures higher than GFX10 do not have direct loads to
1640 // LDS, so no work required here yet.
1641 II.eraseFromParent();
1642 continue;
1643 } else {
1644 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1645 assert(CT.has_value());
1646 unsigned OldCnt =
1647 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1648 if (TrySimplify)
1649 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1650 addWait(Wait, CT.value(), OldCnt);
1651 UpdatableInstr = &WaitInstrs[CT.value()];
1652 }
1653
1654 // Merge consecutive waitcnt of the same type by erasing multiples.
1655 if (!*UpdatableInstr) {
1656 *UpdatableInstr = &II;
1657 } else {
1658 II.eraseFromParent();
1659 Modified = true;
1660 }
1661 }
1662
1663 // Save the pre-combine waitcnt so that the xcnt checks below can use it.
1664 AMDGPU::Waitcnt PreCombine = Wait;
1665 if (CombinedLoadDsCntInstr) {
1666 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1667 // to be waited for. Otherwise, let the instruction be deleted so
1668 // the appropriate single counter wait instruction can be inserted
1669 // instead, when new S_WAIT_*CNT instructions are inserted by
1670 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1671 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1672 // the loop below that deals with single counter instructions.
1673 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1674 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1675 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1676 AMDGPU::OpName::simm16, NewEnc);
1677 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1678 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1679 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1680 Wait.LoadCnt = ~0u;
1681 Wait.DsCnt = ~0u;
1682
1683 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1684 ? dbgs() << "applied pre-existing waitcnt\n"
1685 << "New Instr at block end: "
1686 << *CombinedLoadDsCntInstr << '\n'
1687 : dbgs() << "applied pre-existing waitcnt\n"
1688 << "Old Instr: " << *It << "New Instr: "
1689 << *CombinedLoadDsCntInstr << '\n');
1690 } else {
1691 CombinedLoadDsCntInstr->eraseFromParent();
1692 Modified = true;
1693 }
1694 }
1695
1696 if (CombinedStoreDsCntInstr) {
1697 // Similarly for S_WAIT_STORECNT_DSCNT.
1698 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1699 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1700 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1701 AMDGPU::OpName::simm16, NewEnc);
1702 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1703 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1704 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1705 Wait.StoreCnt = ~0u;
1706 Wait.DsCnt = ~0u;
1707
1708 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1709 ? dbgs() << "applied pre-existing waitcnt\n"
1710 << "New Instr at block end: "
1711 << *CombinedStoreDsCntInstr << '\n'
1712 : dbgs() << "applied pre-existing waitcnt\n"
1713 << "Old Instr: " << *It << "New Instr: "
1714 << *CombinedStoreDsCntInstr << '\n');
1715 } else {
1716 CombinedStoreDsCntInstr->eraseFromParent();
1717 Modified = true;
1718 }
1719 }
1720
1721 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1722 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1723 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1724 // instructions so that createNewWaitcnt() will create new combined
1725 // instructions to replace them.
1726
1727 if (Wait.DsCnt != ~0u) {
1728 // This is a vector of addresses in WaitInstrs pointing to instructions
1729 // that should be removed if they are present.
1730 SmallVector<MachineInstr **, 4> WaitsToErase;
1731
1732 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1733 // both) need to be waited for, ensure that there are no existing
1734 // individual wait count instructions for these.
1735
1736 if (Wait.LoadCnt != ~0u) {
1737 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1738 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1739 } else if (Wait.StoreCnt != ~0u) {
1740 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1741 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1742 }
1743
1744 for (MachineInstr **WI : WaitsToErase) {
1745 if (!*WI)
1746 continue;
1747
1748 (*WI)->eraseFromParent();
1749 *WI = nullptr;
1750 Modified = true;
1751 }
1752 }
1753
1754 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1755 if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
1756 (CT == LOAD_CNT &&
1757 ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
1758 // Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
1759 // due to taking the backedge of a block.
1760 ScoreBrackets.simplifyXcnt(PreCombine, Wait);
1761 }
1762 if (!WaitInstrs[CT])
1763 continue;
1764
1765 unsigned NewCnt = getWait(Wait, CT);
1766 if (NewCnt != ~0u) {
1767 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1768 AMDGPU::OpName::simm16, NewCnt);
1769 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1770
1771 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1772 setNoWait(Wait, CT);
1773
1774 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1775 ? dbgs() << "applied pre-existing waitcnt\n"
1776 << "New Instr at block end: " << *WaitInstrs[CT]
1777 << '\n'
1778 : dbgs() << "applied pre-existing waitcnt\n"
1779 << "Old Instr: " << *It
1780 << "New Instr: " << *WaitInstrs[CT] << '\n');
1781 } else {
1782 WaitInstrs[CT]->eraseFromParent();
1783 Modified = true;
1784 }
1785 }
1786
1787 return Modified;
1788}
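// In summary: consecutive waits of the same kind are merged down to the
// strictest (smallest) count, soft waits proven unnecessary by the score
// brackets are erased, and single-counter waits may be deleted here so that
// createNewWaitcnt() can re-emit them as combined S_WAIT_LOADCNT_DSCNT or
// S_WAIT_STORECNT_DSCNT instructions.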
1789
1790/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1791bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1792 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1793 AMDGPU::Waitcnt Wait) {
1794 assert(ST);
1795 assert(!isNormalMode(MaxCounter));
1796
1797 bool Modified = false;
1798 const DebugLoc &DL = Block.findDebugLoc(It);
1799
1800 // Check for opportunities to use combined wait instructions.
1801 if (Wait.DsCnt != ~0u) {
1802 MachineInstr *SWaitInst = nullptr;
1803
1804 if (Wait.LoadCnt != ~0u) {
1805 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1806
1807 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1808 .addImm(Enc);
1809
1810 Wait.LoadCnt = ~0u;
1811 Wait.DsCnt = ~0u;
1812 } else if (Wait.StoreCnt != ~0u) {
1813 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1814
1815 SWaitInst =
1816 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1817 .addImm(Enc);
1818
1819 Wait.StoreCnt = ~0u;
1820 Wait.DsCnt = ~0u;
1821 }
1822
1823 if (SWaitInst) {
1824 Modified = true;
1825
1826 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1827 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1828 dbgs() << "New Instr: " << *SWaitInst << '\n');
1829 }
1830 }
1831
1832 // Generate an instruction for any remaining counter that needs
1833 // waiting for.
1834
1835 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1836 unsigned Count = getWait(Wait, CT);
1837 if (Count == ~0u)
1838 continue;
1839
1840 [[maybe_unused]] auto SWaitInst =
1841 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1842 .addImm(Count);
1843
1844 Modified = true;
1845
1846 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1847 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1848 dbgs() << "New Instr: " << *SWaitInst << '\n');
1849 }
1850
1851 return Modified;
1852}
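// For example (illustrative, not from the source): LoadCnt = 0 together with
// DsCnt = 0 is emitted as a single combined instruction,
//   s_wait_loadcnt_dscnt 0x0
// while any counter that cannot be paired with DSCNT gets its own
// S_WAIT_*CNT instruction from the loop above.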
1853
1854/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1855 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1856 // Currently all conventions wait, but this may not always be the case.
1857 //
1858 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1859 // sense to omit the wait and do it in the caller.
1860 return true;
1861}
1862
1863 /// \returns true if the callee is expected to resolve any outstanding waits
1864 /// before returning.
1865static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
1866
1867 /// Generate an s_waitcnt instruction to be placed before \p MI.
1868 /// Instructions of a given type return their results in order,
1869 /// but instructions of different types can complete out of order.
1870 /// We rely on this in-order completion
1871 /// and simply assign a score to each memory access instruction.
1872 /// We keep track of the active "score bracket" to determine
1873 /// whether a memory access requires an s_waitcnt,
1874 /// and if so, what the value of each counter must be.
1875 /// The "score bracket" is bounded by the lower-bound and upper-bound
1876 /// scores (*_score_LB and *_score_ub respectively).
1877 /// If FlushVmCnt is true, we also want to generate an s_waitcnt to
1878 /// flush the vmcnt counter here.
1879bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1880 WaitcntBrackets &ScoreBrackets,
1881 MachineInstr *OldWaitcntInstr,
1882 bool FlushVmCnt) {
1883 setForceEmitWaitcnt();
1884
1885 assert(!MI.isMetaInstruction());
1886
1887 AMDGPU::Waitcnt Wait;
1888 const unsigned Opc = MI.getOpcode();
1889
1890 // FIXME: This should have already been handled by the memory legalizer.
1891 // Removing this currently doesn't affect any lit tests, but we need to
1892 // verify that nothing was relying on this. The number of buffer invalidates
1893 // being handled here should not be expanded.
1894 if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
1895 Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
1896 Opc == AMDGPU::BUFFER_GL1_INV) {
1897 Wait.LoadCnt = 0;
1898 }
1899
1900 // All waits must be resolved at call return.
1901 // NOTE: this could be improved with knowledge of all call sites or
1902 // with knowledge of the called routines.
1903 if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
1904 Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
1905 Opc == AMDGPU::S_SETPC_B64_return ||
1906 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1907 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1908 }
1909 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
1910 // Technically the hardware will do this on its own if we don't, but that
1911 // might cost extra cycles compared to doing it explicitly.
1912 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
1913 // have to wait for outstanding VMEM stores. In this case it can be useful to
1914 // send a message to explicitly release all VGPRs before the stores have
1915 // completed, but it is only safe to do this if there are no outstanding
1916 // scratch stores.
1917 else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
1918 if (!WCG->isOptNone() &&
1919 (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
1920 (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
1921 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1922 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
1923 ReleaseVGPRInsts.insert(&MI);
1924 }
1925 // Resolve vm waits before gs-done.
1926 else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
1927 ST->hasLegacyGeometry() &&
1928 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1929 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1930 Wait.LoadCnt = 0;
1931 }
1932
1933 // Export & GDS instructions do not read the EXEC mask until after the export
1934 // is granted (which can occur well after the instruction is issued).
1935 // The shader program must flush all EXP operations on the export-count
1936 // before overwriting the EXEC mask.
1937 else {
1938 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1939 // Export and GDS are tracked individually, either may trigger a waitcnt
1940 // for EXEC.
1941 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1942 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1943 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1944 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1945 Wait.ExpCnt = 0;
1946 }
1947 }
1948
1949 // Wait for any pending GDS instruction to complete before any
1950 // "Always GDS" instruction.
1951 if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
1952 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
1953
1954 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1955 // The function is going to insert a wait on everything in its prolog.
1956 // This still needs to be careful if the call target is a load (e.g. a GOT
1957 // load). We also need to check WAW dependency with saved PC.
1958 Wait = AMDGPU::Waitcnt();
1959
1960 const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1961 if (CallAddrOp.isReg()) {
1962 RegInterval CallAddrOpInterval =
1963 ScoreBrackets.getRegInterval(&MI, CallAddrOp);
1964
1965 ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
1966 Wait);
1967
1968 if (const auto *RtnAddrOp =
1969 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
1970 RegInterval RtnAddrOpInterval =
1971 ScoreBrackets.getRegInterval(&MI, *RtnAddrOp);
1972
1973 ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
1974 Wait);
1975 }
1976 }
1977 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
1978 ScoreBrackets.tryClearSCCWriteEvent(&MI);
1979 } else {
1980 // FIXME: Should not be relying on memoperands.
1981 // Look at the source operands of every instruction to see if
1982 // any of them results from a previous memory operation that affects
1983 // its current usage. If so, an s_waitcnt instruction needs to be
1984 // emitted.
1985 // If the source operand was defined by a load, add the s_waitcnt
1986 // instruction.
1987 //
1988 // Two cases are handled for destination operands:
1989 // 1) If the destination operand was defined by a load, add the s_waitcnt
1990 // instruction to guarantee the right WAW order.
1991 // 2) If a destination operand was used by a recent export/store instruction,
1992 // add an s_waitcnt on exp_cnt to guarantee the WAR order.
1993
1994 for (const MachineMemOperand *Memop : MI.memoperands()) {
1995 const Value *Ptr = Memop->getValue();
1996 if (Memop->isStore()) {
1997 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
1998 addWait(Wait, SmemAccessCounter, 0);
1999 if (PDT->dominates(MI.getParent(), It->second))
2000 SLoadAddresses.erase(It);
2001 }
2002 }
2003 unsigned AS = Memop->getAddrSpace();
2004 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
2005 continue;
2006 // No need to wait before load from VMEM to LDS.
2007 if (TII->mayWriteLDSThroughDMA(MI))
2008 continue;
2009
2010 // LOAD_CNT is only relevant to vgpr or LDS.
2011 unsigned RegNo = FIRST_LDS_VGPR;
2012 if (Ptr && Memop->getAAInfo()) {
2013 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2014 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2015 if (MI.mayAlias(AA, *LDSDMAStores[I], true))
2016 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
2017 }
2018 } else {
2019 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
2020 }
2021 if (Memop->isStore()) {
2022 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
2023 }
2024 }
2025
2026 // Loop over use and def operands.
2027 for (const MachineOperand &Op : MI.operands()) {
2028 if (!Op.isReg())
2029 continue;
2030
2031 // If the instruction does not read tied source, skip the operand.
2032 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
2033 continue;
2034
2035 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, Op);
2036
2037 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
2038 if (IsVGPR) {
2039 // Implicit VGPR defs and uses are never part of the memory
2040 // instruction's description and are usually present only to account for
2041 // super-register liveness.
2042 // TODO: Most of the other instructions also have implicit uses
2043 // for the liveness accounting only.
2044 if (Op.isImplicit() && MI.mayLoadOrStore())
2045 continue;
2046
2047 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2048 // previous write and this write are the same type of VMEM
2049 // instruction, in which case they are (in some architectures)
2050 // guaranteed to write their results in order anyway.
2051 // Additionally check instructions where Point Sample Acceleration
2052 // might be applied.
2053 if (Op.isUse() || !updateVMCntOnly(MI) ||
2054 ScoreBrackets.hasOtherPendingVmemTypes(Interval,
2055 getVmemType(MI)) ||
2056 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) ||
2057 !ST->hasVmemWriteVgprInOrder()) {
2058 ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);
2059 ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);
2060 ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
2061 ScoreBrackets.clearVgprVmemTypes(Interval);
2062 }
2063
2064 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2065 ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
2066 }
2067 ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
2068 } else if (Op.getReg() == AMDGPU::SCC) {
2069 ScoreBrackets.determineWait(KM_CNT, Interval, Wait);
2070 } else {
2071 ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
2072 }
2073
2074 if (ST->hasWaitXCnt() && Op.isDef())
2075 ScoreBrackets.determineWait(X_CNT, Interval, Wait);
2076 }
2077 }
2078 }
2079
2080 // Ensure safety against exceptions from outstanding memory operations while
2081 // waiting for a barrier:
2082 //
2083 // * Some subtargets safely handle backing off the barrier in hardware
2084 // when an exception occurs.
2085 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2086 // there can be no outstanding memory operations during the wait.
2087 // * Subtargets with split barriers don't need to back off the barrier; it
2088 // is up to the trap handler to preserve the user barrier state correctly.
2089 //
2090 // In all other cases, ensure safety by ensuring that there are no outstanding
2091 // memory operations.
2092 if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
2093 !ST->supportsBackOffBarrier()) {
2094 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2095 }
2096
2097 // TODO: Remove this work-around, enable the assert for Bug 457939
2098 // after fixing the scheduler. Also, the Shader Compiler code is
2099 // independent of target.
2100 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
2101 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2102 Wait.DsCnt = 0;
2103 }
2104
2105 // Verify that the wait is actually needed.
2106 ScoreBrackets.simplifyWaitcnt(Wait);
2107
2108 // Since the translation of VMEM addresses occurs in order, we can apply the
2109 // XCnt if the current instruction is of VMEM type and has a memory
2110 // dependency with another VMEM instruction in flight.
2111 if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
2112 ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
2113 Wait.XCnt = ~0u;
2114 }
2115
2116 // When forcing emission, skip terminators: emitting a waitcnt between the
2117 // MBB's terminators would break the block's terminator sequence.
2118 if (ForceEmitZeroFlag && !MI.isTerminator())
2119 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2120
2121 if (ForceEmitWaitcnt[LOAD_CNT])
2122 Wait.LoadCnt = 0;
2123 if (ForceEmitWaitcnt[EXP_CNT])
2124 Wait.ExpCnt = 0;
2125 if (ForceEmitWaitcnt[DS_CNT])
2126 Wait.DsCnt = 0;
2127 if (ForceEmitWaitcnt[SAMPLE_CNT])
2128 Wait.SampleCnt = 0;
2129 if (ForceEmitWaitcnt[BVH_CNT])
2130 Wait.BvhCnt = 0;
2131 if (ForceEmitWaitcnt[KM_CNT])
2132 Wait.KmCnt = 0;
2133 if (ForceEmitWaitcnt[X_CNT])
2134 Wait.XCnt = 0;
2135
2136 if (FlushVmCnt) {
2137 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2138 Wait.LoadCnt = 0;
2139 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2140 Wait.SampleCnt = 0;
2141 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2142 Wait.BvhCnt = 0;
2143 }
2144
2145 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
2146 Wait.LoadCnt = 0;
2147
2148 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2149 OldWaitcntInstr);
2150}
2151
2152bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2153 MachineBasicBlock::instr_iterator It,
2154 MachineBasicBlock &Block,
2155 WaitcntBrackets &ScoreBrackets,
2156 MachineInstr *OldWaitcntInstr) {
2157 bool Modified = false;
2158
2159 if (OldWaitcntInstr)
2160 // Try to merge the required wait with preexisting waitcnt instructions.
2161 // Also erase redundant waitcnt.
2162 Modified =
2163 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2164
2165 // Any counts that could be applied to existing waitcnt instructions have
2166 // already been applied; now deal with any that remain.
2167 ScoreBrackets.applyWaitcnt(Wait);
2168
2169 // ExpCnt can be merged into VINTERP.
2170 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
2171 SIInstrInfo::isVINTERP(*It)) {
2172 MachineOperand *WaitExp =
2173 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2174 if (Wait.ExpCnt < WaitExp->getImm()) {
2175 WaitExp->setImm(Wait.ExpCnt);
2176 Modified = true;
2177 }
2178 Wait.ExpCnt = ~0u;
2179
2180 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2181 << "Update Instr: " << *It);
2182 }
2183
2184 if (WCG->createNewWaitcnt(Block, It, Wait))
2185 Modified = true;
2186
2187 return Modified;
2188}
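// For example (illustrative): if the required ExpCnt is 0 and the
// instruction at It is a VINTERP whose waitexp immediate is larger, that
// immediate is simply lowered to 0 above and no separate export-count wait
// is emitted.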
2189
2190bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2191 return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
2192 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2193}
2194
2195// Return true if the next instruction is S_ENDPGM, following fallthrough
2196// blocks if necessary.
2197bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2198 MachineBasicBlock *Block) const {
2199 auto BlockEnd = Block->getParent()->end();
2200 auto BlockIter = Block->getIterator();
2201
2202 while (true) {
2203 if (It.isEnd()) {
2204 if (++BlockIter != BlockEnd) {
2205 It = BlockIter->instr_begin();
2206 continue;
2207 }
2208
2209 return false;
2210 }
2211
2212 if (!It->isMetaInstruction())
2213 break;
2214
2215 It++;
2216 }
2217
2218 assert(!It.isEnd());
2219
2220 return It->getOpcode() == AMDGPU::S_ENDPGM;
2221}
2222
2223// Add a wait after an instruction if architecture requirements mandate one.
2224bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2225 MachineBasicBlock &Block,
2226 WaitcntBrackets &ScoreBrackets) {
2227 AMDGPU::Waitcnt Wait;
2228 bool NeedsEndPGMCheck = false;
2229
2230 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2231 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2232 !SIInstrInfo::isAtomicRet(Inst));
2233
2234 if (TII->isAlwaysGDS(Inst.getOpcode())) {
2235 Wait.DsCnt = 0;
2236 NeedsEndPGMCheck = true;
2237 }
2238
2239 ScoreBrackets.simplifyWaitcnt(Wait);
2240
2241 auto SuccessorIt = std::next(Inst.getIterator());
2242 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2243 /*OldWaitcntInstr=*/nullptr);
2244
2245 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2246 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
2247 .addImm(0);
2248 }
2249
2250 return Result;
2251}
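// For example, "always GDS" instructions get a trailing wait on DS_CNT; if
// the next real instruction is S_ENDPGM, an S_NOP is inserted between them
// as well (an architectural requirement handled here).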
2252
2253void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2254 WaitcntBrackets *ScoreBrackets) {
2255 // Now look at the instruction opcode. If it is a memory access
2256 // instruction, update the upper-bound of the appropriate counter's
2257 // bracket and the destination operand scores.
2258 // For architectures with X_CNT, mark the source address operands
2259 // with the appropriate counter values.
2260 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2261
2262 bool IsVMEMAccess = false;
2263 bool IsSMEMAccess = false;
2264 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2265 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2266 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2267 ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
2268 ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
2269 ScoreBrackets->setPendingGDS();
2270 } else {
2271 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2272 }
2273 } else if (TII->isFLAT(Inst)) {
2274 if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) {
2275 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2276 return;
2277 }
2278
2279 assert(Inst.mayLoadOrStore());
2280
2281 int FlatASCount = 0;
2282
2283 if (TII->mayAccessVMEMThroughFlat(Inst)) {
2284 ++FlatASCount;
2285 IsVMEMAccess = true;
2286 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2287 }
2288
2289 if (TII->mayAccessLDSThroughFlat(Inst)) {
2290 ++FlatASCount;
2291 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2292 }
2293
2294 // This is a flat memory operation that accesses both VMEM and LDS, so note it
2295 // - it will require that both the VM and LGKM be flushed to zero if it is
2296 // pending when a VM or LGKM dependency occurs.
2297 if (FlatASCount > 1)
2298 ScoreBrackets->setPendingFlat();
2299 } else if (SIInstrInfo::isVMEM(Inst) &&
2300 !AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2301 IsVMEMAccess = true;
2302 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2303
2304 if (ST->vmemWriteNeedsExpWaitcnt() &&
2305 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2306 ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
2307 }
2308 } else if (TII->isSMRD(Inst)) {
2309 IsSMEMAccess = true;
2310 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2311 } else if (Inst.isCall()) {
2312 if (callWaitsOnFunctionReturn(Inst)) {
2313 // Act as a wait on everything
2314 ScoreBrackets->applyWaitcnt(
2315 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2316 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2317 } else {
2318 // May need to wait for anything.
2319 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2320 }
2321 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2322 ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
2323 } else if (TII->isVINTERP(Inst)) {
2324 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2325 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2326 } else if (SIInstrInfo::isEXP(Inst)) {
2327 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2328 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2329 ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
2330 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2331 ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
2332 else
2333 ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
2334 } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
2335 ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
2336 } else {
2337 switch (Inst.getOpcode()) {
2338 case AMDGPU::S_SENDMSG:
2339 case AMDGPU::S_SENDMSG_RTN_B32:
2340 case AMDGPU::S_SENDMSG_RTN_B64:
2341 case AMDGPU::S_SENDMSGHALT:
2342 ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
2343 break;
2344 case AMDGPU::S_MEMTIME:
2345 case AMDGPU::S_MEMREALTIME:
2346 case AMDGPU::S_GET_BARRIER_STATE_M0:
2347 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2348 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2349 break;
2350 }
2351 }
2352
2353 if (!ST->hasWaitXCnt())
2354 return;
2355
2356 if (IsVMEMAccess)
2357 ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);
2358
2359 if (IsSMEMAccess)
2360 ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
2361}
2362
2363bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2364 unsigned OtherScore) {
2365 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2366 unsigned OtherShifted =
2367 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2368 Score = std::max(MyShifted, OtherShifted);
2369 return OtherShifted > MyShifted;
2370}
2371
2372 /// Merge the pending events and associated score brackets of \p Other into
2373/// this brackets status.
2374///
2375/// Returns whether the merge resulted in a change that requires tighter waits
2376/// (i.e. the merged brackets strictly dominate the original brackets).
2377bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2378 bool StrictDom = false;
2379
2380 VgprUB = std::max(VgprUB, Other.VgprUB);
2381 SgprUB = std::max(SgprUB, Other.SgprUB);
2382
2383 for (auto T : inst_counter_types(Context->MaxCounter)) {
2384 // Merge event flags for this counter
2385 const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
2386 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2387 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2388 if (OtherEvents & ~OldEvents)
2389 StrictDom = true;
2390 PendingEvents |= OtherEvents;
2391
2392 // Merge scores for this counter
2393 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2394 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2395 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2396 if (NewUB < ScoreLBs[T])
2397 report_fatal_error("waitcnt score overflow");
2398
2399 MergeInfo M;
2400 M.OldLB = ScoreLBs[T];
2401 M.OtherLB = Other.ScoreLBs[T];
2402 M.MyShift = NewUB - ScoreUBs[T];
2403 M.OtherShift = NewUB - Other.ScoreUBs[T];
2404
2405 ScoreUBs[T] = NewUB;
2406
2407 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2408
2409 if (T == DS_CNT)
2410 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2411
2412 if (T == KM_CNT) {
2413 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
2414 if (Other.hasPendingEvent(SCC_WRITE)) {
2415 unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
2416 if (!OldEventsHasSCCWrite) {
2417 PendingSCCWrite = Other.PendingSCCWrite;
2418 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
2419 PendingSCCWrite = nullptr;
2420 }
2421 }
2422 }
2423
2424 for (int J = 0; J <= VgprUB; J++)
2425 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
2426
2427 if (isSmemCounter(T)) {
2428 unsigned Idx = getSgprScoresIdx(T);
2429 for (int J = 0; J <= SgprUB; J++)
2430 StrictDom |=
2431 mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]);
2432 }
2433 }
2434
2435 for (int J = 0; J <= VgprUB; J++) {
2436 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2437 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2438 VgprVmemTypes[J] = NewVmemTypes;
2439 }
2440
2441 return StrictDom;
2442}
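// Worked example (illustrative): suppose this bracket has [LB, UB] = [10, 13]
// for some counter and Other has [0, 2]. Then MyPending = 3, OtherPending = 2,
// NewUB = 13, M.MyShift = 0 and M.OtherShift = 11. In mergeScore() an Other
// score of 2 is rebased to 13 and a score of 1 to 12, while a score of 0
// (<= Other's lower bound, i.e. nothing pending) stays 0; the merged score is
// the maximum of the two rebased values.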
2443
2444static bool isWaitInstr(MachineInstr &Inst) {
2445 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2446 return Opcode == AMDGPU::S_WAITCNT ||
2447 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2448 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2449 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2450 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2451 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2452 counterTypeForInstr(Opcode).has_value();
2453}
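// That is: a legacy s_waitcnt, an s_waitcnt_vscnt on the null register, a
// combined S_WAIT_LOADCNT_DSCNT / S_WAIT_STORECNT_DSCNT, the lds_direct
// pseudo, or any single-counter S_WAIT_*CNT; the _soft variants are covered
// because getNonSoftWaitcntOpcode() maps them to their hard counterparts
// first.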
2454
2455// Generate s_waitcnt instructions where needed.
2456bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2457 MachineBasicBlock &Block,
2458 WaitcntBrackets &ScoreBrackets) {
2459 bool Modified = false;
2460
2461 LLVM_DEBUG({
2462 dbgs() << "*** Begin Block: ";
2463 Block.printName(dbgs());
2464 ScoreBrackets.dump();
2465 });
2466
2467 // Track the correctness of vccz through this basic block. There are two
2468 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2469 // ST->partialVCCWritesUpdateVCCZ().
2470 bool VCCZCorrect = true;
2471 if (ST->hasReadVCCZBug()) {
2472 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2473 // to vcc and then issued an smem load.
2474 VCCZCorrect = false;
2475 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2476 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2477 // to vcc_lo or vcc_hi.
2478 VCCZCorrect = false;
2479 }
2480
2481 // Walk over the instructions.
2482 MachineInstr *OldWaitcntInstr = nullptr;
2483
2484 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2485 E = Block.instr_end();
2486 Iter != E;) {
2487 MachineInstr &Inst = *Iter;
2488 if (Inst.isMetaInstruction()) {
2489 ++Iter;
2490 continue;
2491 }
2492
2493 // Track pre-existing waitcnts that were added in earlier iterations or by
2494 // the memory legalizer.
2495 if (isWaitInstr(Inst)) {
2496 if (!OldWaitcntInstr)
2497 OldWaitcntInstr = &Inst;
2498 ++Iter;
2499 continue;
2500 }
2501
2502 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2503 isPreheaderToFlush(Block, ScoreBrackets);
2504
2505 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2506 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2507 FlushVmCnt);
2508 OldWaitcntInstr = nullptr;
2509
2510 // Restore vccz if it's not known to be correct already.
2511 bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst);
2512
2513 // Don't examine operands unless we need to track vccz correctness.
2514 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2515 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2516 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2517 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2518 if (!ST->partialVCCWritesUpdateVCCZ())
2519 VCCZCorrect = false;
2520 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2521 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2522 // vccz bit, so when we detect that an instruction may read from a
2523 // corrupt vccz bit, we need to:
2524 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2525 // operations to complete.
2526 // 2. Restore the correct value of vccz by writing the current value
2527 // of vcc back to vcc.
2528 if (ST->hasReadVCCZBug() &&
2529 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2530 // Writes to vcc while there's an outstanding smem read may get
2531 // clobbered as soon as any read completes.
2532 VCCZCorrect = false;
2533 } else {
2534 // Writes to vcc will fix any incorrect value in vccz.
2535 VCCZCorrect = true;
2536 }
2537 }
2538 }
2539
2540 if (TII->isSMRD(Inst)) {
2541 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2542 // No need to handle invariant loads when avoiding WAR conflicts, as
2543 // there cannot be a vector store to the same memory location.
2544 if (!Memop->isInvariant()) {
2545 const Value *Ptr = Memop->getValue();
2546 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2547 }
2548 }
2549 if (ST->hasReadVCCZBug()) {
2550 // This smem read could complete and clobber vccz at any time.
2551 VCCZCorrect = false;
2552 }
2553 }
2554
2555 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2556
2557 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
2558
2559 LLVM_DEBUG({
2560 Inst.print(dbgs());
2561 ScoreBrackets.dump();
2562 });
2563
2564 // TODO: Remove this work-around after fixing the scheduler and enable the
2565 // assert above.
2566 if (RestoreVCCZ) {
2567 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2568 // bit is updated, so we can restore the bit by reading the value of
2569 // vcc and then writing it back to the register.
2570 BuildMI(Block, Inst, Inst.getDebugLoc(),
2571 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2572 TRI->getVCC())
2573 .addReg(TRI->getVCC());
2574 VCCZCorrect = true;
2575 Modified = true;
2576 }
2577
2578 ++Iter;
2579 }
2580
2581 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2582 // needed.
2583 AMDGPU::Waitcnt Wait;
2584 if (Block.getFirstTerminator() == Block.end() &&
2585 isPreheaderToFlush(Block, ScoreBrackets)) {
2586 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2587 Wait.LoadCnt = 0;
2588 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2589 Wait.SampleCnt = 0;
2590 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2591 Wait.BvhCnt = 0;
2592 }
2593
2594 // Combine or remove any redundant waitcnts at the end of the block.
2595 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2596 OldWaitcntInstr);
2597
2598 LLVM_DEBUG({
2599 dbgs() << "*** End Block: ";
2600 Block.printName(dbgs());
2601 ScoreBrackets.dump();
2602 });
2603
2604 return Modified;
2605}
2606
2607// Return true if the given machine basic block is a preheader of a loop in
2608// which we want to flush the vmcnt counter, and false otherwise.
2609bool SIInsertWaitcnts::isPreheaderToFlush(
2610 MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
2611 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2612 if (!IsInserted)
2613 return Iterator->second;
2614
2615 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2616 if (!Succ)
2617 return false;
2618
2619 MachineLoop *Loop = MLI->getLoopFor(Succ);
2620 if (!Loop)
2621 return false;
2622
2623 if (Loop->getLoopPreheader() == &MBB &&
2624 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2625 Iterator->second = true;
2626 return true;
2627 }
2628
2629 return false;
2630}
2631
2632bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2633 if (SIInstrInfo::isFLAT(MI))
2634 return TII->mayAccessVMEMThroughFlat(MI);
2635 return SIInstrInfo::isVMEM(MI);
2636}
2637
2638// Return true if it is better to flush the vmcnt counter in the preheader of
2639// the given loop. We currently decide to flush in two situations:
2640// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2641// vgpr containing a value that is loaded outside of the loop. (Only on
2642// targets with no vscnt counter).
2643// 2. The loop contains vmem load(s), but the loaded values are not used in the
2644// loop, and at least one use of a vgpr containing a value that is loaded
2645// outside of the loop.
2646bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2647 const WaitcntBrackets &Brackets) {
2648 bool HasVMemLoad = false;
2649 bool HasVMemStore = false;
2650 bool UsesVgprLoadedOutside = false;
2651 DenseSet<Register> VgprUse;
2652 DenseSet<Register> VgprDef;
2653
2654 for (MachineBasicBlock *MBB : ML->blocks()) {
2655 for (MachineInstr &MI : *MBB) {
2656 if (isVMEMOrFlatVMEM(MI)) {
2657 HasVMemLoad |= MI.mayLoad();
2658 HasVMemStore |= MI.mayStore();
2659 }
2660
2661 for (const MachineOperand &Op : MI.all_uses()) {
2662 if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
2663 continue;
2664 RegInterval Interval = Brackets.getRegInterval(&MI, Op);
2665 // Vgpr use
2666 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2667 // If we find a register that is loaded inside the loop, 1. and 2.
2668 // are invalidated and we can exit.
2669 if (VgprDef.contains(RegNo))
2670 return false;
2671 VgprUse.insert(RegNo);
2672 // If at least one of Op's registers is in the score brackets, the
2673 // value is likely loaded outside of the loop.
2674 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2675 Brackets.getScoreLB(LOAD_CNT) ||
2676 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2677 Brackets.getScoreLB(SAMPLE_CNT) ||
2678 Brackets.getRegScore(RegNo, BVH_CNT) >
2679 Brackets.getScoreLB(BVH_CNT)) {
2680 UsesVgprLoadedOutside = true;
2681 break;
2682 }
2683 }
2684 }
2685
2686 // VMem load vgpr def
2687 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
2688 for (const MachineOperand &Op : MI.all_defs()) {
2689 RegInterval Interval = Brackets.getRegInterval(&MI, Op);
2690 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2691 // If we find a register that is loaded inside the loop, 1. and 2.
2692 // are invalidated and we can exit.
2693 if (VgprUse.contains(RegNo))
2694 return false;
2695 VgprDef.insert(RegNo);
2696 }
2697 }
2698 }
2699 }
2700 }
2701 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2702 return true;
2703 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2704}
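// The intent is that when a loop only consumes values loaded before it was
// entered, a single vmcnt flush in the preheader replaces waits that would
// otherwise have to execute inside the loop on every iteration.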
2705
2706bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
2707 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2708 auto *PDT =
2709 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2710 AliasAnalysis *AA = nullptr;
2711 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2712 AA = &AAR->getAAResults();
2713
2714 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
2715}
2716
2717PreservedAnalyses
2718 SIInsertWaitcntsPass::run(MachineFunction &MF,
2719 MachineFunctionAnalysisManager &MFAM) {
2720 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
2721 auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
2722 auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2723 .getManager()
2724 .getCachedResult<AAManager>(MF.getFunction());
2725
2726 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
2727 return PreservedAnalyses::all();
2728
2729 return getMachineFunctionPassPreservedAnalyses()
2730 .preserveSet<CFGAnalyses>()
2731 .preserve<AAManager>();
2732}
2733
2734bool SIInsertWaitcnts::run(MachineFunction &MF) {
2735 ST = &MF.getSubtarget<GCNSubtarget>();
2736 TII = ST->getInstrInfo();
2737 TRI = &TII->getRegisterInfo();
2738 MRI = &MF.getRegInfo();
2740
2742
2743 if (ST->hasExtendedWaitCounts()) {
2744 MaxCounter = NUM_EXTENDED_INST_CNTS;
2745 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2746 WCG = &WCGGFX12Plus;
2747 } else {
2748 MaxCounter = NUM_NORMAL_INST_CNTS;
2749 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
2750 WCG = &WCGPreGFX12;
2751 }
2752
2753 for (auto T : inst_counter_types())
2754 ForceEmitWaitcnt[T] = false;
2755
2756 WaitEventMaskForInst = WCG->getWaitEventMask();
2757
2758 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2759
2760 if (ST->hasExtendedWaitCounts()) {
2761 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2762 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2763 } else {
2764 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2765 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2766 }
2767 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2768 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2769 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2770 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2771 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2772 Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
2773
2774 [[maybe_unused]] unsigned NumVGPRsMax =
2775 ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
2776 [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2777 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2778 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2779
2780 BlockInfos.clear();
2781 bool Modified = false;
2782
2783 MachineBasicBlock &EntryBB = MF.front();
2784 MachineBasicBlock::iterator I = EntryBB.begin();
2785
2786 if (!MFI->isEntryFunction()) {
2787 // Wait for any outstanding memory operations that the input registers may
2788 // depend on. We can't track them and it's better to do the wait after the
2789 // costly call sequence.
2790
2791 // TODO: Could insert earlier and schedule more liberally with operations
2792 // that only use caller preserved registers.
2793 for (MachineBasicBlock::iterator E = EntryBB.end();
2794 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2795 ;
2796
2797 if (ST->hasExtendedWaitCounts()) {
2798 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2799 .addImm(0);
2800 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2801 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
2802 continue;
2803
2804 if (!ST->hasImageInsts() &&
2805 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
2806 continue;
2807
2808 BuildMI(EntryBB, I, DebugLoc(),
2809 TII->get(instrsForExtendedCounterTypes[CT]))
2810 .addImm(0);
2811 }
2812 } else {
2813 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2814 }
2815
2816 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
2817 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2818 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2819
2820 Modified = true;
2821 }
2822
2823 // Keep iterating over the blocks in reverse post order, inserting and
2824 // updating s_waitcnt where needed, until a fix point is reached.
2825 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2826 BlockInfos.try_emplace(MBB);
2827
2828 std::unique_ptr<WaitcntBrackets> Brackets;
2829 bool Repeat;
2830 do {
2831 Repeat = false;
2832
2833 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2834 ++BII) {
2835 MachineBasicBlock *MBB = BII->first;
2836 BlockInfo &BI = BII->second;
2837 if (!BI.Dirty)
2838 continue;
2839
2840 if (BI.Incoming) {
2841 if (!Brackets)
2842 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2843 else
2844 *Brackets = *BI.Incoming;
2845 } else {
2846 if (!Brackets) {
2847 Brackets = std::make_unique<WaitcntBrackets>(this);
2848 } else {
2849 // Reinitialize in-place. N.B. do not do this by assigning from a
2850 // temporary because the WaitcntBrackets class is large and it could
2851 // cause this function to use an unreasonable amount of stack space.
2852 Brackets->~WaitcntBrackets();
2853 new (Brackets.get()) WaitcntBrackets(this);
2854 }
2855 }
2856
2857 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2858 BI.Dirty = false;
2859
2860 if (Brackets->hasPendingEvent()) {
2861 BlockInfo *MoveBracketsToSucc = nullptr;
2862 for (MachineBasicBlock *Succ : MBB->successors()) {
2863 auto *SuccBII = BlockInfos.find(Succ);
2864 BlockInfo &SuccBI = SuccBII->second;
2865 if (!SuccBI.Incoming) {
2866 SuccBI.Dirty = true;
2867 if (SuccBII <= BII) {
2868 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2869 Repeat = true;
2870 }
2871 if (!MoveBracketsToSucc) {
2872 MoveBracketsToSucc = &SuccBI;
2873 } else {
2874 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2875 }
2876 } else if (SuccBI.Incoming->merge(*Brackets)) {
2877 SuccBI.Dirty = true;
2878 if (SuccBII <= BII) {
2879 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2880 Repeat = true;
2881 }
2882 }
2883 }
2884 if (MoveBracketsToSucc)
2885 MoveBracketsToSucc->Incoming = std::move(Brackets);
2886 }
2887 }
2888 } while (Repeat);
2889
2890 if (ST->hasScalarStores()) {
2891 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2892 bool HaveScalarStores = false;
2893
2894 for (MachineBasicBlock &MBB : MF) {
2895 for (MachineInstr &MI : MBB) {
2896 if (!HaveScalarStores && TII->isScalarStore(MI))
2897 HaveScalarStores = true;
2898
2899 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2900 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2901 EndPgmBlocks.push_back(&MBB);
2902 }
2903 }
2904
2905 if (HaveScalarStores) {
2906 // If scalar writes are used, the cache must be flushed or else the next
2907 // wave to reuse the same scratch memory can be clobbered.
2908 //
2909 // Insert s_dcache_wb at wave termination points if there were any scalar
2910 // stores, and only if the cache hasn't already been flushed. This could
2911 // be improved by looking across blocks for flushes in postdominating
2912 // blocks from the stores but an explicitly requested flush is probably
2913 // very rare.
2914 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2915 bool SeenDCacheWB = false;
2916
2917 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2918 I != E; ++I) {
2919 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2920 SeenDCacheWB = true;
2921 else if (TII->isScalarStore(*I))
2922 SeenDCacheWB = false;
2923
2924 // FIXME: It would be better to insert this before a waitcnt if any.
2925 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2926 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2927 !SeenDCacheWB) {
2928 Modified = true;
2929 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2930 }
2931 }
2932 }
2933 }
2934 }
2935
2936 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
2937 // This is done in different ways depending on how the VGPRs were allocated
2938 // (i.e. whether we're in dynamic VGPR mode or not).
2939 // Skip deallocation if the kernel is waveslot-limited rather than VGPR-limited;
2940 // a short waveslot-limited kernel runs slower with the deallocation.
2941 if (MFI->isDynamicVGPREnabled()) {
2942 for (MachineInstr *MI : ReleaseVGPRInsts) {
2943 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2944 TII->get(AMDGPU::S_ALLOC_VGPR))
2945 .addImm(0);
2946 Modified = true;
2947 }
2948 } else {
2949 if (!ReleaseVGPRInsts.empty() &&
2950 (MF.getFrameInfo().hasCalls() ||
2951 ST->getOccupancyWithNumVGPRs(
2952 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
2953 /*IsDynamicVGPR=*/false) <
2954 AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
2955 for (MachineInstr *MI : ReleaseVGPRInsts) {
2956 if (ST->requiresNopBeforeDeallocVGPRs()) {
2957 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2958 TII->get(AMDGPU::S_NOP))
2959 .addImm(0);
2960 }
2961 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2962 TII->get(AMDGPU::S_SENDMSG))
2963 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2964 Modified = true;
2965 }
2966 }
2967 }
2968 ReleaseVGPRInsts.clear();
2969 PreheadersToFlush.clear();
2970 SLoadAddresses.clear();
2971
2972 return Modified;
2973}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
#define AMDGPU_EVENT_NAME(Name)
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
static bool isCounterSet(unsigned ID)
static bool shouldExecute(unsigned CounterName)
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
bool isMetaInstruction(QueryType Type=IgnoreBundle) const
Return true if this instruction doesn't produce any output in the form of executable instructions.
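A small sketch combining several of the MachineInstr queries listed above; the predicate itself is illustrative, not taken from this pass.
  #include "llvm/CodeGen/MachineInstr.h"
  using namespace llvm;

  // Sketch: does MI touch memory and (explicitly or implicitly) define Reg?
  static bool mayAccessMemoryAndDefine(const MachineInstr &MI, Register Reg) {
    if (!MI.mayLoadOrStore() || MI.isMetaInstruction())
      return false;
    for (const MachineOperand &MO : MI.all_defs())
      if (MO.getReg() == Reg)
        return true;
    return false;
  }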
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
Register getReg() const
getReg - Returns the register number.
iterator end()
Definition MapVector.h:67
iterator find(const KeyT &Key)
Definition MapVector.h:149
iterator begin()
Definition MapVector.h:65
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
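For context, a new-pass-manager machine pass commonly reports its preserved analyses as sketched below; the pass shown is hypothetical and only illustrates the PreservedAnalyses idiom, not this pass's actual body.
  #include "llvm/CodeGen/MachinePassManager.h"
  using namespace llvm;

  // Hypothetical pass used only to illustrate the preserved-analyses idiom.
  struct ExampleMachinePass : PassInfoMixin<ExampleMachinePass> {
    PreservedAnalyses run(MachineFunction &MF,
                          MachineFunctionAnalysisManager &MFAM) {
      bool Changed = false;
      // ... transform MF, setting Changed accordingly ...
      if (!Changed)
        return PreservedAnalyses::all();
      auto PA = getMachineFunctionPassPreservedAnalyses();
      PA.preserveSet<CFGAnalyses>();  // CFG was not modified in this sketch
      return PA;
    }
  };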
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs with the length computed at compile time.
Definition StringRef.h:854
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
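A tiny sketch of the DenseSet membership idiom (the element type here is hypothetical).
  #include "llvm/ADT/DenseSet.h"
  using namespace llvm;

  void denseSetSketch() {
    DenseSet<unsigned> Seen;   // hypothetical set of register IDs
    Seen.insert(5);            // returns {iterator, inserted?}
    if (Seen.contains(5))
      Seen.insert(5);          // already present; nothing is added
  }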
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values into Vmcnt, Expcnt and Lgkmcnt respectively.
MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI)
If Reg is a pseudo reg, return the correct hardware register given STI; otherwise return Reg.
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
unsigned getStorecntBitMask(const IsaVersion &Version)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
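A hedged round-trip sketch of these two helpers: encode the three legacy counters into a single immediate and decode it back. The GPU name and header paths are assumptions for illustration.
  #include "MCTargetDesc/AMDGPUBaseInfo.h"     // target-internal; path assumed
  #include "llvm/TargetParser/TargetParser.h"  // AMDGPU::getIsaVersion; path assumed
  using namespace llvm;

  void waitcntRoundTrip() {
    AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion("gfx900");  // example GPU
    // Encode vmcnt=0, expcnt=7, lgkmcnt=0 into one immediate.
    unsigned Enc = AMDGPU::encodeWaitcnt(IV, /*Vmcnt=*/0, /*Expcnt=*/7,
                                         /*Lgkmcnt=*/0);
    unsigned Vmcnt, Expcnt, Lgkmcnt;
    AMDGPU::decodeWaitcnt(IV, Enc, Vmcnt, Expcnt, Lgkmcnt);
    // Vmcnt == 0, Expcnt == 7, Lgkmcnt == 0 after the round trip.
  }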
unsigned getSamplecntBitMask(const IsaVersion &Version)
unsigned getKmcntBitMask(const IsaVersion &Version)
unsigned getVmcntBitMask(const IsaVersion &Version)
unsigned getXcntBitMask(const IsaVersion &Version)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getBvhcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned getLoadcntBitMask(const IsaVersion &Version)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned getDscntBitMask(const IsaVersion &Version)
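The *BitMask helpers above return an all-ones mask of the counter's bit width, which is also the maximum value the hardware can encode. A hedged sketch of clamping a requested count to that maximum; the helper name is hypothetical and the AMDGPU header path is assumed as in the previous sketch.
  #include "MCTargetDesc/AMDGPUBaseInfo.h"  // target-internal; path assumed
  #include <algorithm>
  using namespace llvm;

  // Sketch: clamp a requested load-counter wait to the encodable maximum.
  static unsigned clampLoadcnt(const AMDGPU::IsaVersion &IV, unsigned Wanted) {
    unsigned Max = AMDGPU::getLoadcntBitMask(IV);  // all-ones mask == max value
    return std::min(Wanted, Max);
  }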
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ Undef
Value of the register doesn't matter.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
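A generic sketch of the BuildMI + addImm idiom for creating a simple immediate-operand instruction; the helper name is hypothetical, the opcode choice is only illustrative, and the AMDGPU opcode enum is assumed to come from the target's generated headers.
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  using namespace llvm;

  // Sketch: insert "s_waitcnt 0" before the (valid) instruction at It.
  static void insertWaitBefore(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator It,
                               const TargetInstrInfo *TII) {
    BuildMI(MBB, It, It->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
        .addImm(0);  // an immediate of 0 waits for all legacy counters
  }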
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition Sequence.h:337
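enum_seq requires the enum to be marked iterable via an enum_iteration_traits specialization; a generic sketch with a hypothetical enum, not the one used in this file.
  #include "llvm/ADT/Sequence.h"

  enum Stage { Fetch = 0, Decode, Execute, NumStages };

  namespace llvm {
  template <> struct enum_iteration_traits<Stage> {
    static constexpr bool is_iterable = true;
  };
  } // namespace llvm

  int countStages() {
    int N = 0;
    // Visits Fetch, Decode, Execute; NumStages itself is excluded.
    for (Stage S : llvm::enum_seq(Fetch, NumStages)) {
      (void)S;
      ++N;
    }
    return N;  // 3
  }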
@ Wait
Definition Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition STLExtras.h:632
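A hedged sketch of the make_early_inc_range idiom: the iterator advances before the loop body runs, so the current element can be erased safely. The filter below is illustrative, not taken from this pass.
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"
  using namespace llvm;

  // Sketch: delete every debug pseudo-instruction in MBB while iterating it.
  static void stripDebugInstrs(MachineBasicBlock &MBB) {
    for (MachineInstr &MI : make_early_inc_range(MBB))
      if (MI.isDebugInstr())
        MI.eraseFromParent();  // safe: the range already advanced past MI
  }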
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
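divideCeil rounds an integer division upward; a tiny example with hypothetical values.
  #include "llvm/Support/MathExtras.h"

  // 70 threads at a wave size of 32 need ceil(70 / 32) = 3 waves.
  constexpr unsigned Waves = llvm::divideCeil(70u, 32u);
  static_assert(Waves == 3, "rounded-up division");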
@ Other
Any other memory.
Definition ModRef.h:68
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
Instruction set architecture version.
Represents the counter values to wait for in an s_waitcnt instruction.