1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39#include "llvm/IR/Dominators.h"
43
44using namespace llvm;
45
46#define DEBUG_TYPE "si-insert-waitcnts"
47
48DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
49 "Force emit s_waitcnt expcnt(0) instrs");
50DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
51 "Force emit s_waitcnt lgkmcnt(0) instrs");
52DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
53 "Force emit s_waitcnt vmcnt(0) instrs");
54
55static cl::opt<bool>
56 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
57 cl::desc("Force all waitcnt instrs to be emitted as "
58 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
59 cl::init(false), cl::Hidden);
60
62 "amdgpu-waitcnt-load-forcezero",
63 cl::desc("Force all waitcnt load counters to wait until 0"),
64 cl::init(false), cl::Hidden);
65
66namespace {
67// Class of object that encapsulates the latest instruction counter score
68// associated with an operand. Used to determine whether an
69// s_waitcnt instruction needs to be emitted.
70
71enum InstCounterType {
72 LOAD_CNT = 0, // VMcnt prior to gfx12.
73 DS_CNT, // LGKMcnt prior to gfx12.
74 EXP_CNT, //
75 STORE_CNT, // VScnt in gfx10/gfx11.
76 NUM_NORMAL_INST_CNTS,
77 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
78 BVH_CNT, // gfx12+ only.
79 KM_CNT, // gfx12+ only.
80 X_CNT, // gfx1250.
81 NUM_EXTENDED_INST_CNTS,
82 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
83};
84} // namespace
85
86namespace llvm {
87template <> struct enum_iteration_traits<InstCounterType> {
88 static constexpr bool is_iterable = true;
89};
90} // namespace llvm
91
92namespace {
93// Return an iterator over all counters between LOAD_CNT (the first counter)
94// and \c MaxCounter (exclusive, default value yields an enumeration over
95// all counters).
96auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
97 return enum_seq(LOAD_CNT, MaxCounter);
98}
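// Illustrative note (values follow directly from the enum above): with the
// default MaxCounter this visits LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT,
// SAMPLE_CNT, BVH_CNT, KM_CNT and X_CNT in that order, while
// inst_counter_types(NUM_NORMAL_INST_CNTS) stops after STORE_CNT.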
99
100/// Integer IDs used to track vector memory locations we may have to wait on.
101/// Encoded as u16 chunks:
102///
103/// [0, REGUNITS_END ): MCRegUnit
104/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs
105///
106/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
107/// It gives (1 << 16) entries per category, which is more than enough
108/// for all register units. MCPhysReg is u16 so we don't even support >u16
109/// physical register numbers at this time, let alone >u16 register units.
110/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
111/// is enough for all register units.
112using VMEMID = uint32_t;
113
114enum : VMEMID {
115 TRACKINGID_RANGE_LEN = (1 << 16),
116
117 // Important: MCRegUnits must always be tracked starting from 0, as we
118 // need to be able to convert between a MCRegUnit and a VMEMID freely.
119 REGUNITS_BEGIN = 0,
120 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
121
122 // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
123 // entry, which is updated for all LDS DMA operations encountered.
124 // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
125 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
126 LDSDMA_BEGIN = REGUNITS_END,
127 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
128};
129
130/// Convert a MCRegUnit to a VMEMID.
131static constexpr VMEMID toVMEMID(MCRegUnit RU) {
132 return static_cast<unsigned>(RU);
133}
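// Worked example (hypothetical values): register unit 5 is tracked as
// VMEMID 5, the common LDS DMA entry is LDSDMA_BEGIN (65536), and the third
// specific LDS DMA slot is tracked as LDSDMA_BEGIN + 3 (65539).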
134
135struct HardwareLimits {
136 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
137 unsigned ExpcntMax;
138 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
139 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
140 unsigned SamplecntMax; // gfx12+ only.
141 unsigned BvhcntMax; // gfx12+ only.
142 unsigned KmcntMax; // gfx12+ only.
143 unsigned XcntMax; // gfx1250.
144};
145
146#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
147 DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \
148 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
149 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
150 DECL(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */ \
151 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
152 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
153 DECL(VMEM_GROUP) /* vmem group */ \
154 DECL(LDS_ACCESS) /* lds read & write */ \
155 DECL(GDS_ACCESS) /* gds read & write */ \
156 DECL(SQ_MESSAGE) /* send message */ \
157 DECL(SCC_WRITE) /* write to SCC from barrier */ \
158 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
159 DECL(SMEM_GROUP) /* scalar-memory group */ \
160 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
161 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
162 DECL(EXP_POS_ACCESS) /* write to export position */ \
163 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
164 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
165 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */
166
167// clang-format off
168#define AMDGPU_EVENT_ENUM(Name) Name,
169enum WaitEventType {
170 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
171 NUM_WAIT_EVENTS
172};
173#undef AMDGPU_EVENT_ENUM
174
175#define AMDGPU_EVENT_NAME(Name) #Name,
176static constexpr StringLiteral WaitEventTypeName[] = {
177 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
178};
179#undef AMDGPU_EVENT_NAME
180// clang-format on
181
182// Enumerate different types of result-returning VMEM operations. Although
183// s_waitcnt orders them all with a single vmcnt counter, in the absence of
184// s_waitcnt only instructions of the same VmemType are guaranteed to write
185// their results in order -- so there is no need to insert an s_waitcnt between
186// two instructions of the same type that write the same vgpr.
187enum VmemType {
188 // BUF instructions and MIMG instructions without a sampler.
189 VMEM_NOSAMPLER,
190 // MIMG instructions with a sampler.
191 VMEM_SAMPLER,
192 // BVH instructions
193 VMEM_BVH,
194 NUM_VMEM_TYPES
195};
196
197// Maps values of InstCounterType to the instruction that waits on that
198// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
199// returns true.
200static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
201 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
202 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
203 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
204
205static bool updateVMCntOnly(const MachineInstr &Inst) {
206 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
207 SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst);
208}
209
210#ifndef NDEBUG
211static bool isNormalMode(InstCounterType MaxCounter) {
212 return MaxCounter == NUM_NORMAL_INST_CNTS;
213}
214#endif // NDEBUG
215
216VmemType getVmemType(const MachineInstr &Inst) {
217 assert(updateVMCntOnly(Inst));
218 if (!SIInstrInfo::isImage(Inst))
219 return VMEM_NOSAMPLER;
220 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
221 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
222 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
223
224 if (BaseInfo->BVH)
225 return VMEM_BVH;
226
227 // We have to make an additional check for isVSAMPLE here since some
228 // instructions don't have a sampler, but are still classified as sampler
229 // instructions for the purposes of e.g. waitcnt.
230 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
231 return VMEM_SAMPLER;
232
233 return VMEM_NOSAMPLER;
234}
235
236unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
237 switch (T) {
238 case LOAD_CNT:
239 return Wait.LoadCnt;
240 case EXP_CNT:
241 return Wait.ExpCnt;
242 case DS_CNT:
243 return Wait.DsCnt;
244 case STORE_CNT:
245 return Wait.StoreCnt;
246 case SAMPLE_CNT:
247 return Wait.SampleCnt;
248 case BVH_CNT:
249 return Wait.BvhCnt;
250 case KM_CNT:
251 return Wait.KmCnt;
252 case X_CNT:
253 return Wait.XCnt;
254 default:
255 llvm_unreachable("bad InstCounterType");
256 }
257}
258
259void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
260 unsigned &WC = getCounterRef(Wait, T);
261 WC = std::min(WC, Count);
262}
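// Example (hypothetical counts): starting from the default "no wait" value of
// ~0u, addWait(Wait, LOAD_CNT, 3) tightens Wait.LoadCnt to 3, and a later
// addWait(Wait, LOAD_CNT, 5) leaves it at 3 because std::min keeps the
// strictest requirement.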
263
264void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
265 getCounterRef(Wait, T) = ~0u;
266}
267
268unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
269 return getCounterRef(Wait, T);
270}
271
272// Mapping from event to counter according to the table masks.
273InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
274 for (auto T : inst_counter_types()) {
275 if (masks[T] & (1 << E))
276 return T;
277 }
278 llvm_unreachable("event type has no associated counter");
279}
280
281class WaitcntBrackets;
282
283// This abstracts the logic for generating and updating S_WAIT* instructions
284// away from the analysis that determines where they are needed. This was
285// done because the set of counters and instructions for waiting on them
286// underwent a major shift with gfx12, sufficiently so that having this
287// abstraction allows the main analysis logic to be simpler than it would
288// otherwise have had to become.
289class WaitcntGenerator {
290protected:
291 const GCNSubtarget *ST = nullptr;
292 const SIInstrInfo *TII = nullptr;
293 AMDGPU::IsaVersion IV;
294 InstCounterType MaxCounter;
295 bool OptNone;
296
297public:
298 WaitcntGenerator() = default;
299 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
300 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
301 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
302 OptNone(MF.getFunction().hasOptNone() ||
303 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
304
305 // Return true if the current function should be compiled with no
306 // optimization.
307 bool isOptNone() const { return OptNone; }
308
309 // Edits an existing sequence of wait count instructions according
310 // to an incoming Waitcnt value, which is itself updated to reflect
311 // any new wait count instructions which may need to be generated by
312 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
313 // were made.
314 //
315 // This editing will usually just update operands, but it may also
316 // delete instructions if the incoming Wait value indicates they are not
317 // needed. It may also remove existing instructions for which a wait
318 // is needed if it can be determined that it is better to generate new
319 // instructions later, as can happen on gfx12.
320 virtual bool
321 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
322 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
323 MachineBasicBlock::instr_iterator It) const = 0;
324
325 // Transform a soft waitcnt into a normal one.
326 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
327
328 // Generates new wait count instructions according to the value of
329 // Wait, returning true if any new instructions were created.
330 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
331 MachineBasicBlock::instr_iterator It,
332 AMDGPU::Waitcnt Wait) = 0;
333
334 // Returns an array of bit masks which can be used to map values in
335 // WaitEventType to corresponding counter values in InstCounterType.
336 virtual const unsigned *getWaitEventMask() const = 0;
337
338 // Returns a new waitcnt with all counters except VScnt set to 0. If
339 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
340 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
341
342 virtual ~WaitcntGenerator() = default;
343
344 // Create a mask value from the initializer list of wait event types.
345 static constexpr unsigned
346 eventMask(std::initializer_list<WaitEventType> Events) {
347 unsigned Mask = 0;
348 for (auto &E : Events)
349 Mask |= 1 << E;
350
351 return Mask;
352 }
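 // For example, eventMask({VMEM_ACCESS, LDS_ACCESS}) evaluates to
 // (1u << VMEM_ACCESS) | (1u << LDS_ACCESS).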
353};
354
355class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
356public:
357 using WaitcntGenerator::WaitcntGenerator;
358
359 bool
360 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
361 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
362 MachineBasicBlock::instr_iterator It) const override;
363
364 bool createNewWaitcnt(MachineBasicBlock &Block,
365 MachineBasicBlock::instr_iterator It,
366 AMDGPU::Waitcnt Wait) override;
367
368 const unsigned *getWaitEventMask() const override {
369 assert(ST);
370
371 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
372 eventMask(
373 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
374 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
375 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
376 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
377 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
378 0,
379 0,
380 0,
381 0};
382
383 return WaitEventMaskForInstPreGFX12;
384 }
385
386 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
387};
388
389class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
390public:
391 using WaitcntGenerator::WaitcntGenerator;
392
393 bool
394 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
395 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
396 MachineBasicBlock::instr_iterator It) const override;
397
398 bool createNewWaitcnt(MachineBasicBlock &Block,
399 MachineBasicBlock::instr_iterator It,
400 AMDGPU::Waitcnt Wait) override;
401
402 const unsigned *getWaitEventMask() const override {
403 assert(ST);
404
405 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
406 eventMask({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
407 eventMask({LDS_ACCESS, GDS_ACCESS}),
408 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
409 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
410 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
411 eventMask({VMEM_SAMPLER_READ_ACCESS}),
412 eventMask({VMEM_BVH_READ_ACCESS}),
413 eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
414 eventMask({VMEM_GROUP, SMEM_GROUP})};
415
416 return WaitEventMaskForInstGFX12Plus;
417 }
418
419 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
420};
421
422class SIInsertWaitcnts {
423public:
424 const GCNSubtarget *ST;
425 const SIInstrInfo *TII = nullptr;
426 const SIRegisterInfo *TRI = nullptr;
427 const MachineRegisterInfo *MRI = nullptr;
428 InstCounterType SmemAccessCounter;
429 InstCounterType MaxCounter;
430 const unsigned *WaitEventMaskForInst;
431
432private:
433 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
434 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
435 MachineLoopInfo *MLI;
436 MachinePostDominatorTree *PDT;
437 AliasAnalysis *AA = nullptr;
438
439 struct BlockInfo {
440 std::unique_ptr<WaitcntBrackets> Incoming;
441 bool Dirty = true;
442 };
443
444 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
445
446 bool ForceEmitWaitcnt[NUM_INST_CNTS];
447
448 // In any given run of this pass, WCG will point to one of these two
449 // generator objects, which must have been re-initialised before use
450 // from a value made using a subtarget constructor.
451 WaitcntGeneratorPreGFX12 WCGPreGFX12;
452 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
453
454 WaitcntGenerator *WCG = nullptr;
455
456 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
457 // message.
458 DenseSet<MachineInstr *> ReleaseVGPRInsts;
459
460 HardwareLimits Limits;
461
462public:
463 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
464 AliasAnalysis *AA)
465 : MLI(MLI), PDT(PDT), AA(AA) {
466 (void)ForceExpCounter;
467 (void)ForceLgkmCounter;
468 (void)ForceVMCounter;
469 }
470
471 unsigned getWaitCountMax(InstCounterType T) const {
472 switch (T) {
473 case LOAD_CNT:
474 return Limits.LoadcntMax;
475 case DS_CNT:
476 return Limits.DscntMax;
477 case EXP_CNT:
478 return Limits.ExpcntMax;
479 case STORE_CNT:
480 return Limits.StorecntMax;
481 case SAMPLE_CNT:
482 return Limits.SamplecntMax;
483 case BVH_CNT:
484 return Limits.BvhcntMax;
485 case KM_CNT:
486 return Limits.KmcntMax;
487 case X_CNT:
488 return Limits.XcntMax;
489 default:
490 break;
491 }
492 return 0;
493 }
494
495 bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
496 bool isPreheaderToFlush(MachineBasicBlock &MBB,
497 const WaitcntBrackets &ScoreBrackets);
498 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
499 bool run(MachineFunction &MF);
500
501 void setForceEmitWaitcnt() {
502// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
503// For debug builds, get the debug counter info and adjust if need be
504#ifndef NDEBUG
505 if (DebugCounter::isCounterSet(ForceExpCounter) &&
506 DebugCounter::shouldExecute(ForceExpCounter)) {
507 ForceEmitWaitcnt[EXP_CNT] = true;
508 } else {
509 ForceEmitWaitcnt[EXP_CNT] = false;
510 }
511
512 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
513 DebugCounter::shouldExecute(ForceLgkmCounter)) {
514 ForceEmitWaitcnt[DS_CNT] = true;
515 ForceEmitWaitcnt[KM_CNT] = true;
516 } else {
517 ForceEmitWaitcnt[DS_CNT] = false;
518 ForceEmitWaitcnt[KM_CNT] = false;
519 }
520
521 if (DebugCounter::isCounterSet(ForceVMCounter) &&
522 DebugCounter::shouldExecute(ForceVMCounter)) {
523 ForceEmitWaitcnt[LOAD_CNT] = true;
524 ForceEmitWaitcnt[SAMPLE_CNT] = true;
525 ForceEmitWaitcnt[BVH_CNT] = true;
526 } else {
527 ForceEmitWaitcnt[LOAD_CNT] = false;
528 ForceEmitWaitcnt[SAMPLE_CNT] = false;
529 ForceEmitWaitcnt[BVH_CNT] = false;
530 }
531#endif // NDEBUG
532 }
533
534 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
535 // instruction.
536 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
537 switch (Inst.getOpcode()) {
538 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
539 case AMDGPU::GLOBAL_INV:
540 return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write
541 // VGPRs
542 case AMDGPU::GLOBAL_WB:
543 case AMDGPU::GLOBAL_WBINV:
544 return VMEM_WRITE_ACCESS; // tracked using storecnt
545 default:
546 break;
547 }
548
549 // Maps VMEM access types to their corresponding WaitEventType.
550 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
551 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
552
554 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
555 // these should use VM_CNT.
556 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
557 return VMEM_ACCESS;
558 if (Inst.mayStore() &&
559 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
560 if (TII->mayAccessScratch(Inst))
561 return SCRATCH_WRITE_ACCESS;
562 return VMEM_WRITE_ACCESS;
563 }
564 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
565 return VMEM_ACCESS;
566 return VmemReadMapping[getVmemType(Inst)];
567 }
568
569 bool isVmemAccess(const MachineInstr &MI) const;
570 bool generateWaitcntInstBefore(MachineInstr &MI,
571 WaitcntBrackets &ScoreBrackets,
572 MachineInstr *OldWaitcntInstr,
573 bool FlushVmCnt);
574 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
575 MachineBasicBlock::instr_iterator It,
576 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
577 MachineInstr *OldWaitcntInstr);
578 void updateEventWaitcntAfter(MachineInstr &Inst,
579 WaitcntBrackets *ScoreBrackets);
580 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
581 MachineBasicBlock *Block) const;
582 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
583 WaitcntBrackets &ScoreBrackets);
584 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
585 WaitcntBrackets &ScoreBrackets);
586};
587
588// This object maintains the current score brackets of each wait counter, and
589// a per-register scoreboard for each wait counter.
590//
591// We also maintain the latest score for every event type that can change the
592// waitcnt, in order to know if there are multiple types of events within
593// the brackets. When multiple event types occur within a bracket, the wait
594// count may be decremented out of order, so we need to emit an
595// "s_waitcnt 0" before use.
596class WaitcntBrackets {
597public:
598 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
599 assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
600 }
601
602#ifndef NDEBUG
603 ~WaitcntBrackets() {
604 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
605 for (auto &[ID, Val] : VMem) {
606 if (Val.empty())
607 ++NumUnusedVmem;
608 }
609 for (auto &[ID, Val] : SGPRs) {
610 if (Val.empty())
611 ++NumUnusedSGPRs;
612 }
613
614 if (NumUnusedVmem || NumUnusedSGPRs) {
615 errs() << "WaitcntBracket had unused entries at destruction time: "
616 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
617 << " SGPR unused entries\n";
618 std::abort();
619 }
620 }
621#endif
622
623 bool isSmemCounter(InstCounterType T) const {
624 return T == Context->SmemAccessCounter || T == X_CNT;
625 }
626
627 unsigned getSgprScoresIdx(InstCounterType T) const {
628 assert(isSmemCounter(T) && "Invalid SMEM counter");
629 return T == X_CNT ? 1 : 0;
630 }
631
632 unsigned getScoreLB(InstCounterType T) const {
633 assert(T < NUM_INST_CNTS);
634 return ScoreLBs[T];
635 }
636
637 unsigned getScoreUB(InstCounterType T) const {
638 assert(T < NUM_INST_CNTS);
639 return ScoreUBs[T];
640 }
641
642 unsigned getScoreRange(InstCounterType T) const {
643 return getScoreUB(T) - getScoreLB(T);
644 }
645
646 unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
647 auto It = SGPRs.find(RU);
648 return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;
649 }
650
651 unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
652 auto It = VMem.find(TID);
653 return It != VMem.end() ? It->second.Scores[T] : 0;
654 }
655
656 bool merge(const WaitcntBrackets &Other);
657
658 bool counterOutOfOrder(InstCounterType T) const;
659 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait);
660 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
661 bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
662 bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
663 void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait);
664
665 void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
666 AMDGPU::Waitcnt &Wait) const;
667 void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
668 AMDGPU::Waitcnt &Wait) const;
669 void tryClearSCCWriteEvent(MachineInstr *Inst);
670
671 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
672 void applyWaitcnt(InstCounterType T, unsigned Count);
673 void updateByEvent(WaitEventType E, MachineInstr &MI);
674
675 unsigned hasPendingEvent() const { return PendingEvents; }
676 unsigned hasPendingEvent(WaitEventType E) const {
677 return PendingEvents & (1 << E);
678 }
679 unsigned hasPendingEvent(InstCounterType T) const {
680 unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
681 assert((HasPending != 0) == (getScoreRange(T) != 0));
682 return HasPending;
683 }
684
685 bool hasMixedPendingEvents(InstCounterType T) const {
686 unsigned Events = hasPendingEvent(T);
687 // Return true if more than one bit is set in Events.
688 return Events & (Events - 1);
689 }
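 // Example (hypothetical masks): Events == 0b0101 gives 0b0101 & 0b0100 != 0,
 // i.e. mixed event types; Events == 0b0100 gives 0, i.e. a single type.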
690
691 bool hasPendingFlat() const {
692 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
693 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
694 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
695 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
696 }
697
698 void setPendingFlat() {
699 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
700 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
701 }
702
703 bool hasPendingGDS() const {
704 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
705 }
706
707 unsigned getPendingGDSWait() const {
708 return std::min(getScoreUB(DS_CNT) - LastGDS,
709 Context->getWaitCountMax(DS_CNT) - 1);
710 }
711
712 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
713
714 // Return true if there might be pending writes to any unit of the given
715 // vector register by VMEM instructions with types different from V.
716 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
717 for (MCRegUnit RU : regunits(Reg)) {
718 auto It = VMem.find(toVMEMID(RU));
719 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
720 return true;
721 }
722 return false;
723 }
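 // Example (hypothetical masks): with V == VMEM_SAMPLER, ~(1 << V) is ~0b010;
 // a pending-types mask of 0b101 (VMEM_NOSAMPLER | VMEM_BVH) then yields a
 // nonzero result, so writes of other VMEM types may still be outstanding.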
724
725 void clearVgprVmemTypes(MCPhysReg Reg) {
726 for (MCRegUnit RU : regunits(Reg)) {
727 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
728 It->second.VMEMTypes = 0;
729 if (It->second.empty())
730 VMem.erase(It);
731 }
732 }
733 }
734
735 void setStateOnFunctionEntryOrReturn() {
736 setScoreUB(STORE_CNT,
737 getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
738 PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
739 }
740
741 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
742 return LDSDMAStores;
743 }
744
745 bool hasPointSampleAccel(const MachineInstr &MI) const;
746 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
747 MCPhysReg RU) const;
748
749 void print(raw_ostream &) const;
750 void dump() const { print(dbgs()); }
751
752 // Free up memory by removing empty entries from the DenseMaps that track
753 // event scores.
754 void purgeEmptyTrackingData();
755
756private:
757 struct MergeInfo {
758 unsigned OldLB;
759 unsigned OtherLB;
760 unsigned MyShift;
761 unsigned OtherShift;
762 };
763
764 void determineWaitForScore(InstCounterType T, unsigned Score,
765 AMDGPU::Waitcnt &Wait) const;
766
767 static bool mergeScore(const MergeInfo &M, unsigned &Score,
768 unsigned OtherScore);
769
770 iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
771 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
772 if (!Context->TRI->isInAllocatableClass(Reg))
773 return {{}, {}};
774 const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
775 unsigned Size = Context->TRI->getRegSizeInBits(*RC);
776 if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
777 Reg = Context->TRI->get32BitRegister(Reg);
778 return Context->TRI->regunits(Reg);
779 }
780
781 void setScoreLB(InstCounterType T, unsigned Val) {
782 assert(T < NUM_INST_CNTS);
783 ScoreLBs[T] = Val;
784 }
785
786 void setScoreUB(InstCounterType T, unsigned Val) {
787 assert(T < NUM_INST_CNTS);
788 ScoreUBs[T] = Val;
789
790 if (T != EXP_CNT)
791 return;
792
793 if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
794 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
795 }
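 // Example (hypothetical values): with an EXP_CNT limit of 7, raising the
 // upper bound to 20 while the lower bound is 10 clamps the lower bound to
 // 20 - 7 = 13, so the tracked range never exceeds what the hardware counter
 // can represent.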
796
797 void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
798 const SIRegisterInfo *TRI = Context->TRI;
799 if (Reg == AMDGPU::SCC) {
800 SCCScore = Val;
801 } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
802 for (MCRegUnit RU : regunits(Reg))
803 VMem[toVMEMID(RU)].Scores[T] = Val;
804 } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
805 auto STy = getSgprScoresIdx(T);
806 for (MCRegUnit RU : regunits(Reg))
807 SGPRs[RU].Scores[STy] = Val;
808 } else {
809 llvm_unreachable("Register cannot be tracked/unknown register!");
810 }
811 }
812
813 void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
814 VMem[TID].Scores[T] = Val;
815 }
816
817 void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
818 unsigned Val);
819
820 const SIInsertWaitcnts *Context;
821
822 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
823 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
824 unsigned PendingEvents = 0;
825 // Remember the last flat memory operation.
826 unsigned LastFlat[NUM_INST_CNTS] = {0};
827 // Remember the last GDS operation.
828 unsigned LastGDS = 0;
829
830 // The score tracking logic is fragmented as follows:
831 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
832 // - SGPRs: SGPR RegUnits
833 // - SCC: Non-allocatable and not general purpose: not a SGPR.
834 //
835 // For the VMem case, if the key is within the range of LDS DMA IDs,
836 // then the corresponding index into the `LDSDMAStores` vector below is:
837 // Key - LDSDMA_BEGIN - 1
838 // This is because LDSDMA_BEGIN is a generic entry and does not have an
839 // associated MachineInstr.
840 //
841 // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
842
843 struct VMEMInfo {
844 // Scores for all instruction counters.
845 std::array<unsigned, NUM_INST_CNTS> Scores = {0};
846 // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
847 unsigned VMEMTypes = 0;
848
849 bool empty() const {
850 return all_of(Scores, [](unsigned K) { return K == 0; }) && !VMEMTypes;
851 }
852 };
853
854 struct SGPRInfo {
855 // Wait cnt scores for every SGPR. Only DS_CNT (corresponding to LGKMcnt
856 // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250), are relevant.
857 // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps
858 // the X_CNT score.
859 std::array<unsigned, 2> Scores = {0};
860
861 bool empty() const { return !Scores[0] && !Scores[1]; }
862 };
863
864 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
865 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
866
867 // Reg score for SCC.
868 unsigned SCCScore = 0;
869 // The unique instruction that has an SCC write pending, if there is one.
870 const MachineInstr *PendingSCCWrite = nullptr;
871
872 // Store representative LDS DMA operations. The only useful info here is
873 // alias info. One store is kept per unique AAInfo.
874 SmallVector<const MachineInstr *> LDSDMAStores;
875};
876
877class SIInsertWaitcntsLegacy : public MachineFunctionPass {
878public:
879 static char ID;
880 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
881
882 bool runOnMachineFunction(MachineFunction &MF) override;
883
884 StringRef getPassName() const override {
885 return "SI insert wait instructions";
886 }
887
888 void getAnalysisUsage(AnalysisUsage &AU) const override {
889 AU.setPreservesCFG();
890 AU.addRequired<MachineLoopInfoWrapperPass>();
891 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
892 AU.addUsedIfAvailable<AAResultsWrapperPass>();
893 AU.addPreserved<AAResultsWrapperPass>();
894 MachineFunctionPass::getAnalysisUsage(AU);
895 }
896};
897
898} // end anonymous namespace
899
900void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
901 InstCounterType CntTy, unsigned Score) {
902 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
903}
904
905// Return true if the subtarget is one that enables Point Sample Acceleration
906// and the MachineInstr passed in is one to which it might be applied (the
907// hardware makes this decision based on several factors, but we can't determine
908// this at compile time, so we have to assume it might be applied if the
909// instruction supports it).
910bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
911 if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
912 return false;
913
914 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
915 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
916 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
917 return BaseInfo->PointSampleAccel;
918}
919
920// Return true if the subtarget enables Point Sample Acceleration, the supplied
921// MachineInstr is one to which it might be applied, and the supplied register
922// has outstanding writes of vmem-types different from VMEM_NOSAMPLER
923// (this is the type that a point sample accelerated instruction effectively
924// becomes).
925bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
926 MCPhysReg Reg) const {
927 if (!hasPointSampleAccel(MI))
928 return false;
929
930 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
931}
932
933void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
934 InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
935 assert(T < Context->MaxCounter);
936
937 unsigned UB = getScoreUB(T);
938 unsigned CurrScore = UB + 1;
939 if (CurrScore == 0)
940 report_fatal_error("InsertWaitcnt score wraparound");
941 // PendingEvents and ScoreUB need to be updated regardless of whether this
942 // event changes the score of a register or not.
943 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
944 PendingEvents |= 1 << E;
945 setScoreUB(T, CurrScore);
946
947 const SIRegisterInfo *TRI = Context->TRI;
948 const MachineRegisterInfo *MRI = Context->MRI;
949 const SIInstrInfo *TII = Context->TII;
950
951 if (T == EXP_CNT) {
952 // Put score on the source vgprs. If this is a store, just use those
953 // specific register(s).
954 if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
955 // All GDS operations must protect their address register (same as
956 // export.)
957 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
958 setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);
959
960 if (Inst.mayStore()) {
961 if (const auto *Data0 =
962 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
963 setScoreByOperand(*Data0, EXP_CNT, CurrScore);
964 if (const auto *Data1 =
965 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
966 setScoreByOperand(*Data1, EXP_CNT, CurrScore);
967 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
968 Inst.getOpcode() != AMDGPU::DS_APPEND &&
969 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
970 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
971 for (const MachineOperand &Op : Inst.all_uses()) {
972 if (TRI->isVectorRegister(*MRI, Op.getReg()))
973 setScoreByOperand(Op, EXP_CNT, CurrScore);
974 }
975 }
976 } else if (TII->isFLAT(Inst)) {
977 if (Inst.mayStore()) {
978 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
979 EXP_CNT, CurrScore);
980 } else if (SIInstrInfo::isAtomicRet(Inst)) {
981 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
982 EXP_CNT, CurrScore);
983 }
984 } else if (TII->isMIMG(Inst)) {
985 if (Inst.mayStore()) {
986 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
987 } else if (SIInstrInfo::isAtomicRet(Inst)) {
988 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
989 EXP_CNT, CurrScore);
990 }
991 } else if (TII->isMTBUF(Inst)) {
992 if (Inst.mayStore())
993 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
994 } else if (TII->isMUBUF(Inst)) {
995 if (Inst.mayStore()) {
996 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
997 } else if (SIInstrInfo::isAtomicRet(Inst)) {
998 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
999 EXP_CNT, CurrScore);
1000 }
1001 } else if (TII->isLDSDIR(Inst)) {
1002 // LDSDIR instructions attach the score to the destination.
1003 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
1004 EXP_CNT, CurrScore);
1005 } else {
1006 if (TII->isEXP(Inst)) {
1007 // For export the destination registers are really temps that
1008 // can be used as the actual source after export patching, so
1009 // we need to treat them like sources and set the EXP_CNT
1010 // score.
1011 for (MachineOperand &DefMO : Inst.all_defs()) {
1012 if (TRI->isVGPR(*MRI, DefMO.getReg())) {
1013 setScoreByOperand(DefMO, EXP_CNT, CurrScore);
1014 }
1015 }
1016 }
1017 for (const MachineOperand &Op : Inst.all_uses()) {
1018 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1019 setScoreByOperand(Op, EXP_CNT, CurrScore);
1020 }
1021 }
1022 } else if (T == X_CNT) {
1023 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1024 if (PendingEvents & (1 << OtherEvent)) {
1025 // Hardware inserts an implicit xcnt between interleaved
1026 // SMEM and VMEM operations. So there will never be
1027 // outstanding address translations for both SMEM and
1028 // VMEM at the same time.
1029 setScoreLB(T, getScoreUB(T) - 1);
1030 PendingEvents &= ~(1 << OtherEvent);
1031 }
1032 for (const MachineOperand &Op : Inst.all_uses())
1033 setScoreByOperand(Op, T, CurrScore);
1034 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1035 // Match the score to the destination registers.
1036 //
1037 // Check only explicit operands. Stores, especially spill stores, include
1038 // implicit uses and defs of their super registers which would create an
1039 // artificial dependency, while these are there only for register liveness
1040 // accounting purposes.
1041 //
1042 // There are special cases where implicit register defs exist, such as M0
1043 // or VCC, but none of them occur with memory instructions.
1044 for (const MachineOperand &Op : Inst.defs()) {
1045 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1046 if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper
1047 continue;
1048 if (updateVMCntOnly(Inst)) {
1049 // updateVMCntOnly should only leave us with VGPRs.
1050 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1051 // defs. That's required for a sane update of the `VMem` types below.
1052 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
1053 VmemType V = getVmemType(Inst);
1054 unsigned char TypesMask = 1 << V;
1055 // If instruction can have Point Sample Accel applied, we have to flag
1056 // this with another potential dependency
1057 if (hasPointSampleAccel(Inst))
1058 TypesMask |= 1 << VMEM_NOSAMPLER;
1059 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1060 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1061 }
1062 }
1063 setScoreByOperand(Op, T, CurrScore);
1064 }
1065 if (Inst.mayStore() &&
1066 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
1067 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS
1068 // they wrote can be accessed. A load from LDS to VMEM does not need a wait.
1069 //
1070 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1071 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1072 // store. The "Slot" is the index into LDSDMAStores + 1.
1073 unsigned Slot = 0;
1074 for (const auto *MemOp : Inst.memoperands()) {
1075 if (!MemOp->isStore() ||
1076 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1077 continue;
1078 // Comparing just AA info does not guarantee memoperands are equal
1079 // in general, but this is so for LDS DMA in practice.
1080 auto AAI = MemOp->getAAInfo();
1081 // Alias scope information gives a way to definitely identify an
1082 // original memory object, and in practice it is produced by the module
1083 // LDS lowering pass. If there is no scope available we will not be able
1084 // to disambiguate LDS aliasing, as after the module lowering all LDS
1085 // is squashed into a single big object.
1086 if (!AAI || !AAI.Scope)
1087 break;
1088 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1089 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1090 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1091 Slot = I + 1;
1092 break;
1093 }
1094 }
1095 }
1096 if (Slot)
1097 break;
1098 // The slot may not be valid because it can be >= NUM_LDSDMA which
1099 // means the scoreboard cannot track it. We still want to preserve the
1100 // MI in order to check alias information, though.
1101 LDSDMAStores.push_back(&Inst);
1102 Slot = LDSDMAStores.size();
1103 break;
1104 }
1105 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1106 if (Slot && Slot < NUM_LDSDMA)
1107 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1108 }
1109
1111 setRegScore(AMDGPU::SCC, T, CurrScore);
1112 PendingSCCWrite = &Inst;
1113 }
1114 }
1115}
1116
1117void WaitcntBrackets::print(raw_ostream &OS) const {
1118 const GCNSubtarget *ST = Context->ST;
1119
1120 OS << '\n';
1121 for (auto T : inst_counter_types(Context->MaxCounter)) {
1122 unsigned SR = getScoreRange(T);
1123
1124 switch (T) {
1125 case LOAD_CNT:
1126 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1127 << SR << "):";
1128 break;
1129 case DS_CNT:
1130 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1131 << SR << "):";
1132 break;
1133 case EXP_CNT:
1134 OS << " EXP_CNT(" << SR << "):";
1135 break;
1136 case STORE_CNT:
1137 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1138 << SR << "):";
1139 break;
1140 case SAMPLE_CNT:
1141 OS << " SAMPLE_CNT(" << SR << "):";
1142 break;
1143 case BVH_CNT:
1144 OS << " BVH_CNT(" << SR << "):";
1145 break;
1146 case KM_CNT:
1147 OS << " KM_CNT(" << SR << "):";
1148 break;
1149 case X_CNT:
1150 OS << " X_CNT(" << SR << "):";
1151 break;
1152 default:
1153 OS << " UNKNOWN(" << SR << "):";
1154 break;
1155 }
1156
1157 if (SR != 0) {
1158 // Print vgpr scores.
1159 unsigned LB = getScoreLB(T);
1160
1161 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1162 sort(SortedVMEMIDs);
1163
1164 for (auto ID : SortedVMEMIDs) {
1165 unsigned RegScore = VMem.at(ID).Scores[T];
1166 if (RegScore <= LB)
1167 continue;
1168 unsigned RelScore = RegScore - LB - 1;
1169 if (ID < REGUNITS_END) {
1170 OS << ' ' << RelScore << ":vRU" << ID;
1171 } else {
1172 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1173 "Unhandled/unexpected ID value!");
1174 OS << ' ' << RelScore << ":LDSDMA" << ID;
1175 }
1176 }
1177
1178 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1179 if (isSmemCounter(T)) {
1180 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1181 sort(SortedSMEMIDs);
1182 for (auto ID : SortedSMEMIDs) {
1183 unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)];
1184 if (RegScore <= LB)
1185 continue;
1186 unsigned RelScore = RegScore - LB - 1;
1187 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1188 }
1189 }
1190
1191 if (T == KM_CNT && SCCScore > 0)
1192 OS << ' ' << SCCScore << ":scc";
1193 }
1194 OS << '\n';
1195 }
1196
1197 OS << "Pending Events: ";
1198 if (hasPendingEvent()) {
1199 ListSeparator LS;
1200 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1201 if (hasPendingEvent((WaitEventType)I)) {
1202 OS << LS << WaitEventTypeName[I];
1203 }
1204 }
1205 } else {
1206 OS << "none";
1207 }
1208 OS << '\n';
1209
1210 OS << '\n';
1211}
1212
1213/// Simplify the waitcnt, in the sense of removing redundant counts. Counts
1214/// found to be redundant are reset to ~0u, i.e. "no wait needed".
1215void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
1216 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1217 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1218 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1219 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1220 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1221 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1222 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1223 simplifyXcnt(Wait, Wait);
1224}
1225
1226void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1227 unsigned &Count) const {
1228 // The number of outstanding events for this type, T, can be calculated
1229 // as (UB - LB). If the current Count is greater than or equal to the number
1230 // of outstanding events, then the wait for this counter is redundant.
1231 if (Count >= getScoreRange(T))
1232 Count = ~0u;
1233}
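// Example (hypothetical values): with LB == 4 and UB == 7 there are three
// outstanding events, so any requested count of 3 or more cannot filter out
// anything and is dropped by resetting it to ~0u ("no wait").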
1234
1235void WaitcntBrackets::purgeEmptyTrackingData() {
1236 for (auto &[K, V] : make_early_inc_range(VMem)) {
1237 if (V.empty())
1238 VMem.erase(K);
1239 }
1240 for (auto &[K, V] : make_early_inc_range(SGPRs)) {
1241 if (V.empty())
1242 SGPRs.erase(K);
1243 }
1244}
1245
1246void WaitcntBrackets::determineWaitForScore(InstCounterType T,
1247 unsigned ScoreToWait,
1248 AMDGPU::Waitcnt &Wait) const {
1249 const unsigned LB = getScoreLB(T);
1250 const unsigned UB = getScoreUB(T);
1251
1252 // If the score falls within the bracket, we need a waitcnt.
1253 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1254 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1255 !Context->ST->hasFlatLgkmVMemCountInOrder()) {
1256 // If there is a pending FLAT operation, and this is a VMem or LGKM
1257 // waitcnt and the target can report early completion, then we need
1258 // to force a waitcnt 0.
1259 addWait(Wait, T, 0);
1260 } else if (counterOutOfOrder(T)) {
1261 // The counter can get decremented out-of-order when there
1262 // are multiple event types in the bracket, so emit an s_wait for this
1263 // counter with a conservative value of 0.
1264 addWait(Wait, T, 0);
1265 } else {
1266 // If a counter has been maxed out avoid overflow by waiting for
1267 // MAX(CounterType) - 1 instead.
1268 unsigned NeededWait =
1269 std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
1270 addWait(Wait, T, NeededWait);
1271 }
1272 }
1273}
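// Worked example (hypothetical values, in-order counter, no pending FLAT):
// with LB == 2, UB == 10 and ScoreToWait == 8, the event of interest is the
// second most recent one, so waiting for the counter to drop to
// UB - ScoreToWait == 2 (clamped to getWaitCountMax(T) - 1) is sufficient.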
1274
1275void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
1276 AMDGPU::Waitcnt &Wait) const {
1277 if (Reg == AMDGPU::SCC) {
1278 determineWaitForScore(T, SCCScore, Wait);
1279 } else {
1280 bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg);
1281 for (MCRegUnit RU : regunits(Reg))
1282 determineWaitForScore(
1283 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1284 Wait);
1285 }
1286}
1287
1288void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
1289 AMDGPU::Waitcnt &Wait) const {
1290 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1291 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1292}
1293
1294void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1295 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1296 // SCC has landed
1297 if (PendingSCCWrite &&
1298 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1299 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1300 unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
1301 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1302 if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
1303 SCC_WRITE_PendingEvent) {
1304 setScoreLB(KM_CNT, getScoreUB(KM_CNT));
1305 }
1306
1307 PendingEvents &= ~SCC_WRITE_PendingEvent;
1308 PendingSCCWrite = nullptr;
1309 }
1310}
1311
1312void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1313 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1314 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1315 applyWaitcnt(DS_CNT, Wait.DsCnt);
1316 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1317 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1318 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1319 applyWaitcnt(KM_CNT, Wait.KmCnt);
1320 applyWaitcnt(X_CNT, Wait.XCnt);
1321}
1322
1323void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1324 const unsigned UB = getScoreUB(T);
1325 if (Count >= UB)
1326 return;
1327 if (Count != 0) {
1328 if (counterOutOfOrder(T))
1329 return;
1330 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1331 } else {
1332 setScoreLB(T, UB);
1333 PendingEvents &= ~Context->WaitEventMaskForInst[T];
1334 }
1335}
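// Example (hypothetical values): applyWaitcnt(LOAD_CNT, 2) with UB == 10
// raises the lower bound to 8, recording that at most two LOAD_CNT events can
// still be outstanding; a count of 0 instead closes the bracket entirely and
// clears the pending events for that counter.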
1336
1337bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
1338 // A wait on XCNT is redundant if we are already waiting for the SMEM load
1339 // to complete. SMEM can return out of order, so only omit the XCNT wait if
1340 // we are waiting until the count reaches zero.
1341 return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
1342}
1343
1344bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
1345 // If we have a pending store we cannot optimize XCnt because we do not wait
1346 // for stores. VMEM loads return in order, so if we only have loads, XCnt is
1347 // decremented to the same number as LOADCnt.
1348 return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1349 !hasPendingEvent(STORE_CNT);
1350}
1351
1352void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait,
1353 AMDGPU::Waitcnt &UpdateWait) {
1354 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1355 // optimizations. On entry to a block with multiple predecessors, there may
1356 // be pending SMEM and VMEM events active at the same time.
1357 // In such cases, only clear one active event at a time.
1358 // TODO: Revisit xcnt optimizations for gfx1250.
1359 if (hasRedundantXCntWithKmCnt(CheckWait)) {
1360 if (!hasMixedPendingEvents(X_CNT)) {
1361 applyWaitcnt(X_CNT, 0);
1362 } else {
1363 PendingEvents &= ~(1 << SMEM_GROUP);
1364 }
1365 } else if (canOptimizeXCntWithLoadCnt(CheckWait)) {
1366 if (!hasMixedPendingEvents(X_CNT)) {
1367 applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt));
1368 } else if (CheckWait.LoadCnt == 0) {
1369 PendingEvents &= ~(1 << VMEM_GROUP);
1370 }
1371 }
1372 simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
1373}
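// Illustrative scenario (hypothetical state): if the incoming wait already has
// KmCnt == 0 and only SMEM_GROUP events are pending on X_CNT, waiting for the
// SMEM access also covers its address translation, so the X_CNT bracket is
// cleared and the subsequent simplifyWaitcnt call drops UpdateWait.XCnt.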
1374
1375// Where there are multiple types of event in the bracket of a counter,
1376// the decrement may go out of order.
1377bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1378 // Scalar memory reads can always complete out of order.
1379 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1380 (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1381 return true;
1382
1383 // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
1384 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
1385 // out-of-order completion.
1386 if (T == LOAD_CNT) {
1387 unsigned Events = hasPendingEvent(T);
1388 // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
1389 // events
1390 Events &= ~(1 << GLOBAL_INV_ACCESS);
1391 // Return true only if there are still multiple event types after removing
1392 // GLOBAL_INV
1393 return Events & (Events - 1);
1394 }
1395
1396 return hasMixedPendingEvents(T);
1397}
1398
1399INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1400 false, false)
1401INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
1402INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1403INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1404 false, false)
1405
1406char SIInsertWaitcntsLegacy::ID = 0;
1407
1408char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1409
1410FunctionPass *llvm::createSIInsertWaitcntsPass() {
1411 return new SIInsertWaitcntsLegacy();
1412}
1413
1414static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1415 unsigned NewEnc) {
1416 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1417 assert(OpIdx >= 0);
1418
1419 MachineOperand &MO = MI.getOperand(OpIdx);
1420
1421 if (NewEnc == MO.getImm())
1422 return false;
1423
1424 MO.setImm(NewEnc);
1425 return true;
1426}
1427
1428/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1429/// and if so, which counter it is waiting on.
1430static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1431 switch (Opcode) {
1432 case AMDGPU::S_WAIT_LOADCNT:
1433 return LOAD_CNT;
1434 case AMDGPU::S_WAIT_EXPCNT:
1435 return EXP_CNT;
1436 case AMDGPU::S_WAIT_STORECNT:
1437 return STORE_CNT;
1438 case AMDGPU::S_WAIT_SAMPLECNT:
1439 return SAMPLE_CNT;
1440 case AMDGPU::S_WAIT_BVHCNT:
1441 return BVH_CNT;
1442 case AMDGPU::S_WAIT_DSCNT:
1443 return DS_CNT;
1444 case AMDGPU::S_WAIT_KMCNT:
1445 return KM_CNT;
1446 case AMDGPU::S_WAIT_XCNT:
1447 return X_CNT;
1448 default:
1449 return {};
1450 }
1451}
1452
1453bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1454 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1455 if (Opcode == Waitcnt->getOpcode())
1456 return false;
1457
1458 Waitcnt->setDesc(TII->get(Opcode));
1459 return true;
1460}
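// For example, an S_WAITCNT_soft inserted by an earlier pass (such as the
// memory legalizer) is rewritten here to a plain S_WAITCNT once it is known
// that the wait must be kept.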
1461
1462/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1463/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1464/// from \p Wait that were added by previous passes. Currently this pass
1465/// conservatively assumes that these preexisting waits are required for
1466/// correctness.
1467bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1468 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1469 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1470 assert(ST);
1471 assert(isNormalMode(MaxCounter));
1472
1473 bool Modified = false;
1474 MachineInstr *WaitcntInstr = nullptr;
1475 MachineInstr *WaitcntVsCntInstr = nullptr;
1476
1477 LLVM_DEBUG({
1478 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1479 if (It == OldWaitcntInstr.getParent()->instr_end())
1480 dbgs() << "end of block\n";
1481 else
1482 dbgs() << *It;
1483 });
1484
1485 for (auto &II :
1486 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1487 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1488 if (II.isMetaInstruction()) {
1489 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1490 continue;
1491 }
1492
1493 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1494 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1495
1496 // Update required wait count. If this is a soft waitcnt (= it was added
1497 // by an earlier pass), it may be entirely removed.
1498 if (Opcode == AMDGPU::S_WAITCNT) {
1499 unsigned IEnc = II.getOperand(0).getImm();
1500 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1501 if (TrySimplify)
1502 ScoreBrackets.simplifyWaitcnt(OldWait);
1503 Wait = Wait.combined(OldWait);
1504
1505 // Merge consecutive waitcnt of the same type by erasing multiples.
1506 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1507 II.eraseFromParent();
1508 Modified = true;
1509 } else
1510 WaitcntInstr = &II;
1511 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1512 assert(ST->hasVMemToLDSLoad());
1513 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1514 << "Before: " << Wait << '\n';);
1515 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
1516 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1517
1518 // It is possible (but unlikely) that this is the only wait instruction,
1519 // in which case, we exit this loop without a WaitcntInstr to consume
1520 // `Wait`. But that works because `Wait` was passed in by reference, and
1521 // the callee eventually calls createNewWaitcnt on it. We test this
1522 // possibility in an artificial MIR test since such a situation cannot be
1523 // recreated by running the memory legalizer.
1524 II.eraseFromParent();
1525 } else {
1526 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1527 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1528
1529 unsigned OldVSCnt =
1530 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1531 if (TrySimplify)
1532 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1533 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1534
1535 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1536 II.eraseFromParent();
1537 Modified = true;
1538 } else
1539 WaitcntVsCntInstr = &II;
1540 }
1541 }
1542
1543 if (WaitcntInstr) {
1544 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1545 AMDGPU::encodeWaitcnt(IV, Wait));
1546 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1547
1548 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1549 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1550 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1551 Wait.LoadCnt = ~0u;
1552 Wait.ExpCnt = ~0u;
1553 Wait.DsCnt = ~0u;
1554
1555 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1556 ? dbgs()
1557 << "applied pre-existing waitcnt\n"
1558 << "New Instr at block end: " << *WaitcntInstr << '\n'
1559 : dbgs() << "applied pre-existing waitcnt\n"
1560 << "Old Instr: " << *It
1561 << "New Instr: " << *WaitcntInstr << '\n');
1562 }
1563
1564 if (WaitcntVsCntInstr) {
1565 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1566 AMDGPU::OpName::simm16, Wait.StoreCnt);
1567 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1568
1569 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1570 Wait.StoreCnt = ~0u;
1571
1572 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1573 ? dbgs() << "applied pre-existing waitcnt\n"
1574 << "New Instr at block end: " << *WaitcntVsCntInstr
1575 << '\n'
1576 : dbgs() << "applied pre-existing waitcnt\n"
1577 << "Old Instr: " << *It
1578 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1579 }
1580
1581 return Modified;
1582}
1583
1584/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1585/// required counters in \p Wait
1586bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1587 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1588 AMDGPU::Waitcnt Wait) {
1589 assert(ST);
1590 assert(isNormalMode(MaxCounter));
1591
1592 bool Modified = false;
1593 const DebugLoc &DL = Block.findDebugLoc(It);
1594
1595 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1596 // single instruction while VScnt has its own instruction.
1597 if (Wait.hasWaitExceptStoreCnt()) {
1598 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1599 [[maybe_unused]] auto SWaitInst =
1600 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1601 Modified = true;
1602
1603 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1604 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1605 dbgs() << "New Instr: " << *SWaitInst << '\n');
1606 }
1607
1608 if (Wait.hasWaitStoreCnt()) {
1609 assert(ST->hasVscnt());
1610
1611 [[maybe_unused]] auto SWaitInst =
1612 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1613 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1614 .addImm(Wait.StoreCnt);
1615 Modified = true;
1616
1617 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1618 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1619 dbgs() << "New Instr: " << *SWaitInst << '\n');
1620 }
1621
1622 return Modified;
1623}
1624
1625AMDGPU::Waitcnt
1626WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1627 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1628}
1629
1630AMDGPU::Waitcnt
1631WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1632 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1633 ~0u /* XCNT */);
1634}
1635
1636/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1637/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1638/// were added by previous passes. Currently this pass conservatively
1639/// assumes that these preexisting waits are required for correctness.
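/// Illustrative example (not taken from the source): if an earlier pass left
///   s_wait_loadcnt 0x0
///   s_wait_dscnt 0x0
/// back to back and \p Wait needs both counters, the two single-counter waits
/// are erased here so that createNewWaitcnt() can emit one combined
/// s_wait_loadcnt_dscnt instead; any wait the score brackets prove redundant
/// is simply removed.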
1640bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1641 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1642 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1643 assert(ST);
1644 assert(!isNormalMode(MaxCounter));
1645
1646 bool Modified = false;
1647 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1648 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1649 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1650
1651 LLVM_DEBUG({
1652 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1653 if (It == OldWaitcntInstr.getParent()->instr_end())
1654 dbgs() << "end of block\n";
1655 else
1656 dbgs() << *It;
1657 });
1658
1659 for (auto &II :
1660 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1661 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1662 if (II.isMetaInstruction()) {
1663 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1664 continue;
1665 }
1666
1667 MachineInstr **UpdatableInstr;
1668
1669 // Update required wait count. If this is a soft waitcnt (= it was added
1670 // by an earlier pass), it may be entirely removed.
1671
1672 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1673 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1674
1675 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1676 // attempt to do more than that either.
1677 if (Opcode == AMDGPU::S_WAITCNT)
1678 continue;
1679
1680 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1681 unsigned OldEnc =
1682 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1683 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1684 if (TrySimplify)
1685 ScoreBrackets.simplifyWaitcnt(OldWait);
1686 Wait = Wait.combined(OldWait);
1687 UpdatableInstr = &CombinedLoadDsCntInstr;
1688 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1689 unsigned OldEnc =
1690 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1691 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1692 if (TrySimplify)
1693 ScoreBrackets.simplifyWaitcnt(OldWait);
1694 Wait = Wait.combined(OldWait);
1695 UpdatableInstr = &CombinedStoreDsCntInstr;
1696 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1697 // Architectures higher than GFX10 do not have direct loads to
1698 // LDS, so no work required here yet.
1699 II.eraseFromParent();
1700 continue;
1701 } else {
1702 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1703 assert(CT.has_value());
1704 unsigned OldCnt =
1705 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1706 if (TrySimplify)
1707 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1708 addWait(Wait, CT.value(), OldCnt);
1709 UpdatableInstr = &WaitInstrs[CT.value()];
1710 }
1711
1712 // Merge consecutive waitcnt of the same type by erasing multiples.
1713 if (!*UpdatableInstr) {
1714 *UpdatableInstr = &II;
1715 } else {
1716 II.eraseFromParent();
1717 Modified = true;
1718 }
1719 }
1720
1721 // Save the pre-combine waitcnt so that xcnt checks can be made against it.
1722 AMDGPU::Waitcnt PreCombine = Wait;
1723 if (CombinedLoadDsCntInstr) {
1724 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1725 // to be waited for. Otherwise, let the instruction be deleted so
1726 // the appropriate single counter wait instruction can be inserted
1727 // instead, when new S_WAIT_*CNT instructions are inserted by
1728 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1729 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1730 // the loop below that deals with single counter instructions.
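// For example (illustrative): if only Wait.DsCnt == 0 is still required, the
// combined instruction is erased here and createNewWaitcnt() later emits a
// single S_WAIT_DSCNT 0 in its place.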
1731 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1732 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1733 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1734 AMDGPU::OpName::simm16, NewEnc);
1735 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1736 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1737 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1738 Wait.LoadCnt = ~0u;
1739 Wait.DsCnt = ~0u;
1740
1741 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1742 ? dbgs() << "applied pre-existing waitcnt\n"
1743 << "New Instr at block end: "
1744 << *CombinedLoadDsCntInstr << '\n'
1745 : dbgs() << "applied pre-existing waitcnt\n"
1746 << "Old Instr: " << *It << "New Instr: "
1747 << *CombinedLoadDsCntInstr << '\n');
1748 } else {
1749 CombinedLoadDsCntInstr->eraseFromParent();
1750 Modified = true;
1751 }
1752 }
1753
1754 if (CombinedStoreDsCntInstr) {
1755 // Similarly for S_WAIT_STORECNT_DSCNT.
1756 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1757 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1758 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1759 AMDGPU::OpName::simm16, NewEnc);
1760 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1761 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1762 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1763 Wait.StoreCnt = ~0u;
1764 Wait.DsCnt = ~0u;
1765
1766 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1767 ? dbgs() << "applied pre-existing waitcnt\n"
1768 << "New Instr at block end: "
1769 << *CombinedStoreDsCntInstr << '\n'
1770 : dbgs() << "applied pre-existing waitcnt\n"
1771 << "Old Instr: " << *It << "New Instr: "
1772 << *CombinedStoreDsCntInstr << '\n');
1773 } else {
1774 CombinedStoreDsCntInstr->eraseFromParent();
1775 Modified = true;
1776 }
1777 }
1778
1779 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1780 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1781 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1782 // instructions so that createNewWaitcnt() will create new combined
1783 // instructions to replace them.
1784
1785 if (Wait.DsCnt != ~0u) {
1786 // This is a vector of addresses in WaitInstrs pointing to instructions
1787 // that should be removed if they are present.
1788 SmallVector<MachineInstr **, 2> WaitsToErase;
1789
1790 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1791 // both) need to be waited for, ensure that there are no existing
1792 // individual wait count instructions for these.
1793
1794 if (Wait.LoadCnt != ~0u) {
1795 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1796 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1797 } else if (Wait.StoreCnt != ~0u) {
1798 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1799 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1800 }
1801
1802 for (MachineInstr **WI : WaitsToErase) {
1803 if (!*WI)
1804 continue;
1805
1806 (*WI)->eraseFromParent();
1807 *WI = nullptr;
1808 Modified = true;
1809 }
1810 }
1811
1812 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1813 if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
1814 (CT == LOAD_CNT &&
1815 ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
1816 // Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
1817 // due to taking the backedge of a block.
1818 ScoreBrackets.simplifyXcnt(PreCombine, Wait);
1819 }
1820 if (!WaitInstrs[CT])
1821 continue;
1822
1823 unsigned NewCnt = getWait(Wait, CT);
1824 if (NewCnt != ~0u) {
1825 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1826 AMDGPU::OpName::simm16, NewCnt);
1827 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1828
1829 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1830 setNoWait(Wait, CT);
1831
1832 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1833 ? dbgs() << "applied pre-existing waitcnt\n"
1834 << "New Instr at block end: " << *WaitInstrs[CT]
1835 << '\n'
1836 : dbgs() << "applied pre-existing waitcnt\n"
1837 << "Old Instr: " << *It
1838 << "New Instr: " << *WaitInstrs[CT] << '\n');
1839 } else {
1840 WaitInstrs[CT]->eraseFromParent();
1841 Modified = true;
1842 }
1843 }
1844
1845 return Modified;
1846}
1847
1848/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1849bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1850 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1851 AMDGPU::Waitcnt Wait) {
1852 assert(ST);
1853 assert(!isNormalMode(MaxCounter));
1854
1855 bool Modified = false;
1856 const DebugLoc &DL = Block.findDebugLoc(It);
1857
1858 // Check for opportunities to use combined wait instructions.
1859 if (Wait.DsCnt != ~0u) {
1860 MachineInstr *SWaitInst = nullptr;
1861
1862 if (Wait.LoadCnt != ~0u) {
1863 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1864
1865 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1866 .addImm(Enc);
1867
1868 Wait.LoadCnt = ~0u;
1869 Wait.DsCnt = ~0u;
1870 } else if (Wait.StoreCnt != ~0u) {
1871 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1872
1873 SWaitInst =
1874 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1875 .addImm(Enc);
1876
1877 Wait.StoreCnt = ~0u;
1878 Wait.DsCnt = ~0u;
1879 }
1880
1881 if (SWaitInst) {
1882 Modified = true;
1883
1884 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
1885 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1886 dbgs() << "New Instr: " << *SWaitInst << '\n');
1887 }
1888 }
1889
1890 // Generate an instruction for any remaining counter that needs
1891 // waiting for.
1892
1893 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1894 unsigned Count = getWait(Wait, CT);
1895 if (Count == ~0u)
1896 continue;
1897
1898 [[maybe_unused]] auto SWaitInst =
1899 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1900 .addImm(Count);
1901
1902 Modified = true;
1903
1904 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
1905 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1906 dbgs() << "New Instr: " << *SWaitInst << '\n');
1907 }
1908
1909 return Modified;
1910}
1911
1912/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1913static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1914 // Currently all conventions wait, but this may not always be the case.
1915 //
1916 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1917 // sense to omit the wait and do it in the caller.
1918 return true;
1919}
1920
1921/// \returns true if the callee is expected to wait for any outstanding waits
1922/// before returning.
1923static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
1924
1925/// Generate an s_waitcnt instruction to be placed before cur_Inst.
1926/// Instructions of a given type are returned in order,
1927/// but instructions of different types can complete out of order.
1928/// We rely on this in-order completion
1929/// and simply assign a score to the memory access instructions.
1930/// We keep track of the active "score bracket" to determine
1931/// whether a memory access requires an s_waitcnt,
1932/// and if so, what the value of each counter is.
1933/// The "score bracket" is bounded by the lower bound and upper bound
1934/// scores (*_score_LB and *_score_ub respectively).
1935/// If FlushVmCnt is true, we also want to generate an s_waitcnt to
1936/// flush the vmcnt counter here.
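/// Illustrative example (not taken from the source): if three VMEM loads are
/// outstanding and the register MI reads was written by the oldest of them,
/// the LOAD_CNT bracket yields vmcnt(2): it is sufficient to wait until at
/// most two younger loads remain in flight.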
1937bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1938 WaitcntBrackets &ScoreBrackets,
1939 MachineInstr *OldWaitcntInstr,
1940 bool FlushVmCnt) {
1941 setForceEmitWaitcnt();
1942
1943 assert(!MI.isMetaInstruction());
1944
1945 AMDGPU::Waitcnt Wait;
1946 const unsigned Opc = MI.getOpcode();
1947
1948 // FIXME: This should have already been handled by the memory legalizer.
1949 // Removing this currently doesn't affect any lit tests, but we need to
1950 // verify that nothing was relying on this. The number of buffer invalidates
1951 // being handled here should not be expanded.
1952 if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
1953 Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
1954 Opc == AMDGPU::BUFFER_GL1_INV) {
1955 Wait.LoadCnt = 0;
1956 }
1957
1958 // All waits must be resolved at call return.
1959 // NOTE: this could be improved with knowledge of all call sites or
1960 // with knowledge of the called routines.
1961 if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
1962 Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
1963 Opc == AMDGPU::S_SETPC_B64_return ||
1964 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1965 AMDGPU::Waitcnt AllZeroWait =
1966 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
1967 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
1968 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
1969 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
1970 // no need to wait for it at function boundaries.
1971 if (ST->hasExtendedWaitCounts() &&
1972 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
1973 AllZeroWait.LoadCnt = ~0u;
1974 Wait = Wait.combined(AllZeroWait);
1975 }
1976 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
1977 // Technically the hardware will do this on its own if we don't, but that
1978 // might cost extra cycles compared to doing it explicitly.
1979 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
1980 // have to wait for outstanding VMEM stores. In this case it can be useful to
1981 // send a message to explicitly release all VGPRs before the stores have
1982 // completed, but it is only safe to do this if there are no outstanding
1983 // scratch stores.
1984 else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
1985 if (!WCG->isOptNone() &&
1986 (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
1987 (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
1988 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1989 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
1990 ReleaseVGPRInsts.insert(&MI);
1991 }
1992 // Resolve vm waits before gs-done.
1993 else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
1994 ST->hasLegacyGeometry() &&
1995 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1996 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1997 Wait.LoadCnt = 0;
1998 }
1999
2000 // Export & GDS instructions do not read the EXEC mask until after the export
2001 // is granted (which can occur well after the instruction is issued).
2002 // The shader program must flush all EXP operations on the export-count
2003 // before overwriting the EXEC mask.
2004 else {
2005 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
2006 // Export and GDS are tracked individually, either may trigger a waitcnt
2007 // for EXEC.
2008 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2009 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2010 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2011 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2012 Wait.ExpCnt = 0;
2013 }
2014 }
2015
2016 // Wait for any pending GDS instruction to complete before any
2017 // "Always GDS" instruction.
2018 if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2019 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
2020
2021 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
2022 // The function is going to insert a wait on everything in its prolog.
2023 // We still need to be careful if the call target is a load (e.g. a GOT
2024 // load). We also need to check WAW dependency with saved PC.
2025 Wait = AMDGPU::Waitcnt();
2026
2027 const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2028 if (CallAddrOp.isReg()) {
2029 ScoreBrackets.determineWaitForPhysReg(
2030 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);
2031
2032 if (const auto *RtnAddrOp =
2033 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
2034 ScoreBrackets.determineWaitForPhysReg(
2035 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
2036 }
2037 }
2038 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2039 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2040 } else {
2041 // FIXME: Should not be relying on memoperands.
2042 // Look at the source operands of every instruction to see if
2043 // any of them results from a previous memory operation that affects
2044 // its current usage. If so, an s_waitcnt instruction needs to be
2045 // emitted.
2046 // If the source operand was defined by a load, add the s_waitcnt
2047 // instruction.
2048 //
2049 // Two cases are handled for destination operands:
2050 // 1) If the destination operand was defined by a load, add the s_waitcnt
2051 // instruction to guarantee the right WAW order.
2052 // 2) If a destination operand was used by a recent export/store instruction,
2053 // add an s_waitcnt on exp_cnt to guarantee the WAR order.
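// Illustrative example (not taken from the source): a VALU op reading a
// VGPR written by an earlier global_load needs a load-counter wait (RAW),
// while an instruction overwriting a VGPR still being read by an
// outstanding export needs an expcnt wait first (WAR).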
2054
2055 for (const MachineMemOperand *Memop : MI.memoperands()) {
2056 const Value *Ptr = Memop->getValue();
2057 if (Memop->isStore()) {
2058 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2059 addWait(Wait, SmemAccessCounter, 0);
2060 if (PDT->dominates(MI.getParent(), It->second))
2061 SLoadAddresses.erase(It);
2062 }
2063 }
2064 unsigned AS = Memop->getAddrSpace();
2065 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
2066 continue;
2067 // No need to wait before load from VMEM to LDS.
2068 if (TII->mayWriteLDSThroughDMA(MI))
2069 continue;
2070
2071 // LOAD_CNT is only relevant to vgpr or LDS.
2072 unsigned TID = LDSDMA_BEGIN;
2073 if (Ptr && Memop->getAAInfo()) {
2074 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2075 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2076 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2077 if ((I + 1) >= NUM_LDSDMA) {
2078 // We didn't have enough slots to track this LDS DMA store; it
2079 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2080 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2081 break;
2082 }
2083
2084 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
2085 }
2086 }
2087 } else {
2088 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2089 }
2090 if (Memop->isStore()) {
2091 ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
2092 }
2093 }
2094
2095 // Loop over use and def operands.
2096 for (const MachineOperand &Op : MI.operands()) {
2097 if (!Op.isReg())
2098 continue;
2099
2100 // If the instruction does not read tied source, skip the operand.
2101 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
2102 continue;
2103
2104 MCPhysReg Reg = Op.getReg().asMCReg();
2105
2106 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
2107 if (IsVGPR) {
2108 // Implicit VGPR defs and uses are never part of the memory
2109 // instruction's description and are usually present to account for
2110 // super-register liveness.
2111 // TODO: Most of the other instructions also have implicit uses
2112 // for the liveness accounting only.
2113 if (Op.isImplicit() && MI.mayLoadOrStore())
2114 continue;
2115
2116 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2117 // previous write and this write are the same type of VMEM
2118 // instruction, in which case they are (in some architectures)
2119 // guaranteed to write their results in order anyway.
2120 // Additionally check instructions where Point Sample Acceleration
2121 // might be applied.
2122 if (Op.isUse() || !updateVMCntOnly(MI) ||
2123 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2124 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2125 !ST->hasVmemWriteVgprInOrder()) {
2126 ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
2127 ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
2128 ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
2129 ScoreBrackets.clearVgprVmemTypes(Reg);
2130 }
2131
2132 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2133 ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
2134 }
2135 ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
2136 } else if (Op.getReg() == AMDGPU::SCC) {
2137 ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
2138 } else {
2139 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
2140 }
2141
2142 if (ST->hasWaitXCnt() && Op.isDef())
2143 ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
2144 }
2145 }
2146 }
2147
2148 // Ensure safety against exceptions from outstanding memory operations while
2149 // waiting for a barrier:
2150 //
2151 // * Some subtargets safely handle backing off the barrier in hardware
2152 // when an exception occurs.
2153 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2154 // there can be no outstanding memory operations during the wait.
2155 // * Subtargets with split barriers don't need to back off the barrier; it
2156 // is up to the trap handler to preserve the user barrier state correctly.
2157 //
2158 // In all other cases, ensure safety by waiting until there are no
2159 // outstanding memory operations.
2160 if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
2161 !ST->supportsBackOffBarrier()) {
2162 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2163 }
2164
2165 // TODO: Remove this work-around, enable the assert for Bug 457939
2166 // after fixing the scheduler. Also, the Shader Compiler code is
2167 // independent of target.
2168 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
2169 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2170 Wait.DsCnt = 0;
2171 }
2172
2173 // Verify that the wait is actually needed.
2174 ScoreBrackets.simplifyWaitcnt(Wait);
2175
2176 // Since the translation of VMEM addresses occurs in-order, we can apply the
2177 // XCnt if the current instruction is of VMEM type and has a memory
2178 // dependency with another VMEM instruction in flight.
2179 if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
2180 ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
2181 Wait.XCnt = ~0u;
2182 }
2183
2184 // When forcing emit, skip terminators: emitting a waitcnt between terminators
2185 // would break the terminator sequence of the MBB.
2186 if (ForceEmitZeroFlag && !MI.isTerminator())
2187 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2188
2189 if (ForceEmitWaitcnt[LOAD_CNT])
2190 Wait.LoadCnt = 0;
2191 if (ForceEmitWaitcnt[EXP_CNT])
2192 Wait.ExpCnt = 0;
2193 if (ForceEmitWaitcnt[DS_CNT])
2194 Wait.DsCnt = 0;
2195 if (ForceEmitWaitcnt[SAMPLE_CNT])
2196 Wait.SampleCnt = 0;
2197 if (ForceEmitWaitcnt[BVH_CNT])
2198 Wait.BvhCnt = 0;
2199 if (ForceEmitWaitcnt[KM_CNT])
2200 Wait.KmCnt = 0;
2201 if (ForceEmitWaitcnt[X_CNT])
2202 Wait.XCnt = 0;
2203
2204 if (FlushVmCnt) {
2205 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2206 Wait.LoadCnt = 0;
2207 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2208 Wait.SampleCnt = 0;
2209 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2210 Wait.BvhCnt = 0;
2211 }
2212
2213 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
2214 Wait.LoadCnt = 0;
2215
2216 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2217 OldWaitcntInstr);
2218}
2219
2220bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2221 MachineBasicBlock::instr_iterator It,
2222 MachineBasicBlock &Block,
2223 WaitcntBrackets &ScoreBrackets,
2224 MachineInstr *OldWaitcntInstr) {
2225 bool Modified = false;
2226
2227 if (OldWaitcntInstr)
2228 // Try to merge the required wait with preexisting waitcnt instructions.
2229 // Also erase redundant waitcnt.
2230 Modified =
2231 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2232
2233 // Any counts that could have been applied to existing waitcnt
2234 // instructions have been applied above; now deal with any that remain.
2235 ScoreBrackets.applyWaitcnt(Wait);
2236
2237 // ExpCnt can be merged into VINTERP.
2238 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
2239 SIInstrInfo::isVINTERP(*It)) {
2240 MachineOperand *WaitExp =
2241 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2242 if (Wait.ExpCnt < WaitExp->getImm()) {
2243 WaitExp->setImm(Wait.ExpCnt);
2244 Modified = true;
2245 }
2246 Wait.ExpCnt = ~0u;
2247
2248 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2249 << "Update Instr: " << *It);
2250 }
2251
2252 if (WCG->createNewWaitcnt(Block, It, Wait))
2253 Modified = true;
2254
2255 return Modified;
2256}
2257
2258bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2259 return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
2260 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2261}
2262
2263// Return true if the next instruction is S_ENDPGM, following fallthrough
2264// blocks if necessary.
2265bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2266 MachineBasicBlock *Block) const {
2267 auto BlockEnd = Block->getParent()->end();
2268 auto BlockIter = Block->getIterator();
2269
2270 while (true) {
2271 if (It.isEnd()) {
2272 if (++BlockIter != BlockEnd) {
2273 It = BlockIter->instr_begin();
2274 continue;
2275 }
2276
2277 return false;
2278 }
2279
2280 if (!It->isMetaInstruction())
2281 break;
2282
2283 It++;
2284 }
2285
2286 assert(!It.isEnd());
2287
2288 return It->getOpcode() == AMDGPU::S_ENDPGM;
2289}
2290
2291// Add a wait after an instruction if architecture requirements mandate one.
2292bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2293 MachineBasicBlock &Block,
2294 WaitcntBrackets &ScoreBrackets) {
2295 AMDGPU::Waitcnt Wait;
2296 bool NeedsEndPGMCheck = false;
2297
2298 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2299 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2300 !SIInstrInfo::isAtomicRet(Inst));
2301
2302 if (TII->isAlwaysGDS(Inst.getOpcode())) {
2303 Wait.DsCnt = 0;
2304 NeedsEndPGMCheck = true;
2305 }
2306
2307 ScoreBrackets.simplifyWaitcnt(Wait);
2308
2309 auto SuccessorIt = std::next(Inst.getIterator());
2310 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2311 /*OldWaitcntInstr=*/nullptr);
2312
2313 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2314 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
2315 .addImm(0);
2316 }
2317
2318 return Result;
2319}
2320
2321void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2322 WaitcntBrackets *ScoreBrackets) {
2323 // Now look at the instruction opcode. If it is a memory access
2324 // instruction, update the upper-bound of the appropriate counter's
2325 // bracket and the destination operand scores.
2326 // For architectures with X_CNT, mark the source address operands
2327 // with the appropriate counter values.
2328 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2329
2330 bool IsVMEMAccess = false;
2331 bool IsSMEMAccess = false;
2332 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2333 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2334 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2335 ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
2336 ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
2337 ScoreBrackets->setPendingGDS();
2338 } else {
2339 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2340 }
2341 } else if (TII->isFLAT(Inst)) {
2342 if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) {
2343 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2344 return;
2345 }
2346
2347 assert(Inst.mayLoadOrStore());
2348
2349 int FlatASCount = 0;
2350
2351 if (TII->mayAccessVMEMThroughFlat(Inst)) {
2352 ++FlatASCount;
2353 IsVMEMAccess = true;
2354 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2355 }
2356
2357 if (TII->mayAccessLDSThroughFlat(Inst)) {
2358 ++FlatASCount;
2359 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2360 }
2361
2362 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2363 // pointers. They do have two operands that each access global and LDS, thus
2364 // making it appear at this point that they are using a flat pointer. Filter
2365 // them out, and for the rest, generate a dependency on flat pointers so
2366 // that both VM and LGKM counters are flushed.
2367 if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1)
2368 ScoreBrackets->setPendingFlat();
2369 } else if (SIInstrInfo::isVMEM(Inst) &&
2370 !AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2371 IsVMEMAccess = true;
2372 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2373
2374 if (ST->vmemWriteNeedsExpWaitcnt() &&
2375 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2376 ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
2377 }
2378 } else if (TII->isSMRD(Inst)) {
2379 IsSMEMAccess = true;
2380 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2381 } else if (Inst.isCall()) {
2382 if (callWaitsOnFunctionReturn(Inst)) {
2383 // Act as a wait on everything
2384 ScoreBrackets->applyWaitcnt(
2385 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2386 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2387 } else {
2388 // May need to wait for anything.
2389 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2390 }
2391 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2392 ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
2393 } else if (TII->isVINTERP(Inst)) {
2394 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2395 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2396 } else if (SIInstrInfo::isEXP(Inst)) {
2397 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2398 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2399 ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
2400 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2401 ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
2402 else
2403 ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
2404 } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
2405 ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
2406 } else {
2407 switch (Inst.getOpcode()) {
2408 case AMDGPU::S_SENDMSG:
2409 case AMDGPU::S_SENDMSG_RTN_B32:
2410 case AMDGPU::S_SENDMSG_RTN_B64:
2411 case AMDGPU::S_SENDMSGHALT:
2412 ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
2413 break;
2414 case AMDGPU::S_MEMTIME:
2415 case AMDGPU::S_MEMREALTIME:
2416 case AMDGPU::S_GET_BARRIER_STATE_M0:
2417 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2418 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2419 break;
2420 }
2421 }
2422
2423 if (!ST->hasWaitXCnt())
2424 return;
2425
2426 if (IsVMEMAccess)
2427 ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);
2428
2429 if (IsSMEMAccess)
2430 ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
2431}
2432
2433bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2434 unsigned OtherScore) {
2435 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2436 unsigned OtherShifted =
2437 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2438 Score = std::max(MyShifted, OtherShifted);
2439 return OtherShifted > MyShifted;
2440}
2441
2442/// Merge the pending events and associated score brackets of \p Other into
2443/// this brackets status.
2444///
2445/// Returns whether the merge resulted in a change that requires tighter waits
2446/// (i.e. the merged brackets strictly dominate the original brackets).
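/// Illustrative example (not taken from the source): if this block's pending
/// LOAD_CNT range is [10,13] and \p Other's is [0,5], the merged upper bound
/// becomes 15 (= 10 + max(3,5)). A register score of 12 here is rebased to 14
/// and a score of 3 from \p Other to 13, so the relative ages of outstanding
/// operations are preserved in the merged bracket.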
2447bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2448 bool StrictDom = false;
2449
2450 // Check if "other" has keys we don't have, and create default entries for
2451 // those. If they remain empty after merging, we will clean them up afterwards.
2452 for (auto K : Other.VMem.keys())
2453 VMem.try_emplace(K);
2454 for (auto K : Other.SGPRs.keys())
2455 SGPRs.try_emplace(K);
2456
2457 for (auto T : inst_counter_types(Context->MaxCounter)) {
2458 // Merge event flags for this counter
2459 const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
2460 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2461 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2462 if (OtherEvents & ~OldEvents)
2463 StrictDom = true;
2464 PendingEvents |= OtherEvents;
2465
2466 // Merge scores for this counter
2467 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2468 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2469 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2470 if (NewUB < ScoreLBs[T])
2471 report_fatal_error("waitcnt score overflow");
2472
2473 MergeInfo M;
2474 M.OldLB = ScoreLBs[T];
2475 M.OtherLB = Other.ScoreLBs[T];
2476 M.MyShift = NewUB - ScoreUBs[T];
2477 M.OtherShift = NewUB - Other.ScoreUBs[T];
2478
2479 ScoreUBs[T] = NewUB;
2480
2481 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2482
2483 if (T == DS_CNT)
2484 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2485
2486 if (T == KM_CNT) {
2487 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
2488 if (Other.hasPendingEvent(SCC_WRITE)) {
2489 unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
2490 if (!OldEventsHasSCCWrite) {
2491 PendingSCCWrite = Other.PendingSCCWrite;
2492 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
2493 PendingSCCWrite = nullptr;
2494 }
2495 }
2496 }
2497
2498 for (auto &[RegID, Info] : VMem)
2499 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
2500
2501 if (isSmemCounter(T)) {
2502 unsigned Idx = getSgprScoresIdx(T);
2503 for (auto &[RegID, Info] : SGPRs) {
2504 auto It = Other.SGPRs.find(RegID);
2505 unsigned OtherScore =
2506 (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
2507 StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore);
2508 }
2509 }
2510 }
2511
2512 for (auto &[TID, Info] : VMem) {
2513 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
2514 unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
2515 StrictDom |= NewVmemTypes != Info.VMEMTypes;
2516 Info.VMEMTypes = NewVmemTypes;
2517 }
2518 }
2519
2520 purgeEmptyTrackingData();
2521 return StrictDom;
2522}
2523
2524static bool isWaitInstr(MachineInstr &Inst) {
2525 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2526 return Opcode == AMDGPU::S_WAITCNT ||
2527 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2528 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2529 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2530 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2531 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2532 counterTypeForInstr(Opcode).has_value();
2533}
2534
2535// Generate s_waitcnt instructions where needed.
2536bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2537 MachineBasicBlock &Block,
2538 WaitcntBrackets &ScoreBrackets) {
2539 bool Modified = false;
2540
2541 LLVM_DEBUG({
2542 dbgs() << "*** Begin Block: ";
2543 Block.printName(dbgs());
2544 ScoreBrackets.dump();
2545 });
2546
2547 // Track the correctness of vccz through this basic block. There are two
2548 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2549 // ST->partialVCCWritesUpdateVCCZ().
2550 bool VCCZCorrect = true;
2551 if (ST->hasReadVCCZBug()) {
2552 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2553 // to vcc and then issued an smem load.
2554 VCCZCorrect = false;
2555 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2556 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2557 // to vcc_lo or vcc_hi.
2558 VCCZCorrect = false;
2559 }
2560
2561 // Walk over the instructions.
2562 MachineInstr *OldWaitcntInstr = nullptr;
2563
2564 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2565 E = Block.instr_end();
2566 Iter != E;) {
2567 MachineInstr &Inst = *Iter;
2568 if (Inst.isMetaInstruction()) {
2569 ++Iter;
2570 continue;
2571 }
2572
2573 // Track pre-existing waitcnts that were added in earlier iterations or by
2574 // the memory legalizer.
2575 if (isWaitInstr(Inst)) {
2576 if (!OldWaitcntInstr)
2577 OldWaitcntInstr = &Inst;
2578 ++Iter;
2579 continue;
2580 }
2581
2582 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2583 isPreheaderToFlush(Block, ScoreBrackets);
2584
2585 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2586 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2587 FlushVmCnt);
2588 OldWaitcntInstr = nullptr;
2589
2590 // Restore vccz if it's not known to be correct already.
2591 bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst);
2592
2593 // Don't examine operands unless we need to track vccz correctness.
2594 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2595 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2596 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2597 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2598 if (!ST->partialVCCWritesUpdateVCCZ())
2599 VCCZCorrect = false;
2600 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2601 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2602 // vccz bit, so when we detect that an instruction may read from a
2603 // corrupt vccz bit, we need to:
2604 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2605 // operations to complete.
2606 // 2. Restore the correct value of vccz by writing the current value
2607 // of vcc back to vcc.
2608 if (ST->hasReadVCCZBug() &&
2609 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2610 // Writes to vcc while there's an outstanding smem read may get
2611 // clobbered as soon as any read completes.
2612 VCCZCorrect = false;
2613 } else {
2614 // Writes to vcc will fix any incorrect value in vccz.
2615 VCCZCorrect = true;
2616 }
2617 }
2618 }
2619
2620 if (TII->isSMRD(Inst)) {
2621 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2622 // No need to handle invariant loads when avoiding WAR conflicts, as
2623 // there cannot be a vector store to the same memory location.
2624 if (!Memop->isInvariant()) {
2625 const Value *Ptr = Memop->getValue();
2626 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2627 }
2628 }
2629 if (ST->hasReadVCCZBug()) {
2630 // This smem read could complete and clobber vccz at any time.
2631 VCCZCorrect = false;
2632 }
2633 }
2634
2635 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2636
2637 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
2638
2639 LLVM_DEBUG({
2640 Inst.print(dbgs());
2641 ScoreBrackets.dump();
2642 });
2643
2644 // TODO: Remove this work-around after fixing the scheduler and enable the
2645 // assert above.
2646 if (RestoreVCCZ) {
2647 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2648 // bit is updated, so we can restore the bit by reading the value of
2649 // vcc and then writing it back to the register.
2650 BuildMI(Block, Inst, Inst.getDebugLoc(),
2651 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2652 TRI->getVCC())
2653 .addReg(TRI->getVCC());
2654 VCCZCorrect = true;
2655 Modified = true;
2656 }
2657
2658 ++Iter;
2659 }
2660
2661 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2662 // needed.
2663 AMDGPU::Waitcnt Wait;
2664 if (Block.getFirstTerminator() == Block.end() &&
2665 isPreheaderToFlush(Block, ScoreBrackets)) {
2666 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2667 Wait.LoadCnt = 0;
2668 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2669 Wait.SampleCnt = 0;
2670 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2671 Wait.BvhCnt = 0;
2672 }
2673
2674 // Combine or remove any redundant waitcnts at the end of the block.
2675 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2676 OldWaitcntInstr);
2677
2678 LLVM_DEBUG({
2679 dbgs() << "*** End Block: ";
2680 Block.printName(dbgs());
2681 ScoreBrackets.dump();
2682 });
2683
2684 return Modified;
2685}
2686
2687// Return true if the given machine basic block is a preheader of a loop in
2688// which we want to flush the vmcnt counter, and false otherwise.
2689bool SIInsertWaitcnts::isPreheaderToFlush(
2690 MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
2691 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2692 if (!IsInserted)
2693 return Iterator->second;
2694
2695 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2696 if (!Succ)
2697 return false;
2698
2699 MachineLoop *Loop = MLI->getLoopFor(Succ);
2700 if (!Loop)
2701 return false;
2702
2703 if (Loop->getLoopPreheader() == &MBB &&
2704 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2705 Iterator->second = true;
2706 return true;
2707 }
2708
2709 return false;
2710}
2711
2712bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2713 if (SIInstrInfo::isFLAT(MI))
2714 return TII->mayAccessVMEMThroughFlat(MI);
2715 return SIInstrInfo::isVMEM(MI);
2716}
2717
2718// Return true if it is better to flush the vmcnt counter in the preheader of
2719// the given loop. We currently decide to flush in two situations:
2720// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2721// vgpr containing a value that is loaded outside of the loop. (Only on
2722// targets with no vscnt counter).
2723// 2. The loop contains vmem load(s), but the loaded values are not used in the
2724// loop, and at least one use of a vgpr containing a value that is loaded
2725// outside of the loop.
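// Illustration of case 2. (not taken from the source): a loop whose body
// issues vmem loads that are only consumed after the loop, while also reading
// a vgpr loaded in the preheader, is better served by one vmcnt flush in the
// preheader than by waiting on the preheader load inside every iteration.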
2726bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2727 const WaitcntBrackets &Brackets) {
2728 bool HasVMemLoad = false;
2729 bool HasVMemStore = false;
2730 bool UsesVgprLoadedOutside = false;
2731 DenseSet<MCRegUnit> VgprUse;
2732 DenseSet<MCRegUnit> VgprDef;
2733
2734 for (MachineBasicBlock *MBB : ML->blocks()) {
2735 for (MachineInstr &MI : *MBB) {
2736 if (isVMEMOrFlatVMEM(MI)) {
2737 HasVMemLoad |= MI.mayLoad();
2738 HasVMemStore |= MI.mayStore();
2739 }
2740
2741 for (const MachineOperand &Op : MI.all_uses()) {
2742 if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
2743 continue;
2744 // Vgpr use
2745 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
2746 // If we find a register that is loaded inside the loop, 1. and 2.
2747 // are invalidated and we can exit.
2748 if (VgprDef.contains(RU))
2749 return false;
2750 VgprUse.insert(RU);
2751 // If at least one of Op's registers is in the score brackets, the
2752 // value is likely loaded outside of the loop.
2753 VMEMID ID = toVMEMID(RU);
2754 if (Brackets.getVMemScore(ID, LOAD_CNT) >
2755 Brackets.getScoreLB(LOAD_CNT) ||
2756 Brackets.getVMemScore(ID, SAMPLE_CNT) >
2757 Brackets.getScoreLB(SAMPLE_CNT) ||
2758 Brackets.getVMemScore(ID, BVH_CNT) >
2759 Brackets.getScoreLB(BVH_CNT)) {
2760 UsesVgprLoadedOutside = true;
2761 break;
2762 }
2763 }
2764 }
2765
2766 // VMem load vgpr def
2767 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
2768 for (const MachineOperand &Op : MI.all_defs()) {
2769 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
2770 // If we find a register that is loaded inside the loop, 1. and 2.
2771 // are invalidated and we can exit.
2772 if (VgprUse.contains(RU))
2773 return false;
2774 VgprDef.insert(RU);
2775 }
2776 }
2777 }
2778 }
2779 }
2780 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2781 return true;
2782 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2783}
2784
2785bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
2786 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2787 auto *PDT =
2788 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2789 AliasAnalysis *AA = nullptr;
2790 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2791 AA = &AAR->getAAResults();
2792
2793 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
2794}
2795
2796PreservedAnalyses
2797SIInsertWaitcntsPass::run(MachineFunction &MF,
2798 MachineFunctionAnalysisManager &MFAM) {
2799 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
2800 auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
2801 auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2802 .getManager()
2803 .getCachedResult<AAManager>(MF.getFunction());
2804
2805 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
2806 return PreservedAnalyses::all();
2807
2808 return getMachineFunctionPassPreservedAnalyses()
2809 .preserveSet<CFGAnalyses>()
2810 .preserve<AAManager>();
2811}
2812
2813bool SIInsertWaitcnts::run(MachineFunction &MF) {
2814 ST = &MF.getSubtarget<GCNSubtarget>();
2815 TII = ST->getInstrInfo();
2816 TRI = &TII->getRegisterInfo();
2817 MRI = &MF.getRegInfo();
2818 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2819
2820 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
2821
2822 if (ST->hasExtendedWaitCounts()) {
2823 MaxCounter = NUM_EXTENDED_INST_CNTS;
2824 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2825 WCG = &WCGGFX12Plus;
2826 } else {
2827 MaxCounter = NUM_NORMAL_INST_CNTS;
2828 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, MaxCounter);
2829 WCG = &WCGPreGFX12;
2830 }
2831
2832 for (auto T : inst_counter_types())
2833 ForceEmitWaitcnt[T] = false;
2834
2835 WaitEventMaskForInst = WCG->getWaitEventMask();
2836
2837 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2838
2839 if (ST->hasExtendedWaitCounts()) {
2840 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2841 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2842 } else {
2843 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2844 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2845 }
2846 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2847 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2848 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2849 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2850 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2851 Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
2852
2853 BlockInfos.clear();
2854 bool Modified = false;
2855
2856 MachineBasicBlock &EntryBB = MF.front();
2857
2858 if (!MFI->isEntryFunction()) {
2859 // Wait for any outstanding memory operations that the input registers may
2860 // depend on. We can't track them and it's better to do the wait after the
2861 // costly call sequence.
2862
2863 // TODO: Could insert earlier and schedule more liberally with operations
2864 // that only use caller preserved registers.
2865 MachineBasicBlock::iterator I = EntryBB.begin();
2866 while (I != EntryBB.end() && I->isMetaInstruction())
2867 ++I;
2868
2869 if (ST->hasExtendedWaitCounts()) {
2870 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2871 .addImm(0);
2872 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2873 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
2874 continue;
2875
2876 if (!ST->hasImageInsts() &&
2877 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
2878 continue;
2879
2880 BuildMI(EntryBB, I, DebugLoc(),
2881 TII->get(instrsForExtendedCounterTypes[CT]))
2882 .addImm(0);
2883 }
2884 } else {
2885 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2886 }
2887
2888 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
2889 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2890 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2891
2892 Modified = true;
2893 }
2894
2895 // Keep iterating over the blocks in reverse post order, inserting and
2896 // updating s_waitcnt where needed, until a fix point is reached.
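// For example (illustrative): if a loop body issues a VMEM load, the brackets
// carried around the backedge gain a pending event the header did not see on
// its first visit, so the header is marked dirty and processed again with the
// merged incoming state.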
2897 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2898 BlockInfos.try_emplace(MBB);
2899
2900 std::unique_ptr<WaitcntBrackets> Brackets;
2901 bool Repeat;
2902 do {
2903 Repeat = false;
2904
2905 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2906 ++BII) {
2907 MachineBasicBlock *MBB = BII->first;
2908 BlockInfo &BI = BII->second;
2909 if (!BI.Dirty)
2910 continue;
2911
2912 if (BI.Incoming) {
2913 if (!Brackets)
2914 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2915 else
2916 *Brackets = *BI.Incoming;
2917 } else {
2918 if (!Brackets) {
2919 Brackets = std::make_unique<WaitcntBrackets>(this);
2920 } else {
2921 // Reinitialize in-place. N.B. do not do this by assigning from a
2922 // temporary because the WaitcntBrackets class is large and it could
2923 // cause this function to use an unreasonable amount of stack space.
2924 Brackets->~WaitcntBrackets();
2925 new (Brackets.get()) WaitcntBrackets(this);
2926 }
2927 }
2928
2929 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2930 BI.Dirty = false;
2931
2932 if (Brackets->hasPendingEvent()) {
2933 BlockInfo *MoveBracketsToSucc = nullptr;
2934 for (MachineBasicBlock *Succ : MBB->successors()) {
2935 auto *SuccBII = BlockInfos.find(Succ);
2936 BlockInfo &SuccBI = SuccBII->second;
2937 if (!SuccBI.Incoming) {
2938 SuccBI.Dirty = true;
2939 if (SuccBII <= BII) {
2940 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2941 Repeat = true;
2942 }
2943 if (!MoveBracketsToSucc) {
2944 MoveBracketsToSucc = &SuccBI;
2945 } else {
2946 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2947 }
2948 } else if (SuccBI.Incoming->merge(*Brackets)) {
2949 SuccBI.Dirty = true;
2950 if (SuccBII <= BII) {
2951 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2952 Repeat = true;
2953 }
2954 }
2955 }
2956 if (MoveBracketsToSucc)
2957 MoveBracketsToSucc->Incoming = std::move(Brackets);
2958 }
2959 }
2960 } while (Repeat);
2961
2962 if (ST->hasScalarStores()) {
2963 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2964 bool HaveScalarStores = false;
2965
2966 for (MachineBasicBlock &MBB : MF) {
2967 for (MachineInstr &MI : MBB) {
2968 if (!HaveScalarStores && TII->isScalarStore(MI))
2969 HaveScalarStores = true;
2970
2971 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2972 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2973 EndPgmBlocks.push_back(&MBB);
2974 }
2975 }
2976
2977 if (HaveScalarStores) {
2978 // If scalar writes are used, the cache must be flushed or else the next
2979 // wave to reuse the same scratch memory can be clobbered.
2980 //
2981 // Insert s_dcache_wb at wave termination points if there were any scalar
2982 // stores, and only if the cache hasn't already been flushed. This could
2983 // be improved by looking across blocks for flushes in postdominating
2984 // blocks from the stores but an explicitly requested flush is probably
2985 // very rare.
2986 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2987 bool SeenDCacheWB = false;
2988
2989 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2990 I != E; ++I) {
2991 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2992 SeenDCacheWB = true;
2993 else if (TII->isScalarStore(*I))
2994 SeenDCacheWB = false;
2995
2996 // FIXME: It would be better to insert this before a waitcnt if any.
2997 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2998 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2999 !SeenDCacheWB) {
3000 Modified = true;
3001 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
3002 }
3003 }
3004 }
3005 }
3006 }
3007
3008 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
3009 // This is done in different ways depending on how the VGPRs were allocated
3010 // (i.e. whether we're in dynamic VGPR mode or not).
3011 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
3012 // waveslot limited kernel runs slower with the deallocation.
3013 if (MFI->isDynamicVGPREnabled()) {
3014 for (MachineInstr *MI : ReleaseVGPRInsts) {
3015 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3016 TII->get(AMDGPU::S_ALLOC_VGPR))
3017 .addImm(0);
3018 Modified = true;
3019 }
3020 } else {
3021 if (!ReleaseVGPRInsts.empty() &&
3022 (MF.getFrameInfo().hasCalls() ||
3023 ST->getOccupancyWithNumVGPRs(
3024 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
3025 /*IsDynamicVGPR=*/false) <
3027 for (MachineInstr *MI : ReleaseVGPRInsts) {
3028 if (ST->requiresNopBeforeDeallocVGPRs()) {
3029 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3030 TII->get(AMDGPU::S_NOP))
3031 .addImm(0);
3032 }
3033 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3034 TII->get(AMDGPU::S_SENDMSG))
3035 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
3036 Modified = true;
3037 }
3038 }
3039 }
3040 ReleaseVGPRInsts.clear();
3041 PreheadersToFlush.clear();
3042 SLoadAddresses.clear();
3043
3044 return Modified;
3045}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
#define AMDGPU_EVENT_NAME(Name)
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
static bool shouldExecute(CounterInfo &Counter)
static bool isCounterSet(CounterInfo &Info)
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
bool isMetaInstruction(QueryType Type=IgnoreBundle) const
Return true if this instruction doesn't produce any output in the form of executable instructions.
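A short sketch combining several of the MachineInstr queries above, counting the memory instructions in a block (purely illustrative):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

static unsigned countMemoryInstrs(const llvm::MachineBasicBlock &MBB) {
  unsigned NumMem = 0;
  for (const llvm::MachineInstr &MI : MBB) {
    if (MI.isMetaInstruction()) // skip bookkeeping that emits no machine code
      continue;
    if (MI.mayLoadOrStore())    // i.e. mayLoad() || mayStore()
      ++NumMem;
  }
  return NumMem;
}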
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
Register getReg() const
getReg - Returns the register number.
iterator end()
Definition MapVector.h:67
iterator find(const KeyT &Key)
Definition MapVector.h:154
iterator begin()
Definition MapVector.h:65
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
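These PreservedAnalyses helpers usually appear in a new-pass-manager run() method; a hedged sketch (the pass name is a placeholder, and getMachineFunctionPassPreservedAnalyses() is listed further below):

#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/IR/Analysis.h"
#include "llvm/IR/PassManager.h"

// Illustrative new-PM machine pass, not the pass defined in this file.
struct ExampleNewPMPass : llvm::PassInfoMixin<ExampleNewPMPass> {
  llvm::PreservedAnalyses run(llvm::MachineFunction &MF,
                              llvm::MachineFunctionAnalysisManager &MFAM) {
    bool Changed = false;
    // ... transform MF, setting Changed as appropriate ...
    if (!Changed)
      return llvm::PreservedAnalyses::all();
    llvm::PreservedAnalyses PA =
        llvm::getMachineFunctionPassPreservedAnalyses();
    PA.preserveSet<llvm::CFGAnalyses>(); // the CFG was not modified
    return PA;
  }
};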
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs with the length computed at compile time.
Definition StringRef.h:854
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes the decoded values into Vmcnt, Expcnt and Lgkmcnt respectively.
unsigned getStorecntBitMask(const IsaVersion &Version)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
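As an illustration of the legacy (pre-gfx12) waitcnt helpers above, a sketch that clears only the vmcnt field of an encoded immediate; the include path and GPU name are assumptions of this example:

#include "Utils/AMDGPUBaseInfo.h" // assumes the AMDGPU target-internal headers
#include "llvm/ADT/StringRef.h"

static unsigned clearVmcnt(llvm::StringRef GPU, unsigned WaitcntImm) {
  using namespace llvm::AMDGPU;
  IsaVersion IV = getIsaVersion(GPU); // e.g. "gfx906"
  unsigned Vmcnt, Expcnt, Lgkmcnt;
  decodeWaitcnt(IV, WaitcntImm, Vmcnt, Expcnt, Lgkmcnt);
  // Re-encode with vmcnt forced to zero, keeping the other fields intact.
  return encodeWaitcnt(IV, /*Vmcnt=*/0, Expcnt, Lgkmcnt);
}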
unsigned getSamplecntBitMask(const IsaVersion &Version)
unsigned getKmcntBitMask(const IsaVersion &Version)
unsigned getVmcntBitMask(const IsaVersion &Version)
unsigned getXcntBitMask(const IsaVersion &Version)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getBvhcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned getLoadcntBitMask(const IsaVersion &Version)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned getDscntBitMask(const IsaVersion &Version)
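These per-counter bit masks double as each counter's maximum value, and a field left at its maximum is effectively not waited on; a short hedged sketch under the same include-path assumption as above:

#include "Utils/AMDGPUBaseInfo.h"

// Legacy waitcnt immediate that waits on nothing (every field at its maximum).
static unsigned noWaitLegacyImm(const llvm::AMDGPU::IsaVersion &IV) {
  return llvm::AMDGPU::encodeWaitcnt(IV, llvm::AMDGPU::getVmcntBitMask(IV),
                                     llvm::AMDGPU::getExpcntBitMask(IV),
                                     llvm::AMDGPU::getLgkmcntBitMask(IV));
}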
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Undef
Value of the register doesn't matter.
initializer< Ty > init(const Ty &Val)
bool empty() const
Definition BasicBlock.h:101
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
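A hedged sketch of the BuildMI chain described here, together with the addReg/addImm entries earlier; the opcode and register operands are placeholders, not taken from this pass:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"

// Insert "<Opcode> DstReg, SrcReg, 42" before InsertPt.
static void emitExampleInstr(llvm::MachineBasicBlock &MBB,
                             llvm::MachineBasicBlock::iterator InsertPt,
                             const llvm::TargetInstrInfo &TII, unsigned Opcode,
                             llvm::Register DstReg, llvm::Register SrcReg) {
  llvm::BuildMI(MBB, InsertPt, llvm::DebugLoc(), TII.get(Opcode), DstReg)
      .addReg(SrcReg)
      .addImm(42);
}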
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition Sequence.h:337
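A self-contained example of enum_seq together with the enum_iteration_traits opt-in it relies on (the enum here is invented for illustration):

#include "llvm/ADT/Sequence.h"

enum class Stage { Fetch, Decode, Execute, Retire, NumStages };

namespace llvm {
template <> struct enum_iteration_traits<Stage> {
  static constexpr bool is_iterable = true;
};
} // namespace llvm

static unsigned countStages() {
  unsigned N = 0;
  // Visits Fetch, Decode, Execute and Retire; NumStages itself is excluded.
  for (Stage S : llvm::enum_seq(Stage::Fetch, Stage::NumStages)) {
    (void)S;
    ++N;
  }
  return N; // 4
}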
@ Wait
Definition Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iterators.
Definition STLExtras.h:632
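A sketch of why make_early_inc_range matters when erasing while iterating, here deleting meta-instructions from a block (purely illustrative):

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

static void removeMetaInstrs(llvm::MachineBasicBlock &MBB) {
  // The adaptor advances its iterator before the loop body runs, so erasing
  // the current instruction does not invalidate the traversal.
  for (llvm::MachineInstr &MI : llvm::make_early_inc_range(MBB))
    if (MI.isMetaInstruction())
      MI.eraseFromParent();
}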
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
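And a one-line example of the range-based all_of wrapper:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

static bool allNonNegative(const llvm::SmallVector<int, 4> &Vals) {
  return llvm::all_of(Vals, [](int V) { return V >= 0; });
}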
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
Definition MCRegister.h:21
DWARFExpression::Operation Op
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
Instruction set architecture version.
Represents the counter values to wait for in an s_waitcnt instruction.