SIInsertWaitcnts.cpp
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
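// Illustration of the TODO above (a hypothetical instruction sequence with
// pre-gfx12 counters, not taken from a real shader):
//   s_load_dwordx4 s[0:3], ...   ; SMEM  -> increments lgkmcnt
//   ds_read_b32    v0, ...       ; LDS   -> increments lgkmcnt
//   v_add_f32      v1, v0, v0    ; needs only the LDS result
// Because SMEM and LDS events share lgkmcnt and may complete out of order,
// the pass currently emits s_waitcnt lgkmcnt(0) before the add, even though
// waiting for just the most recent LDS operation would be sufficient.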
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
28#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29#include "SIMachineFunctionInfo.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/MapVector.h"
32#include "llvm/ADT/PostOrderIterator.h"
33#include "llvm/ADT/Sequence.h"
34#include "llvm/Analysis/AliasAnalysis.h"
35#include "llvm/CodeGen/MachineLoopInfo.h"
36#include "llvm/CodeGen/MachinePostDominators.h"
37#include "llvm/InitializePasses.h"
38#include "llvm/Support/DebugCounter.h"
39#include "llvm/TargetParser/TargetParser.h"
40using namespace llvm;
41
42#define DEBUG_TYPE "si-insert-waitcnts"
43
44DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
45 "Force emit s_waitcnt expcnt(0) instrs");
46DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
47 "Force emit s_waitcnt lgkmcnt(0) instrs");
48DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
49 "Force emit s_waitcnt vmcnt(0) instrs");
50
52 "amdgpu-waitcnt-forcezero",
53 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
54 cl::init(false), cl::Hidden);
55
56namespace {
57// Class of object that encapsulates latest instruction counter score
58// associated with the operand. Used for determining whether
59// s_waitcnt instruction needs to be emitted.
60
61enum InstCounterType {
62 LOAD_CNT = 0, // VMcnt prior to gfx12.
63 DS_CNT, // LGKMcnt prior to gfx12.
64 EXP_CNT, //
65 STORE_CNT, // VScnt in gfx10/gfx11.
66 NUM_NORMAL_INST_CNTS,
67 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
68 BVH_CNT, // gfx12+ only.
69 KM_CNT, // gfx12+ only.
70 NUM_EXTENDED_INST_CNTS,
71 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
72};
73} // namespace
74
75namespace llvm {
76template <> struct enum_iteration_traits<InstCounterType> {
77 static constexpr bool is_iterable = true;
78};
79} // namespace llvm
80
81namespace {
82// Return an iterator over all counters between LOAD_CNT (the first counter)
83// and \c MaxCounter (exclusive, default value yields an enumeration over
84// all counters).
85auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
86 return enum_seq(LOAD_CNT, MaxCounter);
87}
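// For example (illustrative): inst_counter_types(NUM_NORMAL_INST_CNTS) visits
// LOAD_CNT, DS_CNT, EXP_CNT and STORE_CNT, while the default
// inst_counter_types() additionally visits SAMPLE_CNT, BVH_CNT and KM_CNT.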
88
89using RegInterval = std::pair<int, int>;
90
91struct HardwareLimits {
92 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
93 unsigned ExpcntMax;
94 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
95 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
96 unsigned SamplecntMax; // gfx12+ only.
97 unsigned BvhcntMax; // gfx12+ only.
98 unsigned KmcntMax; // gfx12+ only.
99};
100
101struct RegisterEncoding {
102 unsigned VGPR0;
103 unsigned VGPRL;
104 unsigned SGPR0;
105 unsigned SGPRL;
106};
107
108enum WaitEventType {
109 VMEM_ACCESS, // vector-memory read & write
110 VMEM_READ_ACCESS, // vector-memory read
111 VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
112 VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
113 VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
114 SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
115 LDS_ACCESS, // lds read & write
116 GDS_ACCESS, // gds read & write
117 SQ_MESSAGE, // send message
118 SMEM_ACCESS, // scalar-memory read & write
119 EXP_GPR_LOCK, // export holding on its data src
120 GDS_GPR_LOCK, // GDS holding on its data and addr src
121 EXP_POS_ACCESS, // write to export position
122 EXP_PARAM_ACCESS, // write to export parameter
123 VMW_GPR_LOCK, // vector-memory write holding on its data src
124 EXP_LDS_ACCESS, // read by ldsdir counting as export
125 NUM_WAIT_EVENTS,
126};
127
128// The mapping is:
129// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
130// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
131// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
132// We reserve a fixed number of VGPR slots in the scoring tables for
133// special tokens like SCMEM_LDS (needed for buffer load to LDS).
134enum RegisterMapping {
135 SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
136 AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
137 SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
138 NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
139 // Artificial register slots to track LDS writes into specific LDS locations
140 // if a location is known. When slots are exhausted or the location is
141 // unknown, use the first slot. The first slot is also always updated in
142 // addition to known location's slot to properly generate waits if dependent
143 // instruction's location is unknown.
144 EXTRA_VGPR_LDS = 0,
145 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
146};
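// Worked example of the slot layout above (indices are illustrative and
// assume a zero-based hardware encoding):
//   ArchVGPR v5           -> slot 5
//   AGPR     a5           -> slot 5 + AGPR_OFFSET                   = 261
//   LDS DMA  (first slot) -> slot SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS = 512
//   SGPR     s7           -> slot NUM_ALL_VGPRS + 7                 = 528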
147
148// Enumerate different types of result-returning VMEM operations. Although
149// s_waitcnt orders them all with a single vmcnt counter, in the absence of
150// s_waitcnt only instructions of the same VmemType are guaranteed to write
151// their results in order -- so there is no need to insert an s_waitcnt between
152// two instructions of the same type that write the same vgpr.
153enum VmemType {
154 // BUF instructions and MIMG instructions without a sampler.
155 VMEM_NOSAMPLER,
156 // MIMG instructions with a sampler.
157 VMEM_SAMPLER,
158 // BVH instructions
159 VMEM_BVH,
160 NUM_VMEM_TYPES
161};
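// For example: two BUF loads (both VMEM_NOSAMPLER) that write the same vgpr
// return their results in order, so no intervening wait is needed; a BUF load
// followed by an image sample (VMEM_SAMPLER) writing the same vgpr may
// complete out of order, so a vmcnt wait is required between them.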
162
163// Maps values of InstCounterType to the instruction that waits on that
164// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
165// returns true.
166static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
167 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
168 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
169 AMDGPU::S_WAIT_KMCNT};
170
171static bool updateVMCntOnly(const MachineInstr &Inst) {
172 return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
173 SIInstrInfo::isFLATScratch(Inst);
174}
175
176#ifndef NDEBUG
177static bool isNormalMode(InstCounterType MaxCounter) {
178 return MaxCounter == NUM_NORMAL_INST_CNTS;
179}
180#endif // NDEBUG
181
182VmemType getVmemType(const MachineInstr &Inst) {
183 assert(updateVMCntOnly(Inst));
184 if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
185 !SIInstrInfo::isVSAMPLE(Inst))
186 return VMEM_NOSAMPLER;
187 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
188 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
189 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
190 return BaseInfo->BVH ? VMEM_BVH
191 : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
193
194unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
195 switch (T) {
196 case LOAD_CNT:
197 return Wait.LoadCnt;
198 case EXP_CNT:
199 return Wait.ExpCnt;
200 case DS_CNT:
201 return Wait.DsCnt;
202 case STORE_CNT:
203 return Wait.StoreCnt;
204 case SAMPLE_CNT:
205 return Wait.SampleCnt;
206 case BVH_CNT:
207 return Wait.BvhCnt;
208 case KM_CNT:
209 return Wait.KmCnt;
210 default:
211 llvm_unreachable("bad InstCounterType");
212 }
213}
214
215void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
216 unsigned &WC = getCounterRef(Wait, T);
217 WC = std::min(WC, Count);
218}
219
220void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
221 getCounterRef(Wait, T) = ~0u;
222}
223
224unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
225 return getCounterRef(Wait, T);
226}
227
228// Mapping from event to counter according to the table masks.
229InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
230 for (auto T : inst_counter_types()) {
231 if (masks[T] & (1 << E))
232 return T;
233 }
234 llvm_unreachable("event type has no associated counter");
235}
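// Example (using the mask tables defined further below): for E == SMEM_ACCESS
// only masks[DS_CNT] has bit (1 << SMEM_ACCESS) set in the pre-gfx12 table, so
// eventCounter() returns DS_CNT; with the gfx12+ table the same event maps to
// KM_CNT.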
236
237// This object maintains the current score brackets of each wait counter, and
238// a per-register scoreboard for each wait counter.
239//
240// We also maintain the latest score for every event type that can change the
241// waitcnt, in order to know if there are multiple types of events within
242// the brackets. When multiple event types happen within the brackets, the
243// wait count may get decremented out of order, therefore we need to put in an
244// "s_waitcnt 0" before use.
245class WaitcntBrackets {
246public:
247 WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
248 HardwareLimits Limits, RegisterEncoding Encoding,
249 const unsigned *WaitEventMaskForInst,
250 InstCounterType SmemAccessCounter)
251 : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
252 Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
253 SmemAccessCounter(SmemAccessCounter) {}
254
255 unsigned getWaitCountMax(InstCounterType T) const {
256 switch (T) {
257 case LOAD_CNT:
258 return Limits.LoadcntMax;
259 case DS_CNT:
260 return Limits.DscntMax;
261 case EXP_CNT:
262 return Limits.ExpcntMax;
263 case STORE_CNT:
264 return Limits.StorecntMax;
265 case SAMPLE_CNT:
266 return Limits.SamplecntMax;
267 case BVH_CNT:
268 return Limits.BvhcntMax;
269 case KM_CNT:
270 return Limits.KmcntMax;
271 default:
272 break;
273 }
274 return 0;
275 }
276
277 unsigned getScoreLB(InstCounterType T) const {
278 assert(T < NUM_INST_CNTS);
279 return ScoreLBs[T];
280 }
281
282 unsigned getScoreUB(InstCounterType T) const {
283 assert(T < NUM_INST_CNTS);
284 return ScoreUBs[T];
285 }
286
287 unsigned getScoreRange(InstCounterType T) const {
288 return getScoreUB(T) - getScoreLB(T);
289 }
290
291 unsigned getRegScore(int GprNo, InstCounterType T) const {
292 if (GprNo < NUM_ALL_VGPRS) {
293 return VgprScores[T][GprNo];
294 }
295 assert(T == SmemAccessCounter);
296 return SgprScores[GprNo - NUM_ALL_VGPRS];
297 }
298
299 bool merge(const WaitcntBrackets &Other);
300
301 RegInterval getRegInterval(const MachineInstr *MI,
302 const MachineRegisterInfo *MRI,
303 const SIRegisterInfo *TRI, unsigned OpNo) const;
304
305 bool counterOutOfOrder(InstCounterType T) const;
306 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
307 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
308 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
309 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
310 void applyWaitcnt(InstCounterType T, unsigned Count);
311 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
312 const MachineRegisterInfo *MRI, WaitEventType E,
313 MachineInstr &Inst);
314
315 unsigned hasPendingEvent() const { return PendingEvents; }
316 unsigned hasPendingEvent(WaitEventType E) const {
317 return PendingEvents & (1 << E);
318 }
319 unsigned hasPendingEvent(InstCounterType T) const {
320 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
321 assert((HasPending != 0) == (getScoreRange(T) != 0));
322 return HasPending;
323 }
324
325 bool hasMixedPendingEvents(InstCounterType T) const {
326 unsigned Events = hasPendingEvent(T);
327 // Return true if more than one bit is set in Events.
328 return Events & (Events - 1);
329 }
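// E.g. Events == 0b0110 (two event types pending): 0b0110 & 0b0101 == 0b0100,
// which is non-zero, so the events are mixed; Events == 0b0100 (a single
// type): 0b0100 & 0b0011 == 0, so they are not.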
330
331 bool hasPendingFlat() const {
332 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
333 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
334 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
335 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
336 }
337
338 void setPendingFlat() {
339 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
340 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
341 }
342
343 // Return true if there might be pending writes to the specified vgpr by VMEM
344 // instructions with types different from V.
345 bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
346 assert(GprNo < NUM_ALL_VGPRS);
347 return VgprVmemTypes[GprNo] & ~(1 << V);
348 }
349
350 void clearVgprVmemTypes(int GprNo) {
351 assert(GprNo < NUM_ALL_VGPRS);
352 VgprVmemTypes[GprNo] = 0;
353 }
354
355 void setStateOnFunctionEntryOrReturn() {
356 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
357 PendingEvents |= WaitEventMaskForInst[STORE_CNT];
358 }
359
360 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
361 return LDSDMAStores;
362 }
363
364 void print(raw_ostream &);
365 void dump() { print(dbgs()); }
366
367private:
368 struct MergeInfo {
369 unsigned OldLB;
370 unsigned OtherLB;
371 unsigned MyShift;
372 unsigned OtherShift;
373 };
374 static bool mergeScore(const MergeInfo &M, unsigned &Score,
375 unsigned OtherScore);
376
377 void setScoreLB(InstCounterType T, unsigned Val) {
378 assert(T < NUM_INST_CNTS);
379 ScoreLBs[T] = Val;
380 }
381
382 void setScoreUB(InstCounterType T, unsigned Val) {
383 assert(T < NUM_INST_CNTS);
384 ScoreUBs[T] = Val;
385
386 if (T != EXP_CNT)
387 return;
388
389 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
390 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
391 }
392
393 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
394 if (GprNo < NUM_ALL_VGPRS) {
395 VgprUB = std::max(VgprUB, GprNo);
396 VgprScores[T][GprNo] = Val;
397 } else {
398 assert(T == SmemAccessCounter);
399 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
400 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
401 }
402 }
403
404 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
405 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
406 unsigned OpNo, unsigned Val);
407
408 const GCNSubtarget *ST = nullptr;
409 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
410 HardwareLimits Limits = {};
411 RegisterEncoding Encoding = {};
412 const unsigned *WaitEventMaskForInst;
413 InstCounterType SmemAccessCounter;
414 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
415 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
416 unsigned PendingEvents = 0;
417 // Remember the last flat memory operation.
418 unsigned LastFlat[NUM_INST_CNTS] = {0};
419 // wait_cnt scores for every vgpr.
420 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
421 int VgprUB = -1;
422 int SgprUB = -1;
423 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
424 // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
425 // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
426 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
427 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
428 // write to each vgpr.
429 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
430 // Store representative LDS DMA operations. The only useful info here is
431 // alias info. One store is kept per unique AAInfo.
432 SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
433};
434
435// This abstracts the logic for generating and updating S_WAIT* instructions
436// away from the analysis that determines where they are needed. This was
437// done because the set of counters and instructions for waiting on them
438// underwent a major shift with gfx12, sufficiently so that having this
439// abstraction allows the main analysis logic to be simpler than it would
440// otherwise have had to become.
441class WaitcntGenerator {
442protected:
443 const GCNSubtarget *ST = nullptr;
444 const SIInstrInfo *TII = nullptr;
445 AMDGPU::IsaVersion IV;
446 InstCounterType MaxCounter;
447
448public:
449 WaitcntGenerator() {}
450 WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
451 : ST(ST), TII(ST->getInstrInfo()),
452 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
453
454 // Edits an existing sequence of wait count instructions according
455 // to an incoming Waitcnt value, which is itself updated to reflect
456 // any new wait count instructions which may need to be generated by
457 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
458 // were made.
459 //
460 // This editing will usually just update operands, but it may also
461 // delete instructions if the incoming Wait value indicates they are not
462 // needed. It may also remove existing instructions for which a wait
463 // is needed if it can be determined that it is better to generate new
464 // instructions later, as can happen on gfx12.
465 virtual bool
466 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
467 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
468 MachineBasicBlock::instr_iterator It) const = 0;
469
470 // Transform a soft waitcnt into a normal one.
471 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
472
473 // Generates new wait count instructions according to the value of
474 // Wait, returning true if any new instructions were created.
475 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
476 MachineBasicBlock::instr_iterator It,
477 AMDGPU::Waitcnt Wait) = 0;
478
479 // Returns an array of bit masks which can be used to map values in
480 // WaitEventType to corresponding counter values in InstCounterType.
481 virtual const unsigned *getWaitEventMask() const = 0;
482
483 // Returns a new waitcnt with all counters except VScnt set to 0. If
484 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
485 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
486
487 virtual ~WaitcntGenerator() = default;
488
489 // Create a mask value from the initializer list of wait event types.
490 static constexpr unsigned
491 eventMask(std::initializer_list<WaitEventType> Events) {
492 unsigned Mask = 0;
493 for (auto &E : Events)
494 Mask |= 1 << E;
495
496 return Mask;
497 }
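// For instance, eventMask({SMEM_ACCESS, SQ_MESSAGE}) evaluates to
// (1u << SMEM_ACCESS) | (1u << SQ_MESSAGE), i.e. 0x300 with the WaitEventType
// numbering above.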
498};
499
500class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
501public:
502 WaitcntGeneratorPreGFX12() {}
503 WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
504 : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
505
506 bool
507 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
508 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
509 MachineBasicBlock::instr_iterator It) const override;
510
511 bool createNewWaitcnt(MachineBasicBlock &Block,
512 MachineBasicBlock::instr_iterator It,
513 AMDGPU::Waitcnt Wait) override;
514
515 const unsigned *getWaitEventMask() const override {
516 assert(ST);
517
518 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
519 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
520 VMEM_BVH_READ_ACCESS}),
521 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
522 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
523 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
524 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
525 0,
526 0,
527 0};
528
529 return WaitEventMaskForInstPreGFX12;
530 }
531
532 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
533};
534
535class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
536public:
537 WaitcntGeneratorGFX12Plus() {}
538 WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
539 : WaitcntGenerator(ST, MaxCounter) {}
540
541 bool
542 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
543 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
544 MachineBasicBlock::instr_iterator It) const override;
545
546 bool createNewWaitcnt(MachineBasicBlock &Block,
547 MachineBasicBlock::instr_iterator It,
548 AMDGPU::Waitcnt Wait) override;
549
550 const unsigned *getWaitEventMask() const override {
551 assert(ST);
552
553 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
554 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
555 eventMask({LDS_ACCESS, GDS_ACCESS}),
556 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
557 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
558 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
559 eventMask({VMEM_SAMPLER_READ_ACCESS}),
560 eventMask({VMEM_BVH_READ_ACCESS}),
561 eventMask({SMEM_ACCESS, SQ_MESSAGE})};
562
563 return WaitEventMaskForInstGFX12Plus;
564 }
565
566 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
567};
568
569class SIInsertWaitcnts : public MachineFunctionPass {
570private:
571 const GCNSubtarget *ST = nullptr;
572 const SIInstrInfo *TII = nullptr;
573 const SIRegisterInfo *TRI = nullptr;
574 const MachineRegisterInfo *MRI = nullptr;
575
576 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
577 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
578 MachineLoopInfo *MLI;
579 MachinePostDominatorTree *PDT;
580 AliasAnalysis *AA = nullptr;
581
582 struct BlockInfo {
583 std::unique_ptr<WaitcntBrackets> Incoming;
584 bool Dirty = true;
585 };
586
587 InstCounterType SmemAccessCounter;
588
589 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
590
591 // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
592 // because of amdgpu-waitcnt-forcezero flag
593 bool ForceEmitZeroWaitcnts;
594 bool ForceEmitWaitcnt[NUM_INST_CNTS];
595
596 bool OptNone;
597
598 // In any given run of this pass, WCG will point to one of these two
599 // generator objects, which must have been re-initialised before use
600 // from a value made using a subtarget constructor.
601 WaitcntGeneratorPreGFX12 WCGPreGFX12;
602 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
603
604 WaitcntGenerator *WCG = nullptr;
605
606 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
607 // message.
608 DenseSet<MachineInstr *> ReleaseVGPRInsts;
609
610 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
611
612public:
613 static char ID;
614
615 SIInsertWaitcnts() : MachineFunctionPass(ID) {
616 (void)ForceExpCounter;
617 (void)ForceLgkmCounter;
618 (void)ForceVMCounter;
619 }
620
621 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
622 bool isPreheaderToFlush(MachineBasicBlock &MBB,
623 WaitcntBrackets &ScoreBrackets);
624 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
625 bool runOnMachineFunction(MachineFunction &MF) override;
626
627 StringRef getPassName() const override {
628 return "SI insert wait instructions";
629 }
630
631 void getAnalysisUsage(AnalysisUsage &AU) const override {
632 AU.setPreservesCFG();
633 AU.addRequired<MachineLoopInfo>();
634 AU.addRequired<MachinePostDominatorTree>();
635 AU.addUsedIfAvailable<AAResultsWrapperPass>();
636 AU.addPreserved<AAResultsWrapperPass>();
637 MachineFunctionPass::getAnalysisUsage(AU);
638 }
639
640 bool isForceEmitWaitcnt() const {
641 for (auto T : inst_counter_types())
642 if (ForceEmitWaitcnt[T])
643 return true;
644 return false;
645 }
646
647 void setForceEmitWaitcnt() {
648// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
649// For debug builds, get the debug counter info and adjust if need be
650#ifndef NDEBUG
651 if (DebugCounter::isCounterSet(ForceExpCounter) &&
652 DebugCounter::shouldExecute(ForceExpCounter)) {
653 ForceEmitWaitcnt[EXP_CNT] = true;
654 } else {
655 ForceEmitWaitcnt[EXP_CNT] = false;
656 }
657
658 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
659 DebugCounter::shouldExecute(ForceLgkmCounter)) {
660 ForceEmitWaitcnt[DS_CNT] = true;
661 ForceEmitWaitcnt[KM_CNT] = true;
662 } else {
663 ForceEmitWaitcnt[DS_CNT] = false;
664 ForceEmitWaitcnt[KM_CNT] = false;
665 }
666
667 if (DebugCounter::isCounterSet(ForceVMCounter) &&
668 DebugCounter::shouldExecute(ForceVMCounter)) {
669 ForceEmitWaitcnt[LOAD_CNT] = true;
670 ForceEmitWaitcnt[SAMPLE_CNT] = true;
671 ForceEmitWaitcnt[BVH_CNT] = true;
672 } else {
673 ForceEmitWaitcnt[LOAD_CNT] = false;
674 ForceEmitWaitcnt[SAMPLE_CNT] = false;
675 ForceEmitWaitcnt[BVH_CNT] = false;
676 }
677#endif // NDEBUG
678 }
679
680 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
681 // FLAT instruction.
682 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
683 // Maps VMEM access types to their corresponding WaitEventType.
684 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
685 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
686
687 assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
688 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
689 // these should use VM_CNT.
690 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
691 return VMEM_ACCESS;
692 if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
693 // FLAT and SCRATCH instructions may access scratch. Other VMEM
694 // instructions do not.
695 if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
696 return SCRATCH_WRITE_ACCESS;
697 return VMEM_WRITE_ACCESS;
698 }
699 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
700 return VMEM_READ_ACCESS;
701 return VmemReadMapping[getVmemType(Inst)];
702 }
703
704 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
705 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
706 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
707 bool generateWaitcntInstBefore(MachineInstr &MI,
708 WaitcntBrackets &ScoreBrackets,
709 MachineInstr *OldWaitcntInstr,
710 bool FlushVmCnt);
711 bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
712 WaitcntBrackets &ScoreBrackets,
713 MachineInstr *OldWaitcntInstr);
714 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
715 MachineBasicBlock::instr_iterator It,
716 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
717 MachineInstr *OldWaitcntInstr);
718 void updateEventWaitcntAfter(MachineInstr &Inst,
719 WaitcntBrackets *ScoreBrackets);
720 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
721 WaitcntBrackets &ScoreBrackets);
722};
723
724} // end anonymous namespace
725
726RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
727 const MachineRegisterInfo *MRI,
728 const SIRegisterInfo *TRI,
729 unsigned OpNo) const {
730 const MachineOperand &Op = MI->getOperand(OpNo);
731 if (!TRI->isInAllocatableClass(Op.getReg()))
732 return {-1, -1};
733
734 // A use via a PW operand does not need a waitcnt.
735 // A partial write is not a WAW.
736 assert(!Op.getSubReg() || !Op.isUndef());
737
738 RegInterval Result;
739
740 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
741 AMDGPU::HWEncoding::REG_IDX_MASK;
742
743 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
744 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
745 Result.first = Reg - Encoding.VGPR0;
746 if (TRI->isAGPR(*MRI, Op.getReg()))
747 Result.first += AGPR_OFFSET;
748 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
749 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
750 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
751 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
752 assert(Result.first >= NUM_ALL_VGPRS &&
753 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
754 }
755 // TODO: Handle TTMP
756 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
757 else
758 return {-1, -1};
759
760 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
761 unsigned Size = TRI->getRegSizeInBits(*RC);
762 Result.second = Result.first + ((Size + 16) / 32);
763
764 return Result;
765}
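// Worked example (illustrative): a use of a 64-bit VGPR pair starting at v4
// has Size == 64, giving the half-open interval [4, 4 + (64 + 16) / 32) =
// [4, 6), i.e. the score slots for v4 and v5; a single 32-bit SGPR s3 maps to
// [NUM_ALL_VGPRS + 3, NUM_ALL_VGPRS + 4).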
766
767void WaitcntBrackets::setExpScore(const MachineInstr *MI,
768 const SIInstrInfo *TII,
769 const SIRegisterInfo *TRI,
770 const MachineRegisterInfo *MRI, unsigned OpNo,
771 unsigned Val) {
772 RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
773 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
774 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
775 setRegScore(RegNo, EXP_CNT, Val);
776 }
777}
778
779void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
780 const SIRegisterInfo *TRI,
781 const MachineRegisterInfo *MRI,
782 WaitEventType E, MachineInstr &Inst) {
783 InstCounterType T = eventCounter(WaitEventMaskForInst, E);
784
785 unsigned UB = getScoreUB(T);
786 unsigned CurrScore = UB + 1;
787 if (CurrScore == 0)
788 report_fatal_error("InsertWaitcnt score wraparound");
789 // PendingEvents and ScoreUB need to be updated regardless of whether this
790 // event changes the score of a register or not.
791 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
792 PendingEvents |= 1 << E;
793 setScoreUB(T, CurrScore);
794
795 if (T == EXP_CNT) {
796 // Put score on the source vgprs. If this is a store, just use those
797 // specific register(s).
798 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
799 int AddrOpIdx =
800 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
801 // All GDS operations must protect their address register (same as
802 // export.)
803 if (AddrOpIdx != -1) {
804 setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
805 }
806
807 if (Inst.mayStore()) {
808 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
809 setExpScore(
810 &Inst, TII, TRI, MRI,
811 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
812 CurrScore);
813 }
814 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
815 setExpScore(&Inst, TII, TRI, MRI,
816 AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
817 AMDGPU::OpName::data1),
818 CurrScore);
819 }
820 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
821 Inst.getOpcode() != AMDGPU::DS_APPEND &&
822 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
823 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
824 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
825 const MachineOperand &Op = Inst.getOperand(I);
826 if (Op.isReg() && !Op.isDef() &&
827 TRI->isVectorRegister(*MRI, Op.getReg())) {
828 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
829 }
830 }
831 }
832 } else if (TII->isFLAT(Inst)) {
833 if (Inst.mayStore()) {
834 setExpScore(
835 &Inst, TII, TRI, MRI,
836 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
837 CurrScore);
838 } else if (SIInstrInfo::isAtomicRet(Inst)) {
839 setExpScore(
840 &Inst, TII, TRI, MRI,
841 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
842 CurrScore);
843 }
844 } else if (TII->isMIMG(Inst)) {
845 if (Inst.mayStore()) {
846 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
847 } else if (SIInstrInfo::isAtomicRet(Inst)) {
848 setExpScore(
849 &Inst, TII, TRI, MRI,
850 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
851 CurrScore);
852 }
853 } else if (TII->isMTBUF(Inst)) {
854 if (Inst.mayStore()) {
855 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
856 }
857 } else if (TII->isMUBUF(Inst)) {
858 if (Inst.mayStore()) {
859 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
860 } else if (SIInstrInfo::isAtomicRet(Inst)) {
861 setExpScore(
862 &Inst, TII, TRI, MRI,
863 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
864 CurrScore);
865 }
866 } else if (TII->isLDSDIR(Inst)) {
867 // LDSDIR instructions attach the score to the destination.
868 setExpScore(
869 &Inst, TII, TRI, MRI,
870 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
871 CurrScore);
872 } else {
873 if (TII->isEXP(Inst)) {
874 // For export the destination registers are really temps that
875 // can be used as the actual source after export patching, so
876 // we need to treat them like sources and set the EXP_CNT
877 // score.
878 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
879 MachineOperand &DefMO = Inst.getOperand(I);
880 if (DefMO.isReg() && DefMO.isDef() &&
881 TRI->isVGPR(*MRI, DefMO.getReg())) {
882 setRegScore(
883 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
884 EXP_CNT, CurrScore);
885 }
886 }
887 }
888 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
889 MachineOperand &MO = Inst.getOperand(I);
890 if (MO.isReg() && !MO.isDef() &&
891 TRI->isVectorRegister(*MRI, MO.getReg())) {
892 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
893 }
894 }
895 }
896#if 0 // TODO: check if this is handled by MUBUF code above.
897 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
898 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
899 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
900 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
901 unsigned OpNo;//TODO: find the OpNo for this operand;
902 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, OpNo);
903 for (int RegNo = Interval.first; RegNo < Interval.second;
904 ++RegNo) {
905 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
906 }
907#endif
908 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
909 // Match the score to the destination registers.
910 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
911 auto &Op = Inst.getOperand(I);
912 if (!Op.isReg() || !Op.isDef())
913 continue;
914 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
915 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
916 if (Interval.first >= NUM_ALL_VGPRS)
917 continue;
918 if (updateVMCntOnly(Inst)) {
919 // updateVMCntOnly should only leave us with VGPRs:
920 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
921 // defs. That's required for a sane index into `VgprVmemTypes` below.
922 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
923 VmemType V = getVmemType(Inst);
924 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
925 VgprVmemTypes[RegNo] |= 1 << V;
926 }
927 }
928 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
929 setRegScore(RegNo, T, CurrScore);
930 }
931 }
932 if (Inst.mayStore() &&
933 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
934 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
935 // written can be accessed. A load from LDS to VMEM does not need a wait.
936 unsigned Slot = 0;
937 for (const auto *MemOp : Inst.memoperands()) {
938 if (!MemOp->isStore() ||
939 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
940 continue;
941 // Comparing just AA info does not guarantee memoperands are equal
942 // in general, but this is so for LDS DMA in practice.
943 auto AAI = MemOp->getAAInfo();
944 // Alias scope information gives a way to definitively identify the
945 // original memory object; in practice it is produced by the module LDS
946 // lowering pass. If there is no scope available we will not be able
947 // to disambiguate LDS aliasing as after the module lowering all LDS
948 // is squashed into a single big object. Do not attempt to use one of
949 // the limited LDSDMAStores for something we will not be able to use
950 // anyway.
951 if (!AAI || !AAI.Scope)
952 break;
953 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
954 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
955 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
956 Slot = I + 1;
957 break;
958 }
959 }
960 }
961 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
962 break;
963 LDSDMAStores.push_back(&Inst);
964 Slot = LDSDMAStores.size();
965 break;
966 }
967 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
968 if (Slot)
969 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
970 }
971 }
972}
973
974void WaitcntBrackets::print(raw_ostream &OS) {
975 OS << '\n';
976 for (auto T : inst_counter_types(MaxCounter)) {
977 unsigned SR = getScoreRange(T);
978
979 switch (T) {
980 case LOAD_CNT:
981 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
982 << SR << "): ";
983 break;
984 case DS_CNT:
985 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
986 << SR << "): ";
987 break;
988 case EXP_CNT:
989 OS << " EXP_CNT(" << SR << "): ";
990 break;
991 case STORE_CNT:
992 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
993 << SR << "): ";
994 break;
995 case SAMPLE_CNT:
996 OS << " SAMPLE_CNT(" << SR << "): ";
997 break;
998 case BVH_CNT:
999 OS << " BVH_CNT(" << SR << "): ";
1000 break;
1001 case KM_CNT:
1002 OS << " KM_CNT(" << SR << "): ";
1003 break;
1004 default:
1005 OS << " UNKNOWN(" << SR << "): ";
1006 break;
1007 }
1008
1009 if (SR != 0) {
1010 // Print vgpr scores.
1011 unsigned LB = getScoreLB(T);
1012
1013 for (int J = 0; J <= VgprUB; J++) {
1014 unsigned RegScore = getRegScore(J, T);
1015 if (RegScore <= LB)
1016 continue;
1017 unsigned RelScore = RegScore - LB - 1;
1018 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
1019 OS << RelScore << ":v" << J << " ";
1020 } else {
1021 OS << RelScore << ":ds ";
1022 }
1023 }
1024 // Also need to print sgpr scores for lgkm_cnt.
1025 if (T == SmemAccessCounter) {
1026 for (int J = 0; J <= SgprUB; J++) {
1027 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
1028 if (RegScore <= LB)
1029 continue;
1030 unsigned RelScore = RegScore - LB - 1;
1031 OS << RelScore << ":s" << J << " ";
1032 }
1033 }
1034 }
1035 OS << '\n';
1036 }
1037 OS << '\n';
1038}
1039
1040/// Simplify the waitcnt, in the sense of removing redundant counts, and return
1041/// whether a waitcnt instruction is needed at all.
1042void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1043 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1044 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1045 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1046 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1047 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1048 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1049 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1050}
1051
1052void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1053 unsigned &Count) const {
1054 // The number of outstanding events for this type, T, can be calculated
1055 // as (UB - LB). If the current Count is greater than or equal to the number
1056 // of outstanding events, then the wait for this counter is redundant.
1057 if (Count >= getScoreRange(T))
1058 Count = ~0u;
1059}
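// For example, if getScoreRange(LOAD_CNT) == 3 (three loads still
// outstanding), a requested wait of loadcnt(3) or more is already satisfied
// and is dropped (Count becomes ~0u), while loadcnt(2) is kept.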
1060
1061void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
1062 AMDGPU::Waitcnt &Wait) const {
1063 unsigned ScoreToWait = getRegScore(RegNo, T);
1064
1065 // If the score of src_operand falls within the bracket, we need an
1066 // s_waitcnt instruction.
1067 const unsigned LB = getScoreLB(T);
1068 const unsigned UB = getScoreUB(T);
1069 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1070 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1071 !ST->hasFlatLgkmVMemCountInOrder()) {
1072 // If there is a pending FLAT operation, and this is a VMem or LGKM
1073 // waitcnt and the target can report early completion, then we need
1074 // to force a waitcnt 0.
1075 addWait(Wait, T, 0);
1076 } else if (counterOutOfOrder(T)) {
1077 // The counter can get decremented out-of-order when there are
1078 // multiple event types in the bracket, so emit an s_wait with a
1079 // conservative value of 0 for this counter.
1080 addWait(Wait, T, 0);
1081 } else {
1082 // If a counter has been maxed out avoid overflow by waiting for
1083 // MAX(CounterType) - 1 instead.
1084 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
1085 addWait(Wait, T, NeededWait);
1086 }
1087 }
1088}
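// Worked example (hypothetical scores): with LB == 10, UB == 15 and
// ScoreToWait == 12, three newer operations of this type were issued after
// the one that wrote the register, so waiting until the counter drops to
// UB - ScoreToWait == 3 is sufficient, assuming in-order completion for this
// counter.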
1089
1090void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1091 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1092 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1093 applyWaitcnt(DS_CNT, Wait.DsCnt);
1094 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1095 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1096 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1097 applyWaitcnt(KM_CNT, Wait.KmCnt);
1098}
1099
1100void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1101 const unsigned UB = getScoreUB(T);
1102 if (Count >= UB)
1103 return;
1104 if (Count != 0) {
1105 if (counterOutOfOrder(T))
1106 return;
1107 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1108 } else {
1109 setScoreLB(T, UB);
1110 PendingEvents &= ~WaitEventMaskForInst[T];
1111 }
1112}
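// For example, with UB == 15, applyWaitcnt(LOAD_CNT, 2) records that at most
// two loads remain outstanding, raising the lower bound to UB - 2 == 13
// (unless the counter may decrement out of order); a count of 0 closes the
// bracket entirely and clears the pending events for this counter.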
1113
1114// Where there are multiple types of event in the bracket of a counter,
1115// the decrement may go out of order.
1116bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1117 // Scalar memory read always can go out of order.
1118 if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
1119 return true;
1120 return hasMixedPendingEvents(T);
1121}
1122
1123INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1124 false)
1127INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1128 false)
1129
1130char SIInsertWaitcnts::ID = 0;
1131
1132char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
1133
1134FunctionPass *llvm::createSIInsertWaitcntsPass() {
1135 return new SIInsertWaitcnts();
1136}
1137
1138static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
1139 unsigned NewEnc) {
1140 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1141 assert(OpIdx >= 0);
1142
1143 MachineOperand &MO = MI.getOperand(OpIdx);
1144
1145 if (NewEnc == MO.getImm())
1146 return false;
1147
1148 MO.setImm(NewEnc);
1149 return true;
1150}
1151
1152/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1153/// and if so, which counter it is waiting on.
1154static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1155 switch (Opcode) {
1156 case AMDGPU::S_WAIT_LOADCNT:
1157 return LOAD_CNT;
1158 case AMDGPU::S_WAIT_EXPCNT:
1159 return EXP_CNT;
1160 case AMDGPU::S_WAIT_STORECNT:
1161 return STORE_CNT;
1162 case AMDGPU::S_WAIT_SAMPLECNT:
1163 return SAMPLE_CNT;
1164 case AMDGPU::S_WAIT_BVHCNT:
1165 return BVH_CNT;
1166 case AMDGPU::S_WAIT_DSCNT:
1167 return DS_CNT;
1168 case AMDGPU::S_WAIT_KMCNT:
1169 return KM_CNT;
1170 default:
1171 return {};
1172 }
1173}
1174
1175bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1176 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1177 if (Opcode == Waitcnt->getOpcode())
1178 return false;
1179
1180 Waitcnt->setDesc(TII->get(Opcode));
1181 return true;
1182}
1183
1184/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1185/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1186/// from \p Wait that were added by previous passes. Currently this pass
1187/// conservatively assumes that these preexisting waits are required for
1188/// correctness.
1189bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1190 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1191 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1192 assert(ST);
1193 assert(isNormalMode(MaxCounter));
1194
1195 bool Modified = false;
1196 MachineInstr *WaitcntInstr = nullptr;
1197 MachineInstr *WaitcntVsCntInstr = nullptr;
1198
1199 for (auto &II :
1200 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1201 if (II.isMetaInstruction())
1202 continue;
1203
1204 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1205 bool IsSoft = Opcode != II.getOpcode();
1206
1207 // Update required wait count. If this is a soft waitcnt (= it was added
1208 // by an earlier pass), it may be entirely removed.
1209 if (Opcode == AMDGPU::S_WAITCNT) {
1210 unsigned IEnc = II.getOperand(0).getImm();
1211 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1212 if (IsSoft)
1213 ScoreBrackets.simplifyWaitcnt(OldWait);
1214 Wait = Wait.combined(OldWait);
1215
1216 // Merge consecutive waitcnt of the same type by erasing multiples.
1217 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
1218 II.eraseFromParent();
1219 Modified = true;
1220 } else
1221 WaitcntInstr = &II;
1222 } else {
1223 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1224 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1225
1226 unsigned OldVSCnt =
1227 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1228 if (IsSoft)
1229 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1230 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1231
1232 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
1233 II.eraseFromParent();
1234 Modified = true;
1235 } else
1236 WaitcntVsCntInstr = &II;
1237 }
1238 }
1239
1240 if (WaitcntInstr) {
1241 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1242 AMDGPU::encodeWaitcnt(IV, Wait));
1243 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1244
1245 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1246 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1247 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1248 Wait.LoadCnt = ~0u;
1249 Wait.ExpCnt = ~0u;
1250 Wait.DsCnt = ~0u;
1251
1252 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1253 ? dbgs()
1254 << "applyPreexistingWaitcnt\n"
1255 << "New Instr at block end: " << *WaitcntInstr << '\n'
1256 : dbgs() << "applyPreexistingWaitcnt\n"
1257 << "Old Instr: " << *It
1258 << "New Instr: " << *WaitcntInstr << '\n');
1259 }
1260
1261 if (WaitcntVsCntInstr) {
1262 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1263 AMDGPU::OpName::simm16, Wait.StoreCnt);
1264 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1265
1266 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1267 Wait.StoreCnt = ~0u;
1268
1269 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1270 ? dbgs() << "applyPreexistingWaitcnt\n"
1271 << "New Instr at block end: " << *WaitcntVsCntInstr
1272 << '\n'
1273 : dbgs() << "applyPreexistingWaitcnt\n"
1274 << "Old Instr: " << *It
1275 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1276 }
1277
1278 return Modified;
1279}
1280
1281/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1282/// required counters in \p Wait
1283bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1284 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1285 AMDGPU::Waitcnt Wait) {
1286 assert(ST);
1287 assert(isNormalMode(MaxCounter));
1288
1289 bool Modified = false;
1290 const DebugLoc &DL = Block.findDebugLoc(It);
1291
1292 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1293 // single instruction while VScnt has its own instruction.
1294 if (Wait.hasWaitExceptStoreCnt()) {
1295 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1296 [[maybe_unused]] auto SWaitInst =
1297 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1298 Modified = true;
1299
1300 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1301 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1302 dbgs() << "New Instr: " << *SWaitInst << '\n');
1303 }
1304
1305 if (Wait.hasWaitStoreCnt()) {
1306 assert(ST->hasVscnt());
1307
1308 [[maybe_unused]] auto SWaitInst =
1309 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1310 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1311 .addImm(Wait.StoreCnt);
1312 Modified = true;
1313
1314 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1315 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1316 dbgs() << "New Instr: " << *SWaitInst << '\n');
1317 }
1318
1319 return Modified;
1320}
1321
1322AMDGPU::Waitcnt
1323WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1324 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1325}
1326
1327AMDGPU::Waitcnt
1328WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1329 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
1330}
1331
1332/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1333/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1334/// were added by previous passes. Currently this pass conservatively
1335/// assumes that these preexisting waits are required for correctness.
1336bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1337 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1338 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1339 assert(ST);
1340 assert(!isNormalMode(MaxCounter));
1341
1342 bool Modified = false;
1343 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1344 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1345 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1346
1347 for (auto &II :
1348 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1349 if (II.isMetaInstruction())
1350 continue;
1351
1352 MachineInstr **UpdatableInstr;
1353
1354 // Update required wait count. If this is a soft waitcnt (= it was added
1355 // by an earlier pass), it may be entirely removed.
1356
1357 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1358 bool IsSoft = Opcode != II.getOpcode();
1359
1360 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1361 unsigned OldEnc =
1362 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1363 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1364 if (IsSoft)
1365 ScoreBrackets.simplifyWaitcnt(OldWait);
1366 Wait = Wait.combined(OldWait);
1367 UpdatableInstr = &CombinedLoadDsCntInstr;
1368 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1369 unsigned OldEnc =
1370 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1371 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1372 if (IsSoft)
1373 ScoreBrackets.simplifyWaitcnt(OldWait);
1374 Wait = Wait.combined(OldWait);
1375 UpdatableInstr = &CombinedStoreDsCntInstr;
1376 } else {
1377 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1378 assert(CT.has_value());
1379 unsigned OldCnt =
1380 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1381 if (IsSoft)
1382 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1383 addWait(Wait, CT.value(), OldCnt);
1384 UpdatableInstr = &WaitInstrs[CT.value()];
1385 }
1386
1387 // Merge consecutive waitcnt of the same type by erasing multiples.
1388 if (!*UpdatableInstr) {
1389 *UpdatableInstr = &II;
1390 } else {
1391 II.eraseFromParent();
1392 Modified = true;
1393 }
1394 }
1395
1396 if (CombinedLoadDsCntInstr) {
1397 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1398 // to be waited for. Otherwise, let the instruction be deleted so
1399 // the appropriate single counter wait instruction can be inserted
1400 // instead, when new S_WAIT_*CNT instructions are inserted by
1401 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1402 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1403 // the loop below that deals with single counter instructions.
1404 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1405 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1406 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1407 AMDGPU::OpName::simm16, NewEnc);
1408 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1409 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1410 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1411 Wait.LoadCnt = ~0u;
1412 Wait.DsCnt = ~0u;
1413
1414 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1415 ? dbgs() << "applyPreexistingWaitcnt\n"
1416 << "New Instr at block end: "
1417 << *CombinedLoadDsCntInstr << '\n'
1418 : dbgs() << "applyPreexistingWaitcnt\n"
1419 << "Old Instr: " << *It << "New Instr: "
1420 << *CombinedLoadDsCntInstr << '\n');
1421 } else {
1422 CombinedLoadDsCntInstr->eraseFromParent();
1423 Modified = true;
1424 }
1425 }
1426
1427 if (CombinedStoreDsCntInstr) {
1428 // Similarly for S_WAIT_STORECNT_DSCNT.
1429 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1430 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1431 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1432 AMDGPU::OpName::simm16, NewEnc);
1433 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1434 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1435 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1436 Wait.StoreCnt = ~0u;
1437 Wait.DsCnt = ~0u;
1438
1439 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1440 ? dbgs() << "applyPreexistingWaitcnt\n"
1441 << "New Instr at block end: "
1442 << *CombinedStoreDsCntInstr << '\n'
1443 : dbgs() << "applyPreexistingWaitcnt\n"
1444 << "Old Instr: " << *It << "New Instr: "
1445 << *CombinedStoreDsCntInstr << '\n');
1446 } else {
1447 CombinedStoreDsCntInstr->eraseFromParent();
1448 Modified = true;
1449 }
1450 }
1451
1452 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1453 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1454 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1455 // instructions so that createNewWaitcnt() will create new combined
1456 // instructions to replace them.
1457
1458 if (Wait.DsCnt != ~0u) {
1459 // This is a vector of addresses in WaitInstrs pointing to instructions
1460 // that should be removed if they are present.
1461 SmallVector<MachineInstr **, 4> WaitsToErase;
1462
1463 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1464 // both) need to be waited for, ensure that there are no existing
1465 // individual wait count instructions for these.
1466
1467 if (Wait.LoadCnt != ~0u) {
1468 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1469 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1470 } else if (Wait.StoreCnt != ~0u) {
1471 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1472 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1473 }
1474
1475 for (MachineInstr **WI : WaitsToErase) {
1476 if (!*WI)
1477 continue;
1478
1479 (*WI)->eraseFromParent();
1480 *WI = nullptr;
1481 Modified = true;
1482 }
1483 }
1484
1485 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1486 if (!WaitInstrs[CT])
1487 continue;
1488
1489 unsigned NewCnt = getWait(Wait, CT);
1490 if (NewCnt != ~0u) {
1491 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1492 AMDGPU::OpName::simm16, NewCnt);
1493 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1494
1495 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1496 setNoWait(Wait, CT);
1497
1498 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1499 ? dbgs() << "applyPreexistingWaitcnt\n"
1500 << "New Instr at block end: " << *WaitInstrs[CT]
1501 << '\n'
1502 : dbgs() << "applyPreexistingWaitcnt\n"
1503 << "Old Instr: " << *It
1504 << "New Instr: " << *WaitInstrs[CT] << '\n');
1505 } else {
1506 WaitInstrs[CT]->eraseFromParent();
1507 Modified = true;
1508 }
1509 }
1510
1511 return Modified;
1512}
1513
1514/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1515bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1516 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1517 AMDGPU::Waitcnt Wait) {
1518 assert(ST);
1519 assert(!isNormalMode(MaxCounter));
1520
1521 bool Modified = false;
1522 const DebugLoc &DL = Block.findDebugLoc(It);
1523
1524 // Check for opportunities to use combined wait instructions.
1525 if (Wait.DsCnt != ~0u) {
1526 MachineInstr *SWaitInst = nullptr;
1527
1528 if (Wait.LoadCnt != ~0u) {
1529 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1530
1531 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1532 .addImm(Enc);
1533
1534 Wait.LoadCnt = ~0u;
1535 Wait.DsCnt = ~0u;
1536 } else if (Wait.StoreCnt != ~0u) {
1537 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1538
1539 SWaitInst =
1540 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1541 .addImm(Enc);
1542
1543 Wait.StoreCnt = ~0u;
1544 Wait.DsCnt = ~0u;
1545 }
1546
1547 if (SWaitInst) {
1548 Modified = true;
1549
1550 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1551 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1552 dbgs() << "New Instr: " << *SWaitInst << '\n');
1553 }
1554 }
1555
1556 // Generate an instruction for any remaining counter that needs
1557 // waiting for.
1558
1559 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1560 unsigned Count = getWait(Wait, CT);
1561 if (Count == ~0u)
1562 continue;
1563
1564 [[maybe_unused]] auto SWaitInst =
1565 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1566 .addImm(Count);
1567
1568 Modified = true;
1569
1570 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1571 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1572 dbgs() << "New Instr: " << *SWaitInst << '\n');
1573 }
1574
1575 return Modified;
1576}
1577
1578static bool readsVCCZ(const MachineInstr &MI) {
1579 unsigned Opc = MI.getOpcode();
1580 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1581 !MI.getOperand(1).isUndef();
1582}
1583
1584/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1585static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1586 // Currently all conventions wait, but this may not always be the case.
1587 //
1588 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1589 // sense to omit the wait and do it in the caller.
1590 return true;
1591}
1592
1593/// \returns true if the callee is expected to wait for any outstanding waits
1594/// before returning.
1595static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
1596 return true;
1597}
1598
1599/// Generate s_waitcnt instruction to be placed before cur_Inst.
1600/// Instructions of a given type are returned in order,
1601/// but instructions of different types can complete out of order.
1602/// We rely on this in-order completion
1603/// and simply assign a score to the memory access instructions.
1604/// We keep track of the active "score bracket" to determine
1605/// if an access of a memory read requires an s_waitcnt
1606/// and if so what the value of each counter is.
1607/// The "score bracket" is bound by the lower bound and upper bound
1608/// scores (*_score_LB and *_score_ub respectively).
1609/// If FlushVmCnt is true, we want to generate an s_waitcnt to flush the
1610/// vmcnt counter here.
1611bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1612 WaitcntBrackets &ScoreBrackets,
1613 MachineInstr *OldWaitcntInstr,
1614 bool FlushVmCnt) {
1615 setForceEmitWaitcnt();
1616
1617 if (MI.isMetaInstruction())
1618 return false;
1619
1620 AMDGPU::Waitcnt Wait;
1621
1622 // FIXME: This should have already been handled by the memory legalizer.
1623 // Removing this currently doesn't affect any lit tests, but we need to
1624 // verify that nothing was relying on this. The number of buffer invalidates
1625 // being handled here should not be expanded.
1626 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1627 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1628 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1629 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1630 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1631 Wait.LoadCnt = 0;
1632 }
1633
1634 // All waits must be resolved at call return.
1635 // NOTE: this could be improved with knowledge of all call sites or
1636 // with knowledge of the called routines.
1637 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1638 MI.getOpcode() == AMDGPU::SI_RETURN ||
1639 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1640 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1641 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1642 }
1643 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1644 // stores. In this case it can be useful to send a message to explicitly
1645 // release all VGPRs before the stores have completed, but it is only safe to
1646 // do this if:
1647 // * there are no outstanding scratch stores
1648 // * we are not in Dynamic VGPR mode
1649 else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1650 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1651 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
1652 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1653 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
1654 ReleaseVGPRInsts.insert(&MI);
1655 }
1656 // Resolve vm waits before gs-done.
1657 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1658 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1659 ST->hasLegacyGeometry() &&
1660 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1661                 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1662    Wait.LoadCnt = 0;
1663 }
1664#if 0 // TODO: enable the following blocks of logic once we have fence support.
1665 else if (MI.getOpcode() == SC_FENCE) {
1666 const unsigned int group_size =
1667 context->shader_info->GetMaxThreadGroupSize();
1668 // group_size == 0 means thread group size is unknown at compile time
1669 const bool group_is_multi_wave =
1670 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
1671 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
1672
1673 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
1674 SCRegType src_type = Inst->GetSrcType(i);
1675 switch (src_type) {
1676 case SCMEM_LDS:
1677 if (group_is_multi_wave ||
1678 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
1679 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
1680 ScoreBrackets->getScoreUB(DS_CNT));
1681 // LDS may have to wait for VMcnt after buffer load to LDS
1682 if (target_info->HasBufferLoadToLDS()) {
1683 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
1684 ScoreBrackets->getScoreUB(LOAD_CNT));
1685 }
1686 }
1687 break;
1688
1689 case SCMEM_GDS:
1690 if (group_is_multi_wave || fence_is_global) {
1691 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1692 ScoreBrackets->getScoreUB(EXP_CNT));
1693 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
1694 ScoreBrackets->getScoreUB(DS_CNT));
1695 }
1696 break;
1697
1698 case SCMEM_UAV:
1699 case SCMEM_TFBUF:
1700 case SCMEM_RING:
1701 case SCMEM_SCATTER:
1702 if (group_is_multi_wave || fence_is_global) {
1703 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1704 ScoreBrackets->getScoreUB(EXP_CNT));
1705 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
1706 ScoreBrackets->getScoreUB(LOAD_CNT));
1707 }
1708 break;
1709
1710 case SCMEM_SCRATCH:
1711 default:
1712 break;
1713 }
1714 }
1715 }
1716#endif
1717
1718 // Export & GDS instructions do not read the EXEC mask until after the export
1719 // is granted (which can occur well after the instruction is issued).
1720 // The shader program must flush all EXP operations on the export-count
1721 // before overwriting the EXEC mask.
1722 else {
1723 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1724 // Export and GDS are tracked individually, either may trigger a waitcnt
1725 // for EXEC.
1726 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1727 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1728 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1729 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1730 Wait.ExpCnt = 0;
1731 }
1732 }
1733
1734 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1735 // The function is going to insert a wait on everything in its prolog.
1736 // This still needs to be careful if the call target is a load (e.g. a GOT
1737 // load). We also need to check WAW dependency with saved PC.
1738      Wait.LoadCnt = 0;
1739
1740 int CallAddrOpIdx =
1741 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1742
1743 if (MI.getOperand(CallAddrOpIdx).isReg()) {
1744 RegInterval CallAddrOpInterval =
1745 ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);
1746
1747 for (int RegNo = CallAddrOpInterval.first;
1748 RegNo < CallAddrOpInterval.second; ++RegNo)
1749 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1750
1751 int RtnAddrOpIdx =
1752 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1753 if (RtnAddrOpIdx != -1) {
1754 RegInterval RtnAddrOpInterval =
1755 ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);
1756
1757 for (int RegNo = RtnAddrOpInterval.first;
1758 RegNo < RtnAddrOpInterval.second; ++RegNo)
1759 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1760 }
1761 }
1762 } else {
1763 // FIXME: Should not be relying on memoperands.
1764 // Look at the source operands of every instruction to see if
1765 // any of them results from a previous memory operation that affects
1766 // its current usage. If so, an s_waitcnt instruction needs to be
1767 // emitted.
1768 // If the source operand was defined by a load, add the s_waitcnt
1769 // instruction.
1770 //
1771 // Two cases are handled for destination operands:
1772 // 1) If the destination operand was defined by a load, add the s_waitcnt
1773 // instruction to guarantee the right WAW order.
1774      // 2) If the destination operand was used by a recent export/store
1775      // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
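      // For example (illustration): a VGPR last written by a VMEM load and
      // about to be rewritten needs a load-count wait (WAW), while a VGPR
      // whose value may still be read by an in-flight export or store needs
      // an exp-count wait before it can be overwritten (WAR).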
1776
1777 for (const MachineMemOperand *Memop : MI.memoperands()) {
1778 const Value *Ptr = Memop->getValue();
1779 if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1780 addWait(Wait, SmemAccessCounter, 0);
1781 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1782 SLoadAddresses.erase(Ptr);
1783 }
1784 unsigned AS = Memop->getAddrSpace();
1785        if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1786          continue;
1787 // No need to wait before load from VMEM to LDS.
1788 if (TII->mayWriteLDSThroughDMA(MI))
1789 continue;
1790
1791 // LOAD_CNT is only relevant to vgpr or LDS.
1792 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1793 bool FoundAliasingStore = false;
1794 // Only objects with alias scope info were added to LDSDMAScopes array.
1795        // In the absence of the scope info we will not be able to disambiguate
1796 // aliasing here. There is no need to try searching for a corresponding
1797 // store slot. This is conservatively correct because in that case we
1798 // will produce a wait using the first (general) LDS DMA wait slot which
1799 // will wait on all of them anyway.
1800 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1801 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1802 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1803 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
1804 FoundAliasingStore = true;
1805 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1806 }
1807 }
1808 }
1809 if (!FoundAliasingStore)
1810 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1811 if (Memop->isStore()) {
1812 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1813 }
1814 }
1815
1816 // Loop over use and def operands.
1817 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1818 MachineOperand &Op = MI.getOperand(I);
1819 if (!Op.isReg())
1820 continue;
1821
1822 // If the instruction does not read tied source, skip the operand.
1823 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1824 continue;
1825
1826 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);
1827
1828 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1829 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1830 if (IsVGPR) {
1831 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1832 // previous write and this write are the same type of VMEM
1833 // instruction, in which case they're guaranteed to write their
1834 // results in order anyway.
1835 if (Op.isUse() || !updateVMCntOnly(MI) ||
1836 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1837 getVmemType(MI))) {
1838 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1839 ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
1840 ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
1841 ScoreBrackets.clearVgprVmemTypes(RegNo);
1842 }
1843 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1844 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1845 }
1846 ScoreBrackets.determineWait(DS_CNT, RegNo, Wait);
1847 } else {
1848 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1849 }
1850 }
1851 }
1852 }
1853 }
1854
1855 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1856 // not, we need to ensure the subtarget is capable of backing off barrier
1857 // instructions in case there are any outstanding memory operations that may
1858 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1859 if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1860 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1861 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
1862 }
1863
1864 // TODO: Remove this work-around, enable the assert for Bug 457939
1865 // after fixing the scheduler. Also, the Shader Compiler code is
1866 // independent of target.
1867 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1868 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1869 Wait.DsCnt = 0;
1870 }
1871 }
1872
1873 // Verify that the wait is actually needed.
1874 ScoreBrackets.simplifyWaitcnt(Wait);
1875
1876 if (ForceEmitZeroWaitcnts)
1877 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
1878
1879 if (ForceEmitWaitcnt[LOAD_CNT])
1880 Wait.LoadCnt = 0;
1881 if (ForceEmitWaitcnt[EXP_CNT])
1882 Wait.ExpCnt = 0;
1883 if (ForceEmitWaitcnt[DS_CNT])
1884 Wait.DsCnt = 0;
1885 if (ForceEmitWaitcnt[SAMPLE_CNT])
1886 Wait.SampleCnt = 0;
1887 if (ForceEmitWaitcnt[BVH_CNT])
1888 Wait.BvhCnt = 0;
1889 if (ForceEmitWaitcnt[KM_CNT])
1890 Wait.KmCnt = 0;
1891
1892 if (FlushVmCnt) {
1893 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
1894 Wait.LoadCnt = 0;
1895 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
1896 Wait.SampleCnt = 0;
1897 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
1898 Wait.BvhCnt = 0;
1899 }
1900
1901 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1902 OldWaitcntInstr);
1903}
1904
1905// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
1906// end of the given block if needed.
1907bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
1908 WaitcntBrackets &ScoreBrackets,
1909 MachineInstr *OldWaitcntInstr) {
1910  AMDGPU::Waitcnt Wait;
1911
1912 unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
1913 unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
1914 unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
1915
1916 if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
1917 return false;
1918
1919 if (LoadCntPending != 0)
1920 Wait.LoadCnt = 0;
1921 if (SampleCntPending != 0)
1922 Wait.SampleCnt = 0;
1923 if (BvhCntPending != 0)
1924 Wait.BvhCnt = 0;
1925
1926 return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
1927 OldWaitcntInstr);
1928}
1929
1930bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1931                                       MachineBasicBlock::instr_iterator It,
1932                                       MachineBasicBlock &Block,
1933                                       WaitcntBrackets &ScoreBrackets,
1934 MachineInstr *OldWaitcntInstr) {
1935 bool Modified = false;
1936
1937 if (OldWaitcntInstr)
1938 // Try to merge the required wait with preexisting waitcnt instructions.
1939 // Also erase redundant waitcnt.
1940 Modified =
1941 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1942
1943 // Any counts that could have been applied to any existing waitcnt
1944 // instructions will have been done so, now deal with any remaining.
1945 ScoreBrackets.applyWaitcnt(Wait);
1946
1947 // ExpCnt can be merged into VINTERP.
1948 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1949      SIInstrInfo::isVINTERP(*It)) {
1950    MachineOperand *WaitExp =
1951 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1952 if (Wait.ExpCnt < WaitExp->getImm()) {
1953 WaitExp->setImm(Wait.ExpCnt);
1954 Modified = true;
1955 }
1956 Wait.ExpCnt = ~0u;
1957
1958 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
1959 << "Update Instr: " << *It);
1960 }
1961
1962 if (WCG->createNewWaitcnt(Block, It, Wait))
1963 Modified = true;
1964
1965 return Modified;
1966}
1967
1968// This is a flat memory operation. Check to see if it has memory tokens other
1969// than LDS. Other address spaces supported by flat memory operations involve
1970// global memory.
1971bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1972 assert(TII->isFLAT(MI));
1973
1974 // All flat instructions use the VMEM counter.
1975 assert(TII->usesVM_CNT(MI));
1976
1977 // If there are no memory operands then conservatively assume the flat
1978 // operation may access VMEM.
1979 if (MI.memoperands_empty())
1980 return true;
1981
1982 // See if any memory operand specifies an address space that involves VMEM.
1983  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
1984 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1985 // (GDS) address space is not supported by flat operations. Therefore, simply
1986 // return true unless only the LDS address space is found.
1987 for (const MachineMemOperand *Memop : MI.memoperands()) {
1988 unsigned AS = Memop->getAddrSpace();
1989    assert(AS != AMDGPUAS::REGION_ADDRESS);
1990    if (AS != AMDGPUAS::LOCAL_ADDRESS)
1991 return true;
1992 }
1993
1994 return false;
1995}
1996
1997// This is a flat memory operation. Check to see if it has memory tokens for
1998// either LDS or FLAT.
1999bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
2000 assert(TII->isFLAT(MI));
2001
2002  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
2003 if (!TII->usesLGKM_CNT(MI))
2004 return false;
2005
2006 // If in tgsplit mode then there can be no use of LDS.
2007 if (ST->isTgSplitEnabled())
2008 return false;
2009
2010 // If there are no memory operands then conservatively assume the flat
2011 // operation may access LDS.
2012 if (MI.memoperands_empty())
2013 return true;
2014
2015 // See if any memory operand specifies an address space that involves LDS.
2016 for (const MachineMemOperand *Memop : MI.memoperands()) {
2017 unsigned AS = Memop->getAddrSpace();
2018    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
2019      return true;
2020 }
2021
2022 return false;
2023}
2024
2025// This is a flat memory operation. Check to see if it has memory tokens for
2026// either scratch or FLAT.
2027bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
2028 const MachineInstr &MI) const {
2029 assert(TII->isFLAT(MI));
2030
2031 // SCRATCH instructions always access scratch.
2032 if (TII->isFLATScratch(MI))
2033 return true;
2034
2035 // GLOBAL instructions never access scratch.
2036 if (TII->isFLATGlobal(MI))
2037 return false;
2038
2039 // If there are no memory operands then conservatively assume the flat
2040 // operation may access scratch.
2041 if (MI.memoperands_empty())
2042 return true;
2043
2044 // See if any memory operand specifies an address space that involves scratch.
2045 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
2046 unsigned AS = Memop->getAddrSpace();
2047 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
2048 });
2049}
2050
2051static bool isCacheInvOrWBInst(MachineInstr &Inst) {
2052  auto Opc = Inst.getOpcode();
2053 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
2054 Opc == AMDGPU::GLOBAL_WBINV;
2055}
2056
2057void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2058 WaitcntBrackets *ScoreBrackets) {
2059 // Now look at the instruction opcode. If it is a memory access
2060 // instruction, update the upper-bound of the appropriate counter's
2061 // bracket and the destination operand scores.
2062 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2063
2064 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2065 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2066 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2067 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
2068 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
2069 } else {
2070 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2071 }
2072 } else if (TII->isFLAT(Inst)) {
2073 // TODO: Track this properly.
2074 if (isCacheInvOrWBInst(Inst))
2075 return;
2076
2077 assert(Inst.mayLoadOrStore());
2078
2079 int FlatASCount = 0;
2080
2081 if (mayAccessVMEMThroughFlat(Inst)) {
2082 ++FlatASCount;
2083 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2084 Inst);
2085 }
2086
2087 if (mayAccessLDSThroughFlat(Inst)) {
2088 ++FlatASCount;
2089 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2090 }
2091
2092 // A Flat memory operation must access at least one address space.
2093 assert(FlatASCount);
2094
2095    // This is a flat memory operation that accesses both VMEM and LDS, so note it
2096 // - it will require that both the VM and LGKM be flushed to zero if it is
2097 // pending when a VM or LGKM dependency occurs.
2098 if (FlatASCount > 1)
2099 ScoreBrackets->setPendingFlat();
2100 } else if (SIInstrInfo::isVMEM(Inst) &&
2101             !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2102    ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2103 Inst);
2104
2105 if (ST->vmemWriteNeedsExpWaitcnt() &&
2106 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2107 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
2108 }
2109 } else if (TII->isSMRD(Inst)) {
2110 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2111 } else if (Inst.isCall()) {
2112 if (callWaitsOnFunctionReturn(Inst)) {
2113 // Act as a wait on everything
2114 ScoreBrackets->applyWaitcnt(
2115 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2116 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2117 } else {
2118      // May need to wait for anything.
2119 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2120 }
2121 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2122 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
2123 } else if (TII->isVINTERP(Inst)) {
2124 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2125 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2126 } else if (SIInstrInfo::isEXP(Inst)) {
2127 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2128    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2129      ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
2130 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2131 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
2132 else
2133 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
2134 } else {
2135 switch (Inst.getOpcode()) {
2136 case AMDGPU::S_SENDMSG:
2137 case AMDGPU::S_SENDMSG_RTN_B32:
2138 case AMDGPU::S_SENDMSG_RTN_B64:
2139 case AMDGPU::S_SENDMSGHALT:
2140 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
2141 break;
2142 case AMDGPU::S_MEMTIME:
2143 case AMDGPU::S_MEMREALTIME:
2144 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2145 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2146 case AMDGPU::S_BARRIER_LEAVE:
2147 case AMDGPU::S_GET_BARRIER_STATE_M0:
2148 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2149 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2150 break;
2151 }
2152 }
2153}
2154
2155bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2156 unsigned OtherScore) {
2157 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2158 unsigned OtherShifted =
2159 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2160 Score = std::max(MyShifted, OtherShifted);
2161 return OtherShifted > MyShifted;
2162}
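// Worked example of the rebasing above (illustrative numbers): with this
// bracket at LB 4 / UB 10 and Other at LB 2 / UB 5, the merged UB is
// 4 + max(6, 3) = 10, giving MyShift 0 and OtherShift 5. An Other score of 3
// (inside Other's bracket) is rebased to 8, while an Other score of 2 (at or
// below Other's LB, i.e. already waited on) collapses to 0.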
2163
2164/// Merge the pending events and associated score brackets of \p Other into
2165/// this brackets status.
2166///
2167/// Returns whether the merge resulted in a change that requires tighter waits
2168/// (i.e. the merged brackets strictly dominate the original brackets).
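/// In practice this means the merge added a pending event, raised a pending
/// register score, or recorded a VMEM type that this block had not yet seen.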
2169bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2170 bool StrictDom = false;
2171
2172 VgprUB = std::max(VgprUB, Other.VgprUB);
2173 SgprUB = std::max(SgprUB, Other.SgprUB);
2174
2175 for (auto T : inst_counter_types(MaxCounter)) {
2176 // Merge event flags for this counter
2177 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2178 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2179 if (OtherEvents & ~OldEvents)
2180 StrictDom = true;
2181 PendingEvents |= OtherEvents;
2182
2183 // Merge scores for this counter
2184 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2185 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2186 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2187 if (NewUB < ScoreLBs[T])
2188 report_fatal_error("waitcnt score overflow");
2189
2190 MergeInfo M;
2191 M.OldLB = ScoreLBs[T];
2192 M.OtherLB = Other.ScoreLBs[T];
2193 M.MyShift = NewUB - ScoreUBs[T];
2194 M.OtherShift = NewUB - Other.ScoreUBs[T];
2195
2196 ScoreUBs[T] = NewUB;
2197
2198 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2199
2200 for (int J = 0; J <= VgprUB; J++)
2201 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
2202
2203 if (T == SmemAccessCounter) {
2204 for (int J = 0; J <= SgprUB; J++)
2205 StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
2206 }
2207 }
2208
2209 for (int J = 0; J <= VgprUB; J++) {
2210 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2211 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2212 VgprVmemTypes[J] = NewVmemTypes;
2213 }
2214
2215 return StrictDom;
2216}
2217
2218static bool isWaitInstr(MachineInstr &Inst) {
2219 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2220 return Opcode == AMDGPU::S_WAITCNT ||
2221 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2222 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2223 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2224 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2225 counterTypeForInstr(Opcode).has_value();
2226}
2227
2228// Generate s_waitcnt instructions where needed.
2229bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2230                                            MachineBasicBlock &Block,
2231                                            WaitcntBrackets &ScoreBrackets) {
2232 bool Modified = false;
2233
2234 LLVM_DEBUG({
2235 dbgs() << "*** Block" << Block.getNumber() << " ***";
2236 ScoreBrackets.dump();
2237 });
2238
2239 // Track the correctness of vccz through this basic block. There are two
2240 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2241 // ST->partialVCCWritesUpdateVCCZ().
2242 bool VCCZCorrect = true;
2243 if (ST->hasReadVCCZBug()) {
2244 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2245 // to vcc and then issued an smem load.
2246 VCCZCorrect = false;
2247 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2248 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2249 // to vcc_lo or vcc_hi.
2250 VCCZCorrect = false;
2251 }
2252
2253 // Walk over the instructions.
2254 MachineInstr *OldWaitcntInstr = nullptr;
2255
2256 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2257 E = Block.instr_end();
2258 Iter != E;) {
2259 MachineInstr &Inst = *Iter;
2260
2261 // Track pre-existing waitcnts that were added in earlier iterations or by
2262 // the memory legalizer.
2263 if (isWaitInstr(Inst)) {
2264 if (!OldWaitcntInstr)
2265 OldWaitcntInstr = &Inst;
2266 ++Iter;
2267 continue;
2268 }
2269
2270 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2271 isPreheaderToFlush(Block, ScoreBrackets);
2272
2273 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2274 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2275 FlushVmCnt);
2276 OldWaitcntInstr = nullptr;
2277
2278 // Restore vccz if it's not known to be correct already.
2279 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
2280
2281 // Don't examine operands unless we need to track vccz correctness.
2282 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2283 if (Inst.definesRegister(AMDGPU::VCC_LO) ||
2284 Inst.definesRegister(AMDGPU::VCC_HI)) {
2285 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2286 if (!ST->partialVCCWritesUpdateVCCZ())
2287 VCCZCorrect = false;
2288 } else if (Inst.definesRegister(AMDGPU::VCC)) {
2289        // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
2290        // the vccz bit, so when we detect that an instruction may read from a
2291 // corrupt vccz bit, we need to:
2292 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2293 // operations to complete.
2294 // 2. Restore the correct value of vccz by writing the current value
2295 // of vcc back to vcc.
2296 if (ST->hasReadVCCZBug() &&
2297 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2298 // Writes to vcc while there's an outstanding smem read may get
2299 // clobbered as soon as any read completes.
2300 VCCZCorrect = false;
2301 } else {
2302 // Writes to vcc will fix any incorrect value in vccz.
2303 VCCZCorrect = true;
2304 }
2305 }
2306 }
2307
2308 if (TII->isSMRD(Inst)) {
2309 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2310 // No need to handle invariant loads when avoiding WAR conflicts, as
2311 // there cannot be a vector store to the same memory location.
2312 if (!Memop->isInvariant()) {
2313 const Value *Ptr = Memop->getValue();
2314 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2315 }
2316 }
2317 if (ST->hasReadVCCZBug()) {
2318 // This smem read could complete and clobber vccz at any time.
2319 VCCZCorrect = false;
2320 }
2321 }
2322
2323 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2324
2325#if 0 // TODO: implement resource type check controlled by options with ub = LB.
2326 // If this instruction generates a S_SETVSKIP because it is an
2327 // indexed resource, and we are on Tahiti, then it will also force
2328 // an S_WAITCNT vmcnt(0)
2329 if (RequireCheckResourceType(Inst, context)) {
2330 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
2331 ScoreBrackets->setScoreLB(LOAD_CNT,
2332 ScoreBrackets->getScoreUB(LOAD_CNT));
2333 }
2334#endif
2335
2336 LLVM_DEBUG({
2337 Inst.print(dbgs());
2338 ScoreBrackets.dump();
2339 });
2340
2341 // TODO: Remove this work-around after fixing the scheduler and enable the
2342 // assert above.
2343 if (RestoreVCCZ) {
2344 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2345 // bit is updated, so we can restore the bit by reading the value of
2346 // vcc and then writing it back to the register.
2347 BuildMI(Block, Inst, Inst.getDebugLoc(),
2348 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2349 TRI->getVCC())
2350 .addReg(TRI->getVCC());
2351 VCCZCorrect = true;
2352 Modified = true;
2353 }
2354
2355 ++Iter;
2356 }
2357
2358 if (Block.getFirstTerminator() == Block.end() &&
2359 isPreheaderToFlush(Block, ScoreBrackets))
2360 Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
2361
2362 return Modified;
2363}
2364
2365// Return true if the given machine basic block is a preheader of a loop in
2366// which we want to flush the vmcnt counter, and false otherwise.
2367bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
2368 WaitcntBrackets &ScoreBrackets) {
2369 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2370 if (!IsInserted)
2371 return Iterator->second;
2372
2373  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2374  if (!Succ)
2375 return false;
2376
2377 MachineLoop *Loop = MLI->getLoopFor(Succ);
2378 if (!Loop)
2379 return false;
2380
2381 if (Loop->getLoopPreheader() == &MBB &&
2382 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2383 Iterator->second = true;
2384 return true;
2385 }
2386
2387 return false;
2388}
2389
2390bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2391 return SIInstrInfo::isVMEM(MI) ||
2392 (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
2393}
2394
2395// Return true if it is better to flush the vmcnt counter in the preheader of
2396// the given loop. We currently decide to flush in two situations:
2397// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2398// vgpr containing a value that is loaded outside of the loop. (Only on
2399// targets with no vscnt counter).
2400// 2. The loop contains vmem load(s), but the loaded values are not used in the
2401// loop, and at least one use of a vgpr containing a value that is loaded
2402// outside of the loop.
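// (Illustrative rationale: when the only VMEM results consumed inside the loop
// were produced before the loop, flushing vmcnt once in the preheader avoids
// re-emitting vmcnt waits on every iteration of the loop body.)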
2403bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2404 WaitcntBrackets &Brackets) {
2405 bool HasVMemLoad = false;
2406 bool HasVMemStore = false;
2407 bool UsesVgprLoadedOutside = false;
2408 DenseSet<Register> VgprUse;
2409 DenseSet<Register> VgprDef;
2410
2411 for (MachineBasicBlock *MBB : ML->blocks()) {
2412 for (MachineInstr &MI : *MBB) {
2413 if (isVMEMOrFlatVMEM(MI)) {
2414 if (MI.mayLoad())
2415 HasVMemLoad = true;
2416 if (MI.mayStore())
2417 HasVMemStore = true;
2418 }
2419 for (unsigned I = 0; I < MI.getNumOperands(); I++) {
2420 MachineOperand &Op = MI.getOperand(I);
2421 if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
2422 continue;
2423 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
2424 // Vgpr use
2425 if (Op.isUse()) {
2426 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2427 // If we find a register that is loaded inside the loop, 1. and 2.
2428 // are invalidated and we can exit.
2429 if (VgprDef.contains(RegNo))
2430 return false;
2431 VgprUse.insert(RegNo);
2432 // If at least one of Op's registers is in the score brackets, the
2433 // value is likely loaded outside of the loop.
2434 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2435 Brackets.getScoreLB(LOAD_CNT) ||
2436 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2437 Brackets.getScoreLB(SAMPLE_CNT) ||
2438 Brackets.getRegScore(RegNo, BVH_CNT) >
2439 Brackets.getScoreLB(BVH_CNT)) {
2440 UsesVgprLoadedOutside = true;
2441 break;
2442 }
2443 }
2444 }
2445 // VMem load vgpr def
2446 else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
2447 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2448 // If we find a register that is loaded inside the loop, 1. and 2.
2449 // are invalidated and we can exit.
2450 if (VgprUse.contains(RegNo))
2451 return false;
2452 VgprDef.insert(RegNo);
2453 }
2454 }
2455 }
2456 }
2457 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2458 return true;
2459 return HasVMemLoad && UsesVgprLoadedOutside;
2460}
2461
2462bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
2463 ST = &MF.getSubtarget<GCNSubtarget>();
2464 TII = ST->getInstrInfo();
2465 TRI = &TII->getRegisterInfo();
2466 MRI = &MF.getRegInfo();
2467  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2468  MLI = &getAnalysis<MachineLoopInfo>();
2469 PDT = &getAnalysis<MachinePostDominatorTree>();
2470 if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2471 AA = &AAR->getAAResults();
2472
2473  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
2474
2475 if (ST->hasExtendedWaitCounts()) {
2476 MaxCounter = NUM_EXTENDED_INST_CNTS;
2477 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
2478 WCG = &WCGGFX12Plus;
2479 } else {
2480 MaxCounter = NUM_NORMAL_INST_CNTS;
2481 WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
2482 WCG = &WCGPreGFX12;
2483 }
2484
2485 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
2486 for (auto T : inst_counter_types())
2487 ForceEmitWaitcnt[T] = false;
2488
2489 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2490
2491 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2492
2493 OptNone = MF.getFunction().hasOptNone() ||
2494 MF.getTarget().getOptLevel() == CodeGenOptLevel::None;
2495
2496 HardwareLimits Limits = {};
2497 if (ST->hasExtendedWaitCounts()) {
2498 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2499 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2500 } else {
2501 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2502 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2503 }
2504 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2505 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2506 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2507 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2508 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2509
2510 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
2511 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2512 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2513 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2514
2515 RegisterEncoding Encoding = {};
2516 Encoding.VGPR0 =
2517 TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2518 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
2519 Encoding.SGPR0 =
2520 TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2521 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
2522
2523 BlockInfos.clear();
2524 bool Modified = false;
2525
2526 MachineBasicBlock &EntryBB = MF.front();
2527  MachineBasicBlock::iterator I = EntryBB.begin();
2528
2529 if (!MFI->isEntryFunction()) {
2530 // Wait for any outstanding memory operations that the input registers may
2531 // depend on. We can't track them and it's better to do the wait after the
2532 // costly call sequence.
2533
2534 // TODO: Could insert earlier and schedule more liberally with operations
2535 // that only use caller preserved registers.
2536 for (MachineBasicBlock::iterator E = EntryBB.end();
2537 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2538 ;
2539
2540 if (ST->hasExtendedWaitCounts()) {
2541 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2542 .addImm(0);
2543 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2544 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
2545 continue;
2546
2547 BuildMI(EntryBB, I, DebugLoc(),
2548 TII->get(instrsForExtendedCounterTypes[CT]))
2549 .addImm(0);
2550 }
2551 } else {
2552 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2553 }
2554
2555 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2556 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2557 SmemAccessCounter);
2558 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2559 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2560
2561 Modified = true;
2562 }
2563
2564 // Keep iterating over the blocks in reverse post order, inserting and
2565 // updating s_waitcnt where needed, until a fix point is reached.
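  // A block only has to be revisited when a successor that comes earlier in
  // the reverse post-order (i.e. the target of a loop back edge) receives new
  // incoming state; that is what the Repeat flag below tracks.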
2566  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2567    BlockInfos.insert({MBB, BlockInfo()});
2568
2569 std::unique_ptr<WaitcntBrackets> Brackets;
2570 bool Repeat;
2571 do {
2572 Repeat = false;
2573
2574 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2575 ++BII) {
2576 MachineBasicBlock *MBB = BII->first;
2577 BlockInfo &BI = BII->second;
2578 if (!BI.Dirty)
2579 continue;
2580
2581 if (BI.Incoming) {
2582 if (!Brackets)
2583 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2584 else
2585 *Brackets = *BI.Incoming;
2586 } else {
2587 if (!Brackets)
2588 Brackets = std::make_unique<WaitcntBrackets>(
2589 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2590 SmemAccessCounter);
2591 else
2592 *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
2593 WaitEventMaskForInst, SmemAccessCounter);
2594 }
2595
2596 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2597 BI.Dirty = false;
2598
2599 if (Brackets->hasPendingEvent()) {
2600 BlockInfo *MoveBracketsToSucc = nullptr;
2601 for (MachineBasicBlock *Succ : MBB->successors()) {
2602 auto SuccBII = BlockInfos.find(Succ);
2603 BlockInfo &SuccBI = SuccBII->second;
2604 if (!SuccBI.Incoming) {
2605 SuccBI.Dirty = true;
2606 if (SuccBII <= BII)
2607 Repeat = true;
2608 if (!MoveBracketsToSucc) {
2609 MoveBracketsToSucc = &SuccBI;
2610 } else {
2611 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2612 }
2613 } else if (SuccBI.Incoming->merge(*Brackets)) {
2614 SuccBI.Dirty = true;
2615 if (SuccBII <= BII)
2616 Repeat = true;
2617 }
2618 }
2619 if (MoveBracketsToSucc)
2620 MoveBracketsToSucc->Incoming = std::move(Brackets);
2621 }
2622 }
2623 } while (Repeat);
2624
2625 if (ST->hasScalarStores()) {
2626    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2627    bool HaveScalarStores = false;
2628
2629 for (MachineBasicBlock &MBB : MF) {
2630 for (MachineInstr &MI : MBB) {
2631 if (!HaveScalarStores && TII->isScalarStore(MI))
2632 HaveScalarStores = true;
2633
2634 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2635 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2636 EndPgmBlocks.push_back(&MBB);
2637 }
2638 }
2639
2640 if (HaveScalarStores) {
2641 // If scalar writes are used, the cache must be flushed or else the next
2642 // wave to reuse the same scratch memory can be clobbered.
2643 //
2644 // Insert s_dcache_wb at wave termination points if there were any scalar
2645 // stores, and only if the cache hasn't already been flushed. This could
2646 // be improved by looking across blocks for flushes in postdominating
2647 // blocks from the stores but an explicitly requested flush is probably
2648 // very rare.
2649 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2650 bool SeenDCacheWB = false;
2651
2652 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2653 I != E; ++I) {
2654 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2655 SeenDCacheWB = true;
2656 else if (TII->isScalarStore(*I))
2657 SeenDCacheWB = false;
2658
2659 // FIXME: It would be better to insert this before a waitcnt if any.
2660 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2661 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2662 !SeenDCacheWB) {
2663 Modified = true;
2664 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2665 }
2666 }
2667 }
2668 }
2669 }
2670
2671 // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2672 // instructions.
2673 for (MachineInstr *MI : ReleaseVGPRInsts) {
2674 if (ST->requiresNopBeforeDeallocVGPRs()) {
2675 BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP))
2676 .addImm(0);
2677 }
2678 BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG))
2679        .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2680    Modified = true;
2681 }
2682 ReleaseVGPRInsts.clear();
2683
2684 return Modified;
2685}