LLVM 19.0.0git
SIInsertWaitcnts.cpp
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
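//
// Illustrative example (not part of the original source): on a pre-gfx12
// target, given roughly
//
//   global_load_dword v0, v[2:3], off   ; increments vmcnt when issued
//   v_add_f32 v1, v0, v0                ; consumes the loaded value
//
// this pass would insert an "s_waitcnt vmcnt(0)" before the v_add_f32 so the
// load has completed before its result is read. The exact count used depends
// on how many loads of that kind are still outstanding at that point.
//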
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
40using namespace llvm;
41
42#define DEBUG_TYPE "si-insert-waitcnts"
43
44DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
45 "Force emit s_waitcnt expcnt(0) instrs");
46DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
47 "Force emit s_waitcnt lgkmcnt(0) instrs");
48DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
49 "Force emit s_waitcnt vmcnt(0) instrs");
50
52 "amdgpu-waitcnt-forcezero",
53 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
54 cl::init(false), cl::Hidden);
55
56namespace {
57// Class of object that encapsulates the latest instruction counter score
58// associated with an operand. Used to determine whether an
59// s_waitcnt instruction needs to be emitted.
60
61enum InstCounterType {
62 LOAD_CNT = 0, // VMcnt prior to gfx12.
63 DS_CNT, // LGKMcnt prior to gfx12.
64 EXP_CNT, //
65 STORE_CNT, // VScnt in gfx10/gfx11.
66 NUM_NORMAL_INST_CNTS,
67 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
68 BVH_CNT, // gfx12+ only.
69 KM_CNT, // gfx12+ only.
70 NUM_EXTENDED_INST_CNTS,
71 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
72};
73} // namespace
74
75namespace llvm {
76template <> struct enum_iteration_traits<InstCounterType> {
77 static constexpr bool is_iterable = true;
78};
79} // namespace llvm
80
81namespace {
82// Return an iterator over all counters between LOAD_CNT (the first counter)
83// and \c MaxCounter (exclusive, default value yields an enumeration over
84// all counters).
85auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
86 return enum_seq(LOAD_CNT, MaxCounter);
87}
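// Example usage (illustrative sketch, not from the original source): iterate
// only the counters that exist in the pre-gfx12 encoding:
//
//   for (InstCounterType T : inst_counter_types(NUM_NORMAL_INST_CNTS)) {
//     // T takes LOAD_CNT, DS_CNT, EXP_CNT and STORE_CNT in turn.
//   }
//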
88
89using RegInterval = std::pair<int, int>;
90
91struct HardwareLimits {
92 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
93 unsigned ExpcntMax;
94 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
95 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
96 unsigned SamplecntMax; // gfx12+ only.
97 unsigned BvhcntMax; // gfx12+ only.
98 unsigned KmcntMax; // gfx12+ only.
99};
100
101struct RegisterEncoding {
102 unsigned VGPR0;
103 unsigned VGPRL;
104 unsigned SGPR0;
105 unsigned SGPRL;
106};
107
108enum WaitEventType {
109 VMEM_ACCESS, // vector-memory read & write
110 VMEM_READ_ACCESS, // vector-memory read
111 VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
112 VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
113 VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
114 SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
115 LDS_ACCESS, // lds read & write
116 GDS_ACCESS, // gds read & write
117 SQ_MESSAGE, // send message
118 SMEM_ACCESS, // scalar-memory read & write
119 EXP_GPR_LOCK, // export holding on its data src
120 GDS_GPR_LOCK, // GDS holding on its data and addr src
121 EXP_POS_ACCESS, // write to export position
122 EXP_PARAM_ACCESS, // write to export parameter
123 VMW_GPR_LOCK, // vector-memory write holding on its data src
124 EXP_LDS_ACCESS, // read by ldsdir counting as export
125 NUM_WAIT_EVENTS,
126};
127
128// The mapping is:
129// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
130// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
131// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
132// We reserve a fixed number of VGPR slots in the scoring tables for
133// special tokens like SCMEM_LDS (needed for buffer load to LDS).
134enum RegisterMapping {
135 SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
136 AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
137 SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
138 NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
139 // Artificial register slots to track LDS writes into specific LDS locations
140 // if a location is known. When slots are exhausted or the location is
141 // unknown, use the first slot. The first slot is also always updated in
142 // addition to the known location's slot, to properly generate waits when a
143 // dependent instruction's location is unknown.
144 EXTRA_VGPR_LDS = 0,
145 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
146};
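// Worked example of the flat index space (illustrative, derived from the
// constants above and from how getRegInterval() uses them): an ArchVGPR such
// as v7 is tracked at index 7, an AccVGPR such as a7 at 7 + AGPR_OFFSET = 263,
// the first extra LDS slot at SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS = 512, and an
// SGPR such as s3 at NUM_ALL_VGPRS + 3 = 524, since
// NUM_ALL_VGPRS = 512 + 9 = 521.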
147
148// Enumerate different types of result-returning VMEM operations. Although
149// s_waitcnt orders them all with a single vmcnt counter, in the absence of
150// s_waitcnt only instructions of the same VmemType are guaranteed to write
151// their results in order -- so there is no need to insert an s_waitcnt between
152// two instructions of the same type that write the same vgpr.
153enum VmemType {
154 // BUF instructions and MIMG instructions without a sampler.
155 VMEM_NOSAMPLER,
156 // MIMG instructions with a sampler.
157 VMEM_SAMPLER,
158 // BVH instructions
159 VMEM_BVH,
160 NUM_VMEM_TYPES
161};
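// Illustrative consequence (not from the original source): two image_sample
// results written to the same vgpr need no intervening s_waitcnt, because both
// are VMEM_SAMPLER and complete in order with respect to each other. A
// buffer_load (VMEM_NOSAMPLER) overwriting a vgpr that is still pending from
// an image_sample does require a wait, since results of different VmemTypes
// may return out of order.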
162
163// Maps values of InstCounterType to the instruction that waits on that
164// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
165// returns true.
166static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
167 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
168 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
169 AMDGPU::S_WAIT_KMCNT};
170
171static bool updateVMCntOnly(const MachineInstr &Inst) {
172 return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
173 SIInstrInfo::isFLATScratch(Inst);
174}
175
176#ifndef NDEBUG
177static bool isNormalMode(InstCounterType MaxCounter) {
178 return MaxCounter == NUM_NORMAL_INST_CNTS;
179}
180#endif // NDEBUG
181
182VmemType getVmemType(const MachineInstr &Inst) {
183 assert(updateVMCntOnly(Inst));
184 if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
185 !SIInstrInfo::isVSAMPLE(Inst))
186 return VMEM_NOSAMPLER;
187 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
188 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
189 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
190 return BaseInfo->BVH ? VMEM_BVH
191 : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
192}
193
194unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
195 switch (T) {
196 case LOAD_CNT:
197 return Wait.LoadCnt;
198 case EXP_CNT:
199 return Wait.ExpCnt;
200 case DS_CNT:
201 return Wait.DsCnt;
202 case STORE_CNT:
203 return Wait.StoreCnt;
204 case SAMPLE_CNT:
205 return Wait.SampleCnt;
206 case BVH_CNT:
207 return Wait.BvhCnt;
208 case KM_CNT:
209 return Wait.KmCnt;
210 default:
211 llvm_unreachable("bad InstCounterType");
212 }
213}
214
215void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
216 unsigned &WC = getCounterRef(Wait, T);
217 WC = std::min(WC, Count);
218}
219
220void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
221 getCounterRef(Wait, T) = ~0u;
222}
223
224unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
225 return getCounterRef(Wait, T);
226}
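// Note on the convention used by these helpers (illustrative, not from the
// original source): a count of ~0u means "no wait required" for that counter,
// and addWait() keeps the strictest (smallest) requirement. For example,
// starting from the default ~0u, addWait(Wait, LOAD_CNT, 2) followed by
// addWait(Wait, LOAD_CNT, 0) leaves Wait.LoadCnt == 0.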
227
228// Mapping from event to counter according to the table masks.
229InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
230 for (auto T : inst_counter_types()) {
231 if (masks[T] & (1 << E))
232 return T;
233 }
234 llvm_unreachable("event type has no associated counter");
235}
236
237// This object maintains the current score brackets of each wait counter, and
238// a per-register scoreboard for each wait counter.
239//
240// We also maintain the latest score for every event type that can change the
241// waitcnt, in order to know whether multiple types of events are in flight
242// within the brackets. When multiple event types occur within a bracket, the
243// wait count may be decremented out of order, so we need to emit an
244// "s_waitcnt 0" before use.
245class WaitcntBrackets {
246public:
247 WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
248 HardwareLimits Limits, RegisterEncoding Encoding,
249 const unsigned *WaitEventMaskForInst,
250 InstCounterType SmemAccessCounter)
251 : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
252 Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
253 SmemAccessCounter(SmemAccessCounter) {}
254
255 unsigned getWaitCountMax(InstCounterType T) const {
256 switch (T) {
257 case LOAD_CNT:
258 return Limits.LoadcntMax;
259 case DS_CNT:
260 return Limits.DscntMax;
261 case EXP_CNT:
262 return Limits.ExpcntMax;
263 case STORE_CNT:
264 return Limits.StorecntMax;
265 case SAMPLE_CNT:
266 return Limits.SamplecntMax;
267 case BVH_CNT:
268 return Limits.BvhcntMax;
269 case KM_CNT:
270 return Limits.KmcntMax;
271 default:
272 break;
273 }
274 return 0;
275 }
276
277 unsigned getScoreLB(InstCounterType T) const {
278 assert(T < NUM_INST_CNTS);
279 return ScoreLBs[T];
280 }
281
282 unsigned getScoreUB(InstCounterType T) const {
283 assert(T < NUM_INST_CNTS);
284 return ScoreUBs[T];
285 }
286
287 unsigned getScoreRange(InstCounterType T) const {
288 return getScoreUB(T) - getScoreLB(T);
289 }
290
291 unsigned getRegScore(int GprNo, InstCounterType T) const {
292 if (GprNo < NUM_ALL_VGPRS) {
293 return VgprScores[T][GprNo];
294 }
295 assert(T == SmemAccessCounter);
296 return SgprScores[GprNo - NUM_ALL_VGPRS];
297 }
298
299 bool merge(const WaitcntBrackets &Other);
300
301 RegInterval getRegInterval(const MachineInstr *MI,
302 const MachineRegisterInfo *MRI,
303 const SIRegisterInfo *TRI, unsigned OpNo) const;
304
305 bool counterOutOfOrder(InstCounterType T) const;
306 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
307 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
308 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
309 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
310 void applyWaitcnt(InstCounterType T, unsigned Count);
311 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
312 const MachineRegisterInfo *MRI, WaitEventType E,
313 MachineInstr &MI);
314
315 unsigned hasPendingEvent() const { return PendingEvents; }
316 unsigned hasPendingEvent(WaitEventType E) const {
317 return PendingEvents & (1 << E);
318 }
319 unsigned hasPendingEvent(InstCounterType T) const {
320 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
321 assert((HasPending != 0) == (getScoreRange(T) != 0));
322 return HasPending;
323 }
324
325 bool hasMixedPendingEvents(InstCounterType T) const {
326 unsigned Events = hasPendingEvent(T);
327 // Return true if more than one bit is set in Events.
328 return Events & (Events - 1);
329 }
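  // Illustrative example of the bit trick above (not from the original
  // source): Events == 0b0101 (two event types) gives 0b0101 & 0b0100 != 0,
  // while a single set bit such as 0b0100 gives 0b0100 & 0b0011 == 0.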
330
331 bool hasPendingFlat() const {
332 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
333 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
334 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
335 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
336 }
337
338 void setPendingFlat() {
339 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
340 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
341 }
342
343 // Return true if there might be pending writes to the specified vgpr by VMEM
344 // instructions with types different from V.
345 bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
346 assert(GprNo < NUM_ALL_VGPRS);
347 return VgprVmemTypes[GprNo] & ~(1 << V);
348 }
349
350 void clearVgprVmemTypes(int GprNo) {
351 assert(GprNo < NUM_ALL_VGPRS);
352 VgprVmemTypes[GprNo] = 0;
353 }
354
355 void setStateOnFunctionEntryOrReturn() {
356 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
357 PendingEvents |= WaitEventMaskForInst[STORE_CNT];
358 }
359
360 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
361 return LDSDMAStores;
362 }
363
364 void print(raw_ostream &);
365 void dump() { print(dbgs()); }
366
367private:
368 struct MergeInfo {
369 unsigned OldLB;
370 unsigned OtherLB;
371 unsigned MyShift;
372 unsigned OtherShift;
373 };
374 static bool mergeScore(const MergeInfo &M, unsigned &Score,
375 unsigned OtherScore);
376
377 void setScoreLB(InstCounterType T, unsigned Val) {
378 assert(T < NUM_INST_CNTS);
379 ScoreLBs[T] = Val;
380 }
381
382 void setScoreUB(InstCounterType T, unsigned Val) {
383 assert(T < NUM_INST_CNTS);
384 ScoreUBs[T] = Val;
385
386 if (T != EXP_CNT)
387 return;
388
389 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
390 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
391 }
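  // Illustrative note (not from the original source): the clamp above keeps
  // the EXP_CNT bracket no wider than the hardware counter can represent. For
  // example, with ExpcntMax == 7, once an eighth export event is scored the
  // lower bound is advanced so that getScoreRange(EXP_CNT) stays at 7.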
392
393 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
394 if (GprNo < NUM_ALL_VGPRS) {
395 VgprUB = std::max(VgprUB, GprNo);
396 VgprScores[T][GprNo] = Val;
397 } else {
398 assert(T == SmemAccessCounter);
399 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
400 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
401 }
402 }
403
404 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
405 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
406 unsigned OpNo, unsigned Val);
407
408 const GCNSubtarget *ST = nullptr;
409 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
410 HardwareLimits Limits = {};
411 RegisterEncoding Encoding = {};
412 const unsigned *WaitEventMaskForInst;
413 InstCounterType SmemAccessCounter;
414 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
415 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
416 unsigned PendingEvents = 0;
417 // Remember the last flat memory operation.
418 unsigned LastFlat[NUM_INST_CNTS] = {0};
419 // wait_cnt scores for every vgpr.
420 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
421 int VgprUB = -1;
422 int SgprUB = -1;
423 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
424 // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
425 // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
426 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
427 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
428 // write to each vgpr.
429 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
430 // Store representative LDS DMA operations. The only useful info here is
431 // alias info. One store is kept per unique AAInfo.
432 SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
433};
434
435// This abstracts the logic for generating and updating S_WAIT* instructions
436// away from the analysis that determines where they are needed. This was
437// done because the set of counters and instructions for waiting on them
438// underwent a major shift with gfx12, sufficiently so that having this
439// abstraction allows the main analysis logic to be simpler than it would
440// otherwise have had to become.
441class WaitcntGenerator {
442protected:
443 const GCNSubtarget *ST = nullptr;
444 const SIInstrInfo *TII = nullptr;
445 AMDGPU::IsaVersion IV;
446 InstCounterType MaxCounter;
447
448public:
449 WaitcntGenerator() {}
450 WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
451 : ST(ST), TII(ST->getInstrInfo()),
452 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
453
454 // Edits an existing sequence of wait count instructions according
455 // to an incoming Waitcnt value, which is itself updated to reflect
456 // any new wait count instructions which may need to be generated by
457 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
458 // were made.
459 //
460 // This editing will usually merely update operands, but it may also
461 // delete instructions if the incoming Wait value indicates they are not
462 // needed. It may also remove existing instructions for which a wait
463 // is needed if it can be determined that it is better to generate new
464 // instructions later, as can happen on gfx12.
465 virtual bool
466 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
467 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
468 MachineBasicBlock::instr_iterator It) const = 0;
469
470 // Transform a soft waitcnt into a normal one.
471 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
472
473 // Generates new wait count instructions according to the value of
474 // Wait, returning true if any new instructions were created.
475 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
476 MachineBasicBlock::instr_iterator It,
477 AMDGPU::Waitcnt Wait) = 0;
478
479 // Returns an array of bit masks which can be used to map values in
480 // WaitEventType to corresponding counter values in InstCounterType.
481 virtual const unsigned *getWaitEventMask() const = 0;
482
483 virtual ~WaitcntGenerator() = default;
484};
485
486class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
487public:
488 WaitcntGeneratorPreGFX12() {}
489 WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
490 : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
491
492 bool
493 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
494 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
495 MachineBasicBlock::instr_iterator It) const override;
496
497 bool createNewWaitcnt(MachineBasicBlock &Block,
498 MachineBasicBlock::instr_iterator It,
499 AMDGPU::Waitcnt Wait) override;
500
501 const unsigned *getWaitEventMask() const override {
502 assert(ST);
503
504 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
505 (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS) |
506 (1 << VMEM_SAMPLER_READ_ACCESS) | (1 << VMEM_BVH_READ_ACCESS),
507 (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
508 (1 << SQ_MESSAGE),
509 (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
510 (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
511 (1 << EXP_LDS_ACCESS),
512 (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
513 0,
514 0,
515 0};
516
517 return WaitEventMaskForInstPreGFX12;
518 }
519};
520
521class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
522public:
523 WaitcntGeneratorGFX12Plus() {}
524 WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
525 : WaitcntGenerator(ST, MaxCounter) {}
526
527 bool
528 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
529 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
530 MachineBasicBlock::instr_iterator It) const override;
531
532 bool createNewWaitcnt(MachineBasicBlock &Block,
533 MachineBasicBlock::instr_iterator It,
534 AMDGPU::Waitcnt Wait) override;
535
536 const unsigned *getWaitEventMask() const override {
537 assert(ST);
538
539 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
540 (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
541 (1 << LDS_ACCESS) | (1 << GDS_ACCESS),
542 (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
543 (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
544 (1 << EXP_LDS_ACCESS),
545 (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
546 (1 << VMEM_SAMPLER_READ_ACCESS),
547 (1 << VMEM_BVH_READ_ACCESS),
548 (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE)};
549
550 return WaitEventMaskForInstGFX12Plus;
551 }
552};
553
554class SIInsertWaitcnts : public MachineFunctionPass {
555private:
556 const GCNSubtarget *ST = nullptr;
557 const SIInstrInfo *TII = nullptr;
558 const SIRegisterInfo *TRI = nullptr;
559 const MachineRegisterInfo *MRI = nullptr;
560
561 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
562 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
563 MachineLoopInfo *MLI;
564 MachinePostDominatorTree *PDT;
565 AliasAnalysis *AA = nullptr;
566
567 struct BlockInfo {
568 std::unique_ptr<WaitcntBrackets> Incoming;
569 bool Dirty = true;
570 };
571
572 InstCounterType SmemAccessCounter;
573
574 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
575
576 // ForceEmitZeroWaitcnts: force all waitcnt instrs to be emitted as
577 // s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
578 bool ForceEmitZeroWaitcnts;
579 bool ForceEmitWaitcnt[NUM_INST_CNTS];
580
581 bool OptNone;
582
583 // In any given run of this pass, WCG will point to one of these two
584 // generator objects, which must have been re-initialised before use
585 // from a value made using a subtarget constructor.
586 WaitcntGeneratorPreGFX12 WCGPreGFX12;
587 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
588
589 WaitcntGenerator *WCG = nullptr;
590
591 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
592 // message.
593 DenseSet<MachineInstr *> ReleaseVGPRInsts;
594
595 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
596
597public:
598 static char ID;
599
600 SIInsertWaitcnts() : MachineFunctionPass(ID) {
601 (void)ForceExpCounter;
602 (void)ForceLgkmCounter;
603 (void)ForceVMCounter;
604 }
605
606 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
607 bool isPreheaderToFlush(MachineBasicBlock &MBB,
608 WaitcntBrackets &ScoreBrackets);
609 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
610 bool runOnMachineFunction(MachineFunction &MF) override;
611
612 StringRef getPassName() const override {
613 return "SI insert wait instructions";
614 }
615
616 void getAnalysisUsage(AnalysisUsage &AU) const override {
617 AU.setPreservesCFG();
623 }
624
625 bool isForceEmitWaitcnt() const {
626 for (auto T : inst_counter_types())
627 if (ForceEmitWaitcnt[T])
628 return true;
629 return false;
630 }
631
632 void setForceEmitWaitcnt() {
633// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
634// For debug builds, get the debug counter info and adjust if need be
635#ifndef NDEBUG
636 if (DebugCounter::isCounterSet(ForceExpCounter) &&
637 DebugCounter::shouldExecute(ForceExpCounter)) {
638 ForceEmitWaitcnt[EXP_CNT] = true;
639 } else {
640 ForceEmitWaitcnt[EXP_CNT] = false;
641 }
642
643 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
644 DebugCounter::shouldExecute(ForceLgkmCounter)) {
645 ForceEmitWaitcnt[DS_CNT] = true;
646 ForceEmitWaitcnt[KM_CNT] = true;
647 } else {
648 ForceEmitWaitcnt[DS_CNT] = false;
649 ForceEmitWaitcnt[KM_CNT] = false;
650 }
651
652 if (DebugCounter::isCounterSet(ForceVMCounter) &&
653 DebugCounter::shouldExecute(ForceVMCounter)) {
654 ForceEmitWaitcnt[LOAD_CNT] = true;
655 ForceEmitWaitcnt[SAMPLE_CNT] = true;
656 ForceEmitWaitcnt[BVH_CNT] = true;
657 } else {
658 ForceEmitWaitcnt[LOAD_CNT] = false;
659 ForceEmitWaitcnt[SAMPLE_CNT] = false;
660 ForceEmitWaitcnt[BVH_CNT] = false;
661 }
662#endif // NDEBUG
663 }
664
665 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
666 // FLAT instruction.
667 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
668 // Maps VMEM access types to their corresponding WaitEventType.
669 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
670 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
671
672 assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
673 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
674 // these should use VM_CNT.
675 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
676 return VMEM_ACCESS;
677 if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
678 // FLAT and SCRATCH instructions may access scratch. Other VMEM
679 // instructions do not.
680 if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
681 return SCRATCH_WRITE_ACCESS;
682 return VMEM_WRITE_ACCESS;
683 }
684 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
685 return VMEM_READ_ACCESS;
686 return VmemReadMapping[getVmemType(Inst)];
687 }
688
689 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
690 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
691 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
692 bool generateWaitcntInstBefore(MachineInstr &MI,
693 WaitcntBrackets &ScoreBrackets,
694 MachineInstr *OldWaitcntInstr,
695 bool FlushVmCnt);
696 bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
697 WaitcntBrackets &ScoreBrackets,
698 MachineInstr *OldWaitcntInstr);
699 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
700 MachineBasicBlock::instr_iterator It,
701 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
702 MachineInstr *OldWaitcntInstr);
703 void updateEventWaitcntAfter(MachineInstr &Inst,
704 WaitcntBrackets *ScoreBrackets);
705 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
706 WaitcntBrackets &ScoreBrackets);
707};
708
709} // end anonymous namespace
710
711RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
712 const MachineRegisterInfo *MRI,
713 const SIRegisterInfo *TRI,
714 unsigned OpNo) const {
715 const MachineOperand &Op = MI->getOperand(OpNo);
716 if (!TRI->isInAllocatableClass(Op.getReg()))
717 return {-1, -1};
718
719 // A use via a PW operand does not need a waitcnt.
720 // A partial write is not a WAW.
721 assert(!Op.getSubReg() || !Op.isUndef());
722
723 RegInterval Result;
724
725 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
726 AMDGPU::HWEncoding::REG_IDX_MASK;
727
728 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
729 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
730 Result.first = Reg - Encoding.VGPR0;
731 if (TRI->isAGPR(*MRI, Op.getReg()))
732 Result.first += AGPR_OFFSET;
733 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
734 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
735 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
736 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
737 assert(Result.first >= NUM_ALL_VGPRS &&
738 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
739 }
740 // TODO: Handle TTMP
741 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
742 else
743 return {-1, -1};
744
745 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
746 unsigned Size = TRI->getRegSizeInBits(*RC);
747 Result.second = Result.first + ((Size + 16) / 32);
748
749 return Result;
750}
751
752void WaitcntBrackets::setExpScore(const MachineInstr *MI,
753 const SIInstrInfo *TII,
754 const SIRegisterInfo *TRI,
755 const MachineRegisterInfo *MRI, unsigned OpNo,
756 unsigned Val) {
757 RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
758 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
759 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
760 setRegScore(RegNo, EXP_CNT, Val);
761 }
762}
763
764void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
765 const SIRegisterInfo *TRI,
766 const MachineRegisterInfo *MRI,
767 WaitEventType E, MachineInstr &Inst) {
768 InstCounterType T = eventCounter(WaitEventMaskForInst, E);
769
770 unsigned UB = getScoreUB(T);
771 unsigned CurrScore = UB + 1;
772 if (CurrScore == 0)
773 report_fatal_error("InsertWaitcnt score wraparound");
774 // PendingEvents and ScoreUB need to be updated regardless of whether this
775 // event changes the score of a register or not. Examples include vm_cnt for
776 // a buffer store or lgkm_cnt for a send-message.
777 PendingEvents |= 1 << E;
778 setScoreUB(T, CurrScore);
779
780 if (T == EXP_CNT) {
781 // Put score on the source vgprs. If this is a store, just use those
782 // specific register(s).
783 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
784 int AddrOpIdx =
785 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
786 // All GDS operations must protect their address register (same as
787 // export.)
788 if (AddrOpIdx != -1) {
789 setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
790 }
791
792 if (Inst.mayStore()) {
793 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
794 setExpScore(
795 &Inst, TII, TRI, MRI,
796 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
797 CurrScore);
798 }
799 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
800 setExpScore(&Inst, TII, TRI, MRI,
801 AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
802 AMDGPU::OpName::data1),
803 CurrScore);
804 }
805 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
806 Inst.getOpcode() != AMDGPU::DS_APPEND &&
807 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
808 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
809 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
810 const MachineOperand &Op = Inst.getOperand(I);
811 if (Op.isReg() && !Op.isDef() &&
812 TRI->isVectorRegister(*MRI, Op.getReg())) {
813 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
814 }
815 }
816 }
817 } else if (TII->isFLAT(Inst)) {
818 if (Inst.mayStore()) {
819 setExpScore(
820 &Inst, TII, TRI, MRI,
821 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
822 CurrScore);
823 } else if (SIInstrInfo::isAtomicRet(Inst)) {
824 setExpScore(
825 &Inst, TII, TRI, MRI,
826 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
827 CurrScore);
828 }
829 } else if (TII->isMIMG(Inst)) {
830 if (Inst.mayStore()) {
831 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
832 } else if (SIInstrInfo::isAtomicRet(Inst)) {
833 setExpScore(
834 &Inst, TII, TRI, MRI,
835 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
836 CurrScore);
837 }
838 } else if (TII->isMTBUF(Inst)) {
839 if (Inst.mayStore()) {
840 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
841 }
842 } else if (TII->isMUBUF(Inst)) {
843 if (Inst.mayStore()) {
844 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
845 } else if (SIInstrInfo::isAtomicRet(Inst)) {
846 setExpScore(
847 &Inst, TII, TRI, MRI,
848 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
849 CurrScore);
850 }
851 } else if (TII->isLDSDIR(Inst)) {
852 // LDSDIR instructions attach the score to the destination.
853 setExpScore(
854 &Inst, TII, TRI, MRI,
855 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
856 CurrScore);
857 } else {
858 if (TII->isEXP(Inst)) {
859 // For export the destination registers are really temps that
860 // can be used as the actual source after export patching, so
861 // we need to treat them like sources and set the EXP_CNT
862 // score.
863 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
864 MachineOperand &DefMO = Inst.getOperand(I);
865 if (DefMO.isReg() && DefMO.isDef() &&
866 TRI->isVGPR(*MRI, DefMO.getReg())) {
867 setRegScore(
868 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
869 EXP_CNT, CurrScore);
870 }
871 }
872 }
873 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
874 MachineOperand &MO = Inst.getOperand(I);
875 if (MO.isReg() && !MO.isDef() &&
876 TRI->isVectorRegister(*MRI, MO.getReg())) {
877 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
878 }
879 }
880 }
881#if 0 // TODO: check if this is handled by MUBUF code above.
882 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
883 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
884 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
885 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
886 unsigned OpNo;//TODO: find the OpNo for this operand;
887 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, OpNo);
888 for (int RegNo = Interval.first; RegNo < Interval.second;
889 ++RegNo) {
890 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
891 }
892#endif
893 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
894 // Match the score to the destination registers.
895 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
896 auto &Op = Inst.getOperand(I);
897 if (!Op.isReg() || !Op.isDef())
898 continue;
899 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
900 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
901 if (Interval.first >= NUM_ALL_VGPRS)
902 continue;
903 if (updateVMCntOnly(Inst)) {
904 // updateVMCntOnly should only leave us with VGPRs.
905 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
906 // defs. That's required for a sane index into `VgprVmemTypes` below.
907 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
908 VmemType V = getVmemType(Inst);
909 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
910 VgprVmemTypes[RegNo] |= 1 << V;
911 }
912 }
913 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
914 setRegScore(RegNo, T, CurrScore);
915 }
916 }
917 if (Inst.mayStore() &&
918 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
919 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the data
920 // written to LDS can be accessed. A load from LDS to VMEM does not need a wait.
921 unsigned Slot = 0;
922 for (const auto *MemOp : Inst.memoperands()) {
923 if (!MemOp->isStore() ||
924 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
925 continue;
926 // Comparing just AA info does not guarantee memoperands are equal
927 // in general, but this is so for LDS DMA in practice.
928 auto AAI = MemOp->getAAInfo();
929 // Alias scope information gives a way to definitively identify an
930 // original memory object; in practice it is produced by the module LDS
931 // lowering pass. If there is no scope available we will not be able
932 // to disambiguate LDS aliasing, as after the module lowering all LDS
933 // is squashed into a single big object. Do not waste one of the
934 // limited LDSDMAStores slots on something we will not be able to use
935 // anyway.
936 if (!AAI || !AAI.Scope)
937 break;
938 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
939 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
940 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
941 Slot = I + 1;
942 break;
943 }
944 }
945 }
946 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
947 break;
948 LDSDMAStores.push_back(&Inst);
949 Slot = LDSDMAStores.size();
950 break;
951 }
952 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
953 if (Slot)
954 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
955 }
956 }
957}
958
959void WaitcntBrackets::print(raw_ostream &OS) {
960 OS << '\n';
961 for (auto T : inst_counter_types(MaxCounter)) {
962 unsigned SR = getScoreRange(T);
963
964 switch (T) {
965 case LOAD_CNT:
966 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
967 << SR << "): ";
968 break;
969 case DS_CNT:
970 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
971 << SR << "): ";
972 break;
973 case EXP_CNT:
974 OS << " EXP_CNT(" << SR << "): ";
975 break;
976 case STORE_CNT:
977 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
978 << SR << "): ";
979 break;
980 case SAMPLE_CNT:
981 OS << " SAMPLE_CNT(" << SR << "): ";
982 break;
983 case BVH_CNT:
984 OS << " BVH_CNT(" << SR << "): ";
985 break;
986 case KM_CNT:
987 OS << " KM_CNT(" << SR << "): ";
988 break;
989 default:
990 OS << " UNKNOWN(" << SR << "): ";
991 break;
992 }
993
994 if (SR != 0) {
995 // Print vgpr scores.
996 unsigned LB = getScoreLB(T);
997
998 for (int J = 0; J <= VgprUB; J++) {
999 unsigned RegScore = getRegScore(J, T);
1000 if (RegScore <= LB)
1001 continue;
1002 unsigned RelScore = RegScore - LB - 1;
1003 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
1004 OS << RelScore << ":v" << J << " ";
1005 } else {
1006 OS << RelScore << ":ds ";
1007 }
1008 }
1009 // Also need to print sgpr scores for lgkm_cnt.
1010 if (T == SmemAccessCounter) {
1011 for (int J = 0; J <= SgprUB; J++) {
1012 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
1013 if (RegScore <= LB)
1014 continue;
1015 unsigned RelScore = RegScore - LB - 1;
1016 OS << RelScore << ":s" << J << " ";
1017 }
1018 }
1019 }
1020 OS << '\n';
1021 }
1022 OS << '\n';
1023}
1024
1025/// Simplify the waitcnt, in the sense of removing redundant counts. A count
1026/// that is found to be redundant is reset to ~0u, meaning no wait is needed.
1027void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1028 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1029 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1030 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1031 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1032 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1033 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1034 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1035}
1036
1037void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1038 unsigned &Count) const {
1039 // The number of outstanding events for this type, T, can be calculated
1040 // as (UB - LB). If the current Count is greater than or equal to the number
1041 // of outstanding events, then the wait for this counter is redundant.
1042 if (Count >= getScoreRange(T))
1043 Count = ~0u;
1044}
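// Illustrative example (not from the original source): if only one load is
// outstanding (getScoreRange(LOAD_CNT) == 1), then a requested wait of
// vmcnt(1) is already satisfied and is dropped by resetting Count to ~0u,
// while a requested vmcnt(0) is kept.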
1045
1046void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
1047 AMDGPU::Waitcnt &Wait) const {
1048 unsigned ScoreToWait = getRegScore(RegNo, T);
1049
1050 // If the score of src_operand falls within the bracket, we need an
1051 // s_waitcnt instruction.
1052 const unsigned LB = getScoreLB(T);
1053 const unsigned UB = getScoreUB(T);
1054 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1055 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1056 !ST->hasFlatLgkmVMemCountInOrder()) {
1057 // If there is a pending FLAT operation, and this is a VMem or LGKM
1058 // waitcnt and the target can report early completion, then we need
1059 // to force a waitcnt 0.
1060 addWait(Wait, T, 0);
1061 } else if (counterOutOfOrder(T)) {
1062 // The counter can be decremented out of order when there are multiple
1063 // event types in the bracket. Emit an s_wait for this counter with a
1064 // conservative value of 0.
1065 addWait(Wait, T, 0);
1066 } else {
1067 // If a counter has been maxed out avoid overflow by waiting for
1068 // MAX(CounterType) - 1 instead.
1069 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
1070 addWait(Wait, T, NeededWait);
1071 }
1072 }
1073}
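// Worked example for the in-order case above (illustrative, not from the
// original source): if getScoreUB(LOAD_CNT) == 10 and the register's score is
// 8, two younger loads were issued after the one being waited for, so
// NeededWait = 10 - 8 = 2 and the pass can emit s_waitcnt vmcnt(2) rather than
// a conservative vmcnt(0).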
1074
1075void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1076 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1077 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1078 applyWaitcnt(DS_CNT, Wait.DsCnt);
1079 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1080 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1081 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1082 applyWaitcnt(KM_CNT, Wait.KmCnt);
1083}
1084
1085void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1086 const unsigned UB = getScoreUB(T);
1087 if (Count >= UB)
1088 return;
1089 if (Count != 0) {
1090 if (counterOutOfOrder(T))
1091 return;
1092 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1093 } else {
1094 setScoreLB(T, UB);
1095 PendingEvents &= ~WaitEventMaskForInst[T];
1096 }
1097}
1098
1099// Where there are multiple types of event in the bracket of a counter,
1100// the decrement may go out of order.
1101bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1102 // Scalar memory reads can always complete out of order.
1103 if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
1104 return true;
1105 return hasMixedPendingEvents(T);
1106}
1107
1108INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1109 false)
1112INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1113 false)
1114
1115char SIInsertWaitcnts::ID = 0;
1116
1117char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
1118
1119FunctionPass *llvm::createSIInsertWaitcntsPass() {
1120 return new SIInsertWaitcnts();
1121}
1122
1123static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
1124 unsigned NewEnc) {
1125 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1126 assert(OpIdx >= 0);
1127
1128 MachineOperand &MO = MI.getOperand(OpIdx);
1129
1130 if (NewEnc == MO.getImm())
1131 return false;
1132
1133 MO.setImm(NewEnc);
1134 return true;
1135}
1136
1137/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1138/// and if so, which counter it is waiting on.
1139static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1140 switch (Opcode) {
1141 case AMDGPU::S_WAIT_LOADCNT:
1142 return LOAD_CNT;
1143 case AMDGPU::S_WAIT_EXPCNT:
1144 return EXP_CNT;
1145 case AMDGPU::S_WAIT_STORECNT:
1146 return STORE_CNT;
1147 case AMDGPU::S_WAIT_SAMPLECNT:
1148 return SAMPLE_CNT;
1149 case AMDGPU::S_WAIT_BVHCNT:
1150 return BVH_CNT;
1151 case AMDGPU::S_WAIT_DSCNT:
1152 return DS_CNT;
1153 case AMDGPU::S_WAIT_KMCNT:
1154 return KM_CNT;
1155 default:
1156 return {};
1157 }
1158}
1159
1160bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1161 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1162 if (Opcode == Waitcnt->getOpcode())
1163 return false;
1164
1165 Waitcnt->setDesc(TII->get(Opcode));
1166 return true;
1167}
1168
1169/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1170/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1171/// from \p Wait that were added by previous passes. Currently this pass
1172/// conservatively assumes that these preexisting waits are required for
1173/// correctness.
1174bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1175 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1176 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1177 assert(ST);
1178 assert(isNormalMode(MaxCounter));
1179
1180 bool Modified = false;
1181 MachineInstr *WaitcntInstr = nullptr;
1182 MachineInstr *WaitcntVsCntInstr = nullptr;
1183
1184 for (auto &II :
1185 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1186 if (II.isMetaInstruction())
1187 continue;
1188
1189 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1190 bool IsSoft = Opcode != II.getOpcode();
1191
1192 // Update required wait count. If this is a soft waitcnt (= it was added
1193 // by an earlier pass), it may be entirely removed.
1194 if (Opcode == AMDGPU::S_WAITCNT) {
1195 unsigned IEnc = II.getOperand(0).getImm();
1196 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1197 if (IsSoft)
1198 ScoreBrackets.simplifyWaitcnt(OldWait);
1199 Wait = Wait.combined(OldWait);
1200
1201 // Merge consecutive waitcnt of the same type by erasing multiples.
1202 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
1203 II.eraseFromParent();
1204 Modified = true;
1205 } else
1206 WaitcntInstr = &II;
1207 } else {
1208 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1209 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1210
1211 unsigned OldVSCnt =
1212 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1213 if (IsSoft)
1214 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1215 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1216
1217 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
1218 II.eraseFromParent();
1219 Modified = true;
1220 } else
1221 WaitcntVsCntInstr = &II;
1222 }
1223 }
1224
1225 if (WaitcntInstr) {
1226 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1227 AMDGPU::encodeWaitcnt(IV, Wait));
1228 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1229
1230 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1231 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1232 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1233 Wait.LoadCnt = ~0u;
1234 Wait.ExpCnt = ~0u;
1235 Wait.DsCnt = ~0u;
1236
1237 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1238 ? dbgs()
1239 << "applyPreexistingWaitcnt\n"
1240 << "New Instr at block end: " << *WaitcntInstr << '\n'
1241 : dbgs() << "applyPreexistingWaitcnt\n"
1242 << "Old Instr: " << *It
1243 << "New Instr: " << *WaitcntInstr << '\n');
1244 }
1245
1246 if (WaitcntVsCntInstr) {
1247 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1248 AMDGPU::OpName::simm16, Wait.StoreCnt);
1249 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1250
1251 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1252 Wait.StoreCnt = ~0u;
1253
1254 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1255 ? dbgs() << "applyPreexistingWaitcnt\n"
1256 << "New Instr at block end: " << *WaitcntVsCntInstr
1257 << '\n'
1258 : dbgs() << "applyPreexistingWaitcnt\n"
1259 << "Old Instr: " << *It
1260 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1261 }
1262
1263 return Modified;
1264}
1265
1266/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1267/// required counters in \p Wait
1268bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1269 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1270 AMDGPU::Waitcnt Wait) {
1271 assert(ST);
1272 assert(isNormalMode(MaxCounter));
1273
1274 bool Modified = false;
1275 const DebugLoc &DL = Block.findDebugLoc(It);
1276
1277 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1278 // single instruction while VScnt has its own instruction.
1279 if (Wait.hasWaitExceptStoreCnt()) {
1280 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1281 [[maybe_unused]] auto SWaitInst =
1282 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1283 Modified = true;
1284
1285 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1286 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1287 dbgs() << "New Instr: " << *SWaitInst << '\n');
1288 }
1289
1290 if (Wait.hasWaitStoreCnt()) {
1291 assert(ST->hasVscnt());
1292
1293 [[maybe_unused]] auto SWaitInst =
1294 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1295 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1296 .addImm(Wait.StoreCnt);
1297 Modified = true;
1298
1299 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1300 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1301 dbgs() << "New Instr: " << *SWaitInst << '\n');
1302 }
1303
1304 return Modified;
1305}
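// Illustrative output of the pre-gfx12 generator above (not from the original
// source): a request of Wait = {LoadCnt=0, ExpCnt=~0u, DsCnt=0, StoreCnt=1}
// becomes two instructions, since VScnt has its own encoding, roughly:
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0x1
//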
1306
1307/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1308/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1309/// were added by previous passes. Currently this pass conservatively
1310/// assumes that these preexisting waits are required for correctness.
1311bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1312 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1313 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1314 assert(ST);
1315 assert(!isNormalMode(MaxCounter));
1316
1317 bool Modified = false;
1318 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1319 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1320 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1321
1322 for (auto &II :
1323 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1324 if (II.isMetaInstruction())
1325 continue;
1326
1327 MachineInstr **UpdatableInstr;
1328
1329 // Update required wait count. If this is a soft waitcnt (= it was added
1330 // by an earlier pass), it may be entirely removed.
1331
1332 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1333 bool IsSoft = Opcode != II.getOpcode();
1334
1335 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1336 unsigned OldEnc =
1337 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1338 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1339 if (IsSoft)
1340 ScoreBrackets.simplifyWaitcnt(OldWait);
1341 Wait = Wait.combined(OldWait);
1342 UpdatableInstr = &CombinedLoadDsCntInstr;
1343 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1344 unsigned OldEnc =
1345 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1346 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1347 if (IsSoft)
1348 ScoreBrackets.simplifyWaitcnt(OldWait);
1349 Wait = Wait.combined(OldWait);
1350 UpdatableInstr = &CombinedStoreDsCntInstr;
1351 } else {
1352 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1353 assert(CT.has_value());
1354 unsigned OldCnt =
1355 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1356 if (IsSoft)
1357 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1358 addWait(Wait, CT.value(), OldCnt);
1359 UpdatableInstr = &WaitInstrs[CT.value()];
1360 }
1361
1362 // Merge consecutive waitcnt of the same type by erasing multiples.
1363 if (!*UpdatableInstr) {
1364 *UpdatableInstr = &II;
1365 } else {
1366 II.eraseFromParent();
1367 Modified = true;
1368 }
1369 }
1370
1371 if (CombinedLoadDsCntInstr) {
1372 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1373 // to be waited for. Otherwise, let the instruction be deleted so
1374 // the appropriate single counter wait instruction can be inserted
1375 // instead, when new S_WAIT_*CNT instructions are inserted by
1376 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1377 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1378 // the loop below that deals with single counter instructions.
1379 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1380 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1381 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1382 AMDGPU::OpName::simm16, NewEnc);
1383 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1384 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1385 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1386 Wait.LoadCnt = ~0u;
1387 Wait.DsCnt = ~0u;
1388
1389 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1390 ? dbgs() << "applyPreexistingWaitcnt\n"
1391 << "New Instr at block end: "
1392 << *CombinedLoadDsCntInstr << '\n'
1393 : dbgs() << "applyPreexistingWaitcnt\n"
1394 << "Old Instr: " << *It << "New Instr: "
1395 << *CombinedLoadDsCntInstr << '\n');
1396 } else {
1397 CombinedLoadDsCntInstr->eraseFromParent();
1398 Modified = true;
1399 }
1400 }
1401
1402 if (CombinedStoreDsCntInstr) {
1403 // Similarly for S_WAIT_STORECNT_DSCNT.
1404 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1405 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1406 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1407 AMDGPU::OpName::simm16, NewEnc);
1408 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1409 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1410 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1411 Wait.StoreCnt = ~0u;
1412 Wait.DsCnt = ~0u;
1413
1414 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1415 ? dbgs() << "applyPreexistingWaitcnt\n"
1416 << "New Instr at block end: "
1417 << *CombinedStoreDsCntInstr << '\n'
1418 : dbgs() << "applyPreexistingWaitcnt\n"
1419 << "Old Instr: " << *It << "New Instr: "
1420 << *CombinedStoreDsCntInstr << '\n');
1421 } else {
1422 CombinedStoreDsCntInstr->eraseFromParent();
1423 Modified = true;
1424 }
1425 }
1426
1427 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1428 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1429 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1430 // instructions so that createNewWaitcnt() will create new combined
1431 // instructions to replace them.
1432
1433 if (Wait.DsCnt != ~0u) {
1434 // This is a vector of addresses in WaitInstrs pointing to instructions
1435 // that should be removed if they are present.
1436 SmallVector<MachineInstr **, 2> WaitsToErase;
1437
1438 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1439 // both) need to be waited for, ensure that there are no existing
1440 // individual wait count instructions for these.
1441
1442 if (Wait.LoadCnt != ~0u) {
1443 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1444 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1445 } else if (Wait.StoreCnt != ~0u) {
1446 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1447 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1448 }
1449
1450 for (MachineInstr **WI : WaitsToErase) {
1451 if (!*WI)
1452 continue;
1453
1454 (*WI)->eraseFromParent();
1455 *WI = nullptr;
1456 Modified = true;
1457 }
1458 }
1459
1460 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1461 if (!WaitInstrs[CT])
1462 continue;
1463
1464 unsigned NewCnt = getWait(Wait, CT);
1465 if (NewCnt != ~0u) {
1466 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1467 AMDGPU::OpName::simm16, NewCnt);
1468 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1469
1470 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1471 setNoWait(Wait, CT);
1472
1473 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1474 ? dbgs() << "applyPreexistingWaitcnt\n"
1475 << "New Instr at block end: " << *WaitInstrs[CT]
1476 << '\n'
1477 : dbgs() << "applyPreexistingWaitcnt\n"
1478 << "Old Instr: " << *It
1479 << "New Instr: " << *WaitInstrs[CT] << '\n');
1480 } else {
1481 WaitInstrs[CT]->eraseFromParent();
1482 Modified = true;
1483 }
1484 }
1485
1486 return Modified;
1487}
1488
1489/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1490bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1491 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1492 AMDGPU::Waitcnt Wait) {
1493 assert(ST);
1494 assert(!isNormalMode(MaxCounter));
1495
1496 bool Modified = false;
1497 const DebugLoc &DL = Block.findDebugLoc(It);
1498
1499 // Check for opportunities to use combined wait instructions.
1500 if (Wait.DsCnt != ~0u) {
1501 MachineInstr *SWaitInst = nullptr;
1502
1503 if (Wait.LoadCnt != ~0u) {
1504 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1505
1506 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1507 .addImm(Enc);
1508
1509 Wait.LoadCnt = ~0u;
1510 Wait.DsCnt = ~0u;
1511 } else if (Wait.StoreCnt != ~0u) {
1512 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1513
1514 SWaitInst =
1515 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1516 .addImm(Enc);
1517
1518 Wait.StoreCnt = ~0u;
1519 Wait.DsCnt = ~0u;
1520 }
1521
1522 if (SWaitInst) {
1523 Modified = true;
1524
1525 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1526 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1527 dbgs() << "New Instr: " << *SWaitInst << '\n');
1528 }
1529 }
1530
1531 // Generate an instruction for any remaining counter that needs
1532 // waiting for.
1533
1534 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1535 unsigned Count = getWait(Wait, CT);
1536 if (Count == ~0u)
1537 continue;
1538
1539 [[maybe_unused]] auto SWaitInst =
1540 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1541 .addImm(Count);
1542
1543 Modified = true;
1544
1545 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1546 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1547 dbgs() << "New Instr: " << *SWaitInst << '\n');
1548 }
1549
1550 return Modified;
1551}
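// Illustrative output of the gfx12+ generator above (not from the original
// source): a request of Wait = {LoadCnt=0, DsCnt=0, SampleCnt=2} first emits a
// combined wait for the load/DS pair and then a single-counter wait for the
// remainder, roughly:
//
//   s_wait_loadcnt_dscnt 0x0
//   s_wait_samplecnt 0x2
//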
1552
1553static bool readsVCCZ(const MachineInstr &MI) {
1554 unsigned Opc = MI.getOpcode();
1555 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1556 !MI.getOperand(1).isUndef();
1557}
1558
1559/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1560static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1561 // Currently all conventions wait, but this may not always be the case.
1562 //
1563 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1564 // sense to omit the wait and do it in the caller.
1565 return true;
1566}
1567
1568/// \returns true if the callee is expected to wait for any outstanding waits
1569/// before returning.
1570static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
1571 return true;
1572}
1573
1574/// Generate s_waitcnt instruction to be placed before cur_Inst.
1575/// Instructions of a given type are returned in order,
1576/// but instructions of different types can complete out of order.
1577/// We rely on this in-order completion
1578/// and simply assign a score to the memory access instructions.
1579/// We keep track of the active "score bracket" to determine
1580/// if an access of a memory read requires an s_waitcnt
1581/// and if so what the value of each counter is.
1582/// The "score bracket" is bound by the lower bound and upper bound
1583/// scores (*_score_LB and *_score_ub respectively).
1584/// If FlushVmCnt is true, that means that we want to generate an s_waitcnt to
1585/// flush the vmcnt counter here.
1586bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1587 WaitcntBrackets &ScoreBrackets,
1588 MachineInstr *OldWaitcntInstr,
1589 bool FlushVmCnt) {
1590 setForceEmitWaitcnt();
1591
1592 if (MI.isMetaInstruction())
1593 return false;
1594
1595 AMDGPU::Waitcnt Wait;
1596
1597 // FIXME: This should have already been handled by the memory legalizer.
1598 // Removing this currently doesn't affect any lit tests, but we need to
1599 // verify that nothing was relying on this. The number of buffer invalidates
1600 // being handled here should not be expanded.
1601 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1602 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1603 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1604 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1605 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1606 Wait.LoadCnt = 0;
1607 }
1608
1609 // All waits must be resolved at call return.
1610 // NOTE: this could be improved with knowledge of all call sites or
1611 // with knowledge of the called routines.
1612 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1613 MI.getOpcode() == AMDGPU::SI_RETURN ||
1614 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1615 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1616 Wait = Wait.combined(
1617 AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
1618 }
1619 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1620 // stores. In this case it can be useful to send a message to explicitly
1621 // release all VGPRs before the stores have completed, but it is only safe to
1622 // do this if:
1623 // * there are no outstanding scratch stores
1624 // * we are not in Dynamic VGPR mode
1625 else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1626 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1627 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
1628 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1629 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
1630 ReleaseVGPRInsts.insert(&MI);
1631 }
1632 // Resolve vm waits before gs-done.
1633 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1634 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1635 ST->hasLegacyGeometry() &&
1636 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1637 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1638 Wait.LoadCnt = 0;
1639 }
1640#if 0 // TODO: the following blocks of logic when we have fence.
1641 else if (MI.getOpcode() == SC_FENCE) {
1642 const unsigned int group_size =
1643 context->shader_info->GetMaxThreadGroupSize();
1644 // group_size == 0 means thread group size is unknown at compile time
1645 const bool group_is_multi_wave =
1646 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
1647 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
1648
1649 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
1650 SCRegType src_type = Inst->GetSrcType(i);
1651 switch (src_type) {
1652 case SCMEM_LDS:
1653 if (group_is_multi_wave ||
1654 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
1655 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
1656 ScoreBrackets->getScoreUB(DS_CNT));
1657 // LDS may have to wait for VMcnt after buffer load to LDS
1658 if (target_info->HasBufferLoadToLDS()) {
1659 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
1660 ScoreBrackets->getScoreUB(LOAD_CNT));
1661 }
1662 }
1663 break;
1664
1665 case SCMEM_GDS:
1666 if (group_is_multi_wave || fence_is_global) {
1667 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1668 ScoreBrackets->getScoreUB(EXP_CNT));
1669 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
1670 ScoreBrackets->getScoreUB(DS_CNT));
1671 }
1672 break;
1673
1674 case SCMEM_UAV:
1675 case SCMEM_TFBUF:
1676 case SCMEM_RING:
1677 case SCMEM_SCATTER:
1678 if (group_is_multi_wave || fence_is_global) {
1679 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1680 ScoreBrackets->getScoreUB(EXP_CNT));
1681 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
1682 ScoreBrackets->getScoreUB(LOAD_CNT));
1683 }
1684 break;
1685
1686 case SCMEM_SCRATCH:
1687 default:
1688 break;
1689 }
1690 }
1691 }
1692#endif
1693
1694 // Export & GDS instructions do not read the EXEC mask until after the export
1695 // is granted (which can occur well after the instruction is issued).
1696 // The shader program must flush all EXP operations on the export-count
1697 // before overwriting the EXEC mask.
1698 else {
1699 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1700 // Export and GDS are tracked individually, either may trigger a waitcnt
1701 // for EXEC.
1702 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1703 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1704 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1705 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1706 Wait.ExpCnt = 0;
1707 }
1708 }
1709
1710 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1711 // The function is going to insert a wait on everything in its prolog.
1712 // This still needs to be careful if the call target is a load (e.g. a GOT
1713 // load). We also need to check WAW dependency with saved PC.
1714 Wait = AMDGPU::Waitcnt();
1715
1716 int CallAddrOpIdx =
1717 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1718
1719 if (MI.getOperand(CallAddrOpIdx).isReg()) {
1720 RegInterval CallAddrOpInterval =
1721 ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);
1722
1723 for (int RegNo = CallAddrOpInterval.first;
1724 RegNo < CallAddrOpInterval.second; ++RegNo)
1725 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1726
1727 int RtnAddrOpIdx =
1728 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1729 if (RtnAddrOpIdx != -1) {
1730 RegInterval RtnAddrOpInterval =
1731 ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);
1732
1733 for (int RegNo = RtnAddrOpInterval.first;
1734 RegNo < RtnAddrOpInterval.second; ++RegNo)
1735 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1736 }
1737 }
1738 } else {
1739 // FIXME: Should not be relying on memoperands.
1740 // Look at the source operands of every instruction to see if
1741 // any of them results from a previous memory operation that affects
1742 // its current usage. If so, an s_waitcnt instruction needs to be
1743 // emitted.
1744 // If the source operand was defined by a load, add the s_waitcnt
1745 // instruction.
1746 //
1747 // Two cases are handled for destination operands:
1748 // 1) If the destination operand was defined by a load, add the s_waitcnt
1749 // instruction to guarantee the right WAW order.
1750 // 2) If a destination operand was used by a recent export/store instruction,
1751 // add s_waitcnt on exp_cnt to guarantee the WAR order.
1752
1753 for (const MachineMemOperand *Memop : MI.memoperands()) {
1754 const Value *Ptr = Memop->getValue();
1755 if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1756 addWait(Wait, SmemAccessCounter, 0);
1757 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1758 SLoadAddresses.erase(Ptr);
1759 }
1760 unsigned AS = Memop->getAddrSpace();
1761 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1762 continue;
1763 // No need to wait before load from VMEM to LDS.
1764 if (TII->mayWriteLDSThroughDMA(MI))
1765 continue;
1766
1767 // LOAD_CNT is only relevant to vgpr or LDS.
1768 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1769 bool FoundAliasingStore = false;
1770 // Only objects with alias scope info were added to LDSDMAScopes array.
1771 // In the absence of the scope info we will not be able to disambiguate
1772 // aliasing here. There is no need to try searching for a corresponding
1773 // store slot. This is conservatively correct because in that case we
1774 // will produce a wait using the first (general) LDS DMA wait slot which
1775 // will wait on all of them anyway.
1776 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1777 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1778 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1779 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
1780 FoundAliasingStore = true;
1781 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1782 }
1783 }
1784 }
1785 if (!FoundAliasingStore)
1786 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1787 if (Memop->isStore()) {
1788 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1789 }
1790 }
1791
1792 // Loop over use and def operands.
1793 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1794 MachineOperand &Op = MI.getOperand(I);
1795 if (!Op.isReg())
1796 continue;
1797
1798 // If the instruction does not read tied source, skip the operand.
1799 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1800 continue;
1801
1802 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);
1803
1804 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1805 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1806 if (IsVGPR) {
1807 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1808 // previous write and this write are the same type of VMEM
1809 // instruction, in which case they're guaranteed to write their
1810 // results in order anyway.
1811 if (Op.isUse() || !updateVMCntOnly(MI) ||
1812 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1813 getVmemType(MI))) {
1814 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1815 ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
1816 ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
1817 ScoreBrackets.clearVgprVmemTypes(RegNo);
1818 }
1819 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1820 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1821 }
1822 ScoreBrackets.determineWait(DS_CNT, RegNo, Wait);
1823 } else {
1824 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1825 }
1826 }
1827 }
1828 }
1829 }
1830
1831 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1832 // not, we need to ensure the subtarget is capable of backing off barrier
1833 // instructions in case there are any outstanding memory operations that may
1834 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1835 if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1836 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1837 Wait = Wait.combined(
1838 AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt()));
1839 }
1840
1841 // TODO: Remove this work-around, enable the assert for Bug 457939
1842 // after fixing the scheduler. Also, the Shader Compiler code is
1843 // independent of target.
1844 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1845 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1846 Wait.DsCnt = 0;
1847 }
1848 }
1849
1850 // Verify that the wait is actually needed.
1851 ScoreBrackets.simplifyWaitcnt(Wait);
1852
1853 if (ForceEmitZeroWaitcnts)
1854 Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts());
1855
1856 if (ForceEmitWaitcnt[LOAD_CNT])
1857 Wait.LoadCnt = 0;
1858 if (ForceEmitWaitcnt[EXP_CNT])
1859 Wait.ExpCnt = 0;
1860 if (ForceEmitWaitcnt[DS_CNT])
1861 Wait.DsCnt = 0;
1862 if (ForceEmitWaitcnt[SAMPLE_CNT])
1863 Wait.SampleCnt = 0;
1864 if (ForceEmitWaitcnt[BVH_CNT])
1865 Wait.BvhCnt = 0;
1866 if (ForceEmitWaitcnt[KM_CNT])
1867 Wait.KmCnt = 0;
1868
1869 if (FlushVmCnt) {
1870 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
1871 Wait.LoadCnt = 0;
1872 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
1873 Wait.SampleCnt = 0;
1874 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
1875 Wait.BvhCnt = 0;
1876 }
1877
1878 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1879 OldWaitcntInstr);
1880}
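The "score bracket" description in the comment before generateWaitcntInstBefore is abstract; the standalone sketch below shows how one counter's lower/upper bound pair and a register's recorded score turn into a wait count. This is an illustration only, with hypothetical names (requiredWait, scoreLB, scoreUB, regScore, hwMax); the pass's real WaitcntBrackets::determineWait also handles cases (such as pending FLAT operations) where a conservative zero wait is required.

#include <algorithm>
#include <optional>

// Hypothetical standalone model of one counter's score bracket.
std::optional<unsigned> requiredWait(unsigned scoreLB, unsigned scoreUB,
                                     unsigned regScore, unsigned hwMax) {
  // Scores at or below the lower bound were already waited for; scores above
  // the upper bound cannot occur.
  if (regScore <= scoreLB || regScore > scoreUB)
    return std::nullopt;
  // Up to (scoreUB - regScore) younger events of this type may stay in
  // flight; clamp so the immediate fits in the hardware counter field.
  return std::min(scoreUB - regScore, hwMax - 1u);
}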
1881
1882// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
1883// end of the given block if needed.
1884bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
1885 WaitcntBrackets &ScoreBrackets,
1886 MachineInstr *OldWaitcntInstr) {
1887 AMDGPU::Waitcnt Wait;
1888
1889 unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
1890 unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
1891 unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
1892
1893 if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
1894 return false;
1895
1896 if (LoadCntPending != 0)
1897 Wait.LoadCnt = 0;
1898 if (SampleCntPending != 0)
1899 Wait.SampleCnt = 0;
1900 if (BvhCntPending != 0)
1901 Wait.BvhCnt = 0;
1902
1903 return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
1904 OldWaitcntInstr);
1905}
1906
1907bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1908 MachineBasicBlock::instr_iterator It,
1909 MachineBasicBlock &Block,
1910 WaitcntBrackets &ScoreBrackets,
1911 MachineInstr *OldWaitcntInstr) {
1912 bool Modified = false;
1913
1914 if (OldWaitcntInstr)
1915 // Try to merge the required wait with preexisting waitcnt instructions.
1916 // Also erase redundant waitcnt.
1917 Modified =
1918 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1919
1920 // Any counts that could have been applied to any existing waitcnt
1921 // instructions will have been done so, now deal with any remaining.
1922 ScoreBrackets.applyWaitcnt(Wait);
1923
1924 // ExpCnt can be merged into VINTERP.
1925 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1927 MachineOperand *WaitExp =
1928 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1929 if (Wait.ExpCnt < WaitExp->getImm()) {
1930 WaitExp->setImm(Wait.ExpCnt);
1931 Modified = true;
1932 }
1933 Wait.ExpCnt = ~0u;
1934
1935 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
1936 << "Update Instr: " << *It);
1937 }
1938
1939 if (WCG->createNewWaitcnt(Block, It, Wait))
1940 Modified = true;
1941
1942 return Modified;
1943}
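A side note on the VINTERP case above: the required expcnt is folded into the instruction's existing waitexp immediate rather than emitted as a separate wait, and the immediate may only be tightened. A minimal sketch of that rule with a hypothetical helper name (the real code edits the operand in place via setImm):

#include <algorithm>

// ~0u plays the same role as above: "no expcnt wait required".
unsigned foldWaitExp(unsigned RequiredExpCnt, unsigned CurrentWaitExp) {
  if (RequiredExpCnt == ~0u)
    return CurrentWaitExp;                          // nothing to fold
  return std::min(RequiredExpCnt, CurrentWaitExp);  // only ever tighten
}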
1944
1945// This is a flat memory operation. Check to see if it has memory tokens other
1946// than LDS. Other address spaces supported by flat memory operations involve
1947// global memory.
1948bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1949 assert(TII->isFLAT(MI));
1950
1951 // All flat instructions use the VMEM counter.
1952 assert(TII->usesVM_CNT(MI));
1953
1954 // If there are no memory operands then conservatively assume the flat
1955 // operation may access VMEM.
1956 if (MI.memoperands_empty())
1957 return true;
1958
1959 // See if any memory operand specifies an address space that involves VMEM.
1960 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
1961 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1962 // (GDS) address space is not supported by flat operations. Therefore, simply
1963 // return true unless only the LDS address space is found.
1964 for (const MachineMemOperand *Memop : MI.memoperands()) {
1965 unsigned AS = Memop->getAddrSpace();
1966 assert(AS != AMDGPUAS::REGION_ADDRESS);
1967 if (AS != AMDGPUAS::LOCAL_ADDRESS)
1968 return true;
1969 }
1970
1971 return false;
1972}
1973
1974// This is a flat memory operation. Check to see if it has memory tokens for
1975// either LDS or FLAT.
1976bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1977 assert(TII->isFLAT(MI));
1978
1979 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
1980 if (!TII->usesLGKM_CNT(MI))
1981 return false;
1982
1983 // If in tgsplit mode then there can be no use of LDS.
1984 if (ST->isTgSplitEnabled())
1985 return false;
1986
1987 // If there are no memory operands then conservatively assume the flat
1988 // operation may access LDS.
1989 if (MI.memoperands_empty())
1990 return true;
1991
1992 // See if any memory operand specifies an address space that involves LDS.
1993 for (const MachineMemOperand *Memop : MI.memoperands()) {
1994 unsigned AS = Memop->getAddrSpace();
1995 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1996 return true;
1997 }
1998
1999 return false;
2000}
2001
2002// This is a flat memory operation. Check to see if it has memory tokens for
2003// either scratch or FLAT.
2004bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
2005 const MachineInstr &MI) const {
2006 assert(TII->isFLAT(MI));
2007
2008 // SCRATCH instructions always access scratch.
2009 if (TII->isFLATScratch(MI))
2010 return true;
2011
2012 // GLOBAL instructions never access scratch.
2013 if (TII->isFLATGlobal(MI))
2014 return false;
2015
2016 // If there are no memory operands then conservatively assume the flat
2017 // operation may access scratch.
2018 if (MI.memoperands_empty())
2019 return true;
2020
2021 // See if any memory operand specifies an address space that involves scratch.
2022 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
2023 unsigned AS = Memop->getAddrSpace();
2024 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
2025 });
2026}
2027
2028static bool isCacheInvOrWBInst(MachineInstr &Inst) {
2029 auto Opc = Inst.getOpcode();
2030 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
2031 Opc == AMDGPU::GLOBAL_WBINV;
2032}
2033
2034void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2035 WaitcntBrackets *ScoreBrackets) {
2036 // Now look at the instruction opcode. If it is a memory access
2037 // instruction, update the upper-bound of the appropriate counter's
2038 // bracket and the destination operand scores.
2039 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2040
2041 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2042 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2043 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2044 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
2045 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
2046 } else {
2047 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2048 }
2049 } else if (TII->isFLAT(Inst)) {
2050 // TODO: Track this properly.
2051 if (isCacheInvOrWBInst(Inst))
2052 return;
2053
2054 assert(Inst.mayLoadOrStore());
2055
2056 int FlatASCount = 0;
2057
2058 if (mayAccessVMEMThroughFlat(Inst)) {
2059 ++FlatASCount;
2060 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2061 Inst);
2062 }
2063
2064 if (mayAccessLDSThroughFlat(Inst)) {
2065 ++FlatASCount;
2066 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2067 }
2068
2069 // A Flat memory operation must access at least one address space.
2070 assert(FlatASCount);
2071
2072 // This is a flat memory operation that accesses both VMEM and LDS, so note it
2073 // - it will require that both the VM and LGKM be flushed to zero if it is
2074 // pending when a VM or LGKM dependency occurs.
2075 if (FlatASCount > 1)
2076 ScoreBrackets->setPendingFlat();
2077 } else if (SIInstrInfo::isVMEM(Inst) &&
2078 !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2079 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2080 Inst);
2081
2082 if (ST->vmemWriteNeedsExpWaitcnt() &&
2083 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2084 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
2085 }
2086 } else if (TII->isSMRD(Inst)) {
2087 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2088 } else if (Inst.isCall()) {
2089 if (callWaitsOnFunctionReturn(Inst)) {
2090 // Act as a wait on everything
2091 ScoreBrackets->applyWaitcnt(
2092 AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
2093 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2094 } else {
2095 // May need to wait for anything.
2096 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2097 }
2098 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2099 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
2100 } else if (TII->isVINTERP(Inst)) {
2101 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2102 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2103 } else if (SIInstrInfo::isEXP(Inst)) {
2104 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2105 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2106 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
2107 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2108 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
2109 else
2110 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
2111 } else {
2112 switch (Inst.getOpcode()) {
2113 case AMDGPU::S_SENDMSG:
2114 case AMDGPU::S_SENDMSG_RTN_B32:
2115 case AMDGPU::S_SENDMSG_RTN_B64:
2116 case AMDGPU::S_SENDMSGHALT:
2117 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
2118 break;
2119 case AMDGPU::S_MEMTIME:
2120 case AMDGPU::S_MEMREALTIME:
2121 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2122 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2123 case AMDGPU::S_BARRIER_LEAVE:
2124 case AMDGPU::S_GET_BARRIER_STATE_M0:
2125 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2126 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2127 break;
2128 }
2129 }
2130}
2131
2132bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2133 unsigned OtherScore) {
2134 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2135 unsigned OtherShifted =
2136 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2137 Score = std::max(MyShifted, OtherShifted);
2138 return OtherShifted > MyShifted;
2139}
2140
2141/// Merge the pending events and associated score brackets of \p Other into
2142/// this bracket's status.
2143///
2144/// Returns whether the merge resulted in a change that requires tighter waits
2145/// (i.e. the merged brackets strictly dominate the original brackets).
2146bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2147 bool StrictDom = false;
2148
2149 VgprUB = std::max(VgprUB, Other.VgprUB);
2150 SgprUB = std::max(SgprUB, Other.SgprUB);
2151
2152 for (auto T : inst_counter_types(MaxCounter)) {
2153 // Merge event flags for this counter
2154 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2155 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2156 if (OtherEvents & ~OldEvents)
2157 StrictDom = true;
2158 PendingEvents |= OtherEvents;
2159
2160 // Merge scores for this counter
2161 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2162 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2163 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2164 if (NewUB < ScoreLBs[T])
2165 report_fatal_error("waitcnt score overflow");
2166
2167 MergeInfo M;
2168 M.OldLB = ScoreLBs[T];
2169 M.OtherLB = Other.ScoreLBs[T];
2170 M.MyShift = NewUB - ScoreUBs[T];
2171 M.OtherShift = NewUB - Other.ScoreUBs[T];
2172
2173 ScoreUBs[T] = NewUB;
2174
2175 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2176
2177 for (int J = 0; J <= VgprUB; J++)
2178 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
2179
2180 if (T == SmemAccessCounter) {
2181 for (int J = 0; J <= SgprUB; J++)
2182 StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
2183 }
2184 }
2185
2186 for (int J = 0; J <= VgprUB; J++) {
2187 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2188 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2189 VgprVmemTypes[J] = NewVmemTypes;
2190 }
2191
2192 return StrictDom;
2193}
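The MyShift/OtherShift arithmetic in mergeScore is easiest to follow with concrete numbers. The snippet below is an illustration only (the values are made up, and MergeInfoSketch/mergeOne are hypothetical stand-ins for MergeInfo/mergeScore): both brackets are rebased onto a common upper bound, scores at or below a bracket's old lower bound collapse to zero, and the merged score is the maximum of the two rebased scores.

#include <algorithm>
#include <cassert>

struct MergeInfoSketch {
  unsigned OldLB, OtherLB, MyShift, OtherShift;
};

unsigned mergeOne(const MergeInfoSketch &M, unsigned Mine, unsigned Other) {
  unsigned MyShifted = Mine <= M.OldLB ? 0 : Mine + M.MyShift;
  unsigned OtherShifted = Other <= M.OtherLB ? 0 : Other + M.OtherShift;
  return std::max(MyShifted, OtherShifted);
}

int main() {
  // This bracket: LB=2, UB=5 (3 pending). Other bracket: LB=0, UB=4 (4 pending).
  // NewUB = 2 + max(3, 4) = 6, so MyShift = 6 - 5 = 1 and OtherShift = 6 - 4 = 2.
  MergeInfoSketch M{/*OldLB=*/2, /*OtherLB=*/0, /*MyShift=*/1, /*OtherShift=*/2};
  assert(mergeOne(M, /*Mine=*/4, /*Other=*/3) == 5); // both rebase to score 5
  assert(mergeOne(M, /*Mine=*/2, /*Other=*/0) == 0); // both already waited for
  return 0;
}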
2194
2195static bool isWaitInstr(MachineInstr &Inst) {
2196 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2197 return Opcode == AMDGPU::S_WAITCNT ||
2198 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2199 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2200 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2201 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2202 counterTypeForInstr(Opcode).has_value();
2203}
2204
2205// Generate s_waitcnt instructions where needed.
2206bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2207 MachineBasicBlock &Block,
2208 WaitcntBrackets &ScoreBrackets) {
2209 bool Modified = false;
2210
2211 LLVM_DEBUG({
2212 dbgs() << "*** Block" << Block.getNumber() << " ***";
2213 ScoreBrackets.dump();
2214 });
2215
2216 // Track the correctness of vccz through this basic block. There are two
2217 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2218 // ST->partialVCCWritesUpdateVCCZ().
2219 bool VCCZCorrect = true;
2220 if (ST->hasReadVCCZBug()) {
2221 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2222 // to vcc and then issued an smem load.
2223 VCCZCorrect = false;
2224 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2225 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2226 // to vcc_lo or vcc_hi.
2227 VCCZCorrect = false;
2228 }
2229
2230 // Walk over the instructions.
2231 MachineInstr *OldWaitcntInstr = nullptr;
2232
2233 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2234 E = Block.instr_end();
2235 Iter != E;) {
2236 MachineInstr &Inst = *Iter;
2237
2238 // Track pre-existing waitcnts that were added in earlier iterations or by
2239 // the memory legalizer.
2240 if (isWaitInstr(Inst)) {
2241 if (!OldWaitcntInstr)
2242 OldWaitcntInstr = &Inst;
2243 ++Iter;
2244 continue;
2245 }
2246
2247 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2248 isPreheaderToFlush(Block, ScoreBrackets);
2249
2250 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2251 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2252 FlushVmCnt);
2253 OldWaitcntInstr = nullptr;
2254
2255 // Restore vccz if it's not known to be correct already.
2256 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
2257
2258 // Don't examine operands unless we need to track vccz correctness.
2259 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2260 if (Inst.definesRegister(AMDGPU::VCC_LO) ||
2261 Inst.definesRegister(AMDGPU::VCC_HI)) {
2262 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2263 if (!ST->partialVCCWritesUpdateVCCZ())
2264 VCCZCorrect = false;
2265 } else if (Inst.definesRegister(AMDGPU::VCC)) {
2266 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2267 // vccz bit, so when we detect that an instruction may read from a
2268 // corrupt vccz bit, we need to:
2269 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2270 // operations to complete.
2271 // 2. Restore the correct value of vccz by writing the current value
2272 // of vcc back to vcc.
2273 if (ST->hasReadVCCZBug() &&
2274 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2275 // Writes to vcc while there's an outstanding smem read may get
2276 // clobbered as soon as any read completes.
2277 VCCZCorrect = false;
2278 } else {
2279 // Writes to vcc will fix any incorrect value in vccz.
2280 VCCZCorrect = true;
2281 }
2282 }
2283 }
2284
2285 if (TII->isSMRD(Inst)) {
2286 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2287 // No need to handle invariant loads when avoiding WAR conflicts, as
2288 // there cannot be a vector store to the same memory location.
2289 if (!Memop->isInvariant()) {
2290 const Value *Ptr = Memop->getValue();
2291 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2292 }
2293 }
2294 if (ST->hasReadVCCZBug()) {
2295 // This smem read could complete and clobber vccz at any time.
2296 VCCZCorrect = false;
2297 }
2298 }
2299
2300 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2301
2302#if 0 // TODO: implement resource type check controlled by options with ub = LB.
2303 // If this instruction generates a S_SETVSKIP because it is an
2304 // indexed resource, and we are on Tahiti, then it will also force
2305 // an S_WAITCNT vmcnt(0)
2306 if (RequireCheckResourceType(Inst, context)) {
2307 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
2308 ScoreBrackets->setScoreLB(LOAD_CNT,
2309 ScoreBrackets->getScoreUB(LOAD_CNT));
2310 }
2311#endif
2312
2313 LLVM_DEBUG({
2314 Inst.print(dbgs());
2315 ScoreBrackets.dump();
2316 });
2317
2318 // TODO: Remove this work-around after fixing the scheduler and enable the
2319 // assert above.
2320 if (RestoreVCCZ) {
2321 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2322 // bit is updated, so we can restore the bit by reading the value of
2323 // vcc and then writing it back to the register.
2324 BuildMI(Block, Inst, Inst.getDebugLoc(),
2325 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2326 TRI->getVCC())
2327 .addReg(TRI->getVCC());
2328 VCCZCorrect = true;
2329 Modified = true;
2330 }
2331
2332 ++Iter;
2333 }
2334
2335 if (Block.getFirstTerminator() == Block.end() &&
2336 isPreheaderToFlush(Block, ScoreBrackets))
2337 Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
2338
2339 return Modified;
2340}
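The vccz bookkeeping spread through the loop above can be summarized as a small state machine. The struct below is an illustration only; its names are hypothetical, and in the pass the same state lives in the VCCZCorrect local plus the two subtarget queries.

struct VcczTracker {
  bool Correct = true;
  bool HasReadVCCZBug = false;          // SMEM completion may clobber vccz (SI/CI)
  bool PartialWritesUpdateVCCZ = false; // do vcc_lo/vcc_hi writes refresh vccz?

  void onWriteVccLoOrHi() {
    if (!PartialWritesUpdateVCCZ)
      Correct = false;          // partial writes leave vccz stale (up to gfx9)
  }
  void onWriteFullVcc(bool SmemStillPending) {
    // A full vcc write normally fixes vccz, unless an outstanding SMEM load
    // may still clobber it before it completes.
    Correct = !(HasReadVCCZBug && SmemStillPending);
  }
  void onSmemLoad() {
    if (HasReadVCCZBug)
      Correct = false;          // the load may complete and clobber vccz
  }
  // An instruction that reads vccz needs the wait plus the "mov vcc, vcc"
  // restore sequence only when vccz may be stale.
  bool needsRestore() const { return !Correct; }
};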
2341
2342// Return true if the given machine basic block is a preheader of a loop in
2343// which we want to flush the vmcnt counter, and false otherwise.
2344bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
2345 WaitcntBrackets &ScoreBrackets) {
2346 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2347 if (!IsInserted)
2348 return Iterator->second;
2349
2350 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2351 if (!Succ)
2352 return false;
2353
2354 MachineLoop *Loop = MLI->getLoopFor(Succ);
2355 if (!Loop)
2356 return false;
2357
2358 if (Loop->getLoopPreheader() == &MBB &&
2359 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2360 Iterator->second = true;
2361 return true;
2362 }
2363
2364 return false;
2365}
2366
2367bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2368 return SIInstrInfo::isVMEM(MI) ||
2369 (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
2370}
2371
2372// Return true if it is better to flush the vmcnt counter in the preheader of
2373// the given loop. We currently decide to flush in two situations:
2374// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2375// vgpr containing a value that is loaded outside of the loop. (Only on
2376// targets with no vscnt counter).
2377// 2. The loop contains vmem load(s), but the loaded values are not used in the
2378// loop, and at least one use of a vgpr containing a value that is loaded
2379// outside of the loop.
2380bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2381 WaitcntBrackets &Brackets) {
2382 bool HasVMemLoad = false;
2383 bool HasVMemStore = false;
2384 bool UsesVgprLoadedOutside = false;
2385 DenseSet<Register> VgprUse;
2386 DenseSet<Register> VgprDef;
2387
2388 for (MachineBasicBlock *MBB : ML->blocks()) {
2389 for (MachineInstr &MI : *MBB) {
2390 if (isVMEMOrFlatVMEM(MI)) {
2391 if (MI.mayLoad())
2392 HasVMemLoad = true;
2393 if (MI.mayStore())
2394 HasVMemStore = true;
2395 }
2396 for (unsigned I = 0; I < MI.getNumOperands(); I++) {
2397 MachineOperand &Op = MI.getOperand(I);
2398 if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
2399 continue;
2400 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
2401 // Vgpr use
2402 if (Op.isUse()) {
2403 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2404 // If we find a register that is loaded inside the loop, 1. and 2.
2405 // are invalidated and we can exit.
2406 if (VgprDef.contains(RegNo))
2407 return false;
2408 VgprUse.insert(RegNo);
2409 // If at least one of Op's registers is in the score brackets, the
2410 // value is likely loaded outside of the loop.
2411 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2412 Brackets.getScoreLB(LOAD_CNT) ||
2413 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2414 Brackets.getScoreLB(SAMPLE_CNT) ||
2415 Brackets.getRegScore(RegNo, BVH_CNT) >
2416 Brackets.getScoreLB(BVH_CNT)) {
2417 UsesVgprLoadedOutside = true;
2418 break;
2419 }
2420 }
2421 }
2422 // VMem load vgpr def
2423 else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
2424 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2425 // If we find a register that is loaded inside the loop, 1. and 2.
2426 // are invalidated and we can exit.
2427 if (VgprUse.contains(RegNo))
2428 return false;
2429 VgprDef.insert(RegNo);
2430 }
2431 }
2432 }
2433 }
2434 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2435 return true;
2436 return HasVMemLoad && UsesVgprLoadedOutside;
2437}
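Isolated from the bookkeeping, the final decision in shouldFlushVmCnt reduces to a predicate over the four facts gathered from the loop body. The sketch below is an illustration only, with hypothetical parameter names; the "loaded values are not used in the loop" part of case 2 is enforced earlier by the early returns when VgprUse and VgprDef overlap.

bool shouldFlushSketch(bool TargetHasVscnt, bool LoopHasVMemLoad,
                       bool LoopHasVMemStore, bool UsesVgprLoadedOutside) {
  // Case 1: store-only loops matter only when loads and stores share a single
  // counter, i.e. on targets without a separate vscnt counter.
  if (!TargetHasVscnt && LoopHasVMemStore && !LoopHasVMemLoad &&
      UsesVgprLoadedOutside)
    return true;
  // Case 2: the loop issues VMEM loads while also reading values that were
  // loaded before entering the loop.
  return LoopHasVMemLoad && UsesVgprLoadedOutside;
}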
2438
2439bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
2440 ST = &MF.getSubtarget<GCNSubtarget>();
2441 TII = ST->getInstrInfo();
2442 TRI = &TII->getRegisterInfo();
2443 MRI = &MF.getRegInfo();
2444 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2445 MLI = &getAnalysis<MachineLoopInfo>();
2446 PDT = &getAnalysis<MachinePostDominatorTree>();
2447 if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2448 AA = &AAR->getAAResults();
2449
2450 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
2451
2452 if (ST->hasExtendedWaitCounts()) {
2453 MaxCounter = NUM_EXTENDED_INST_CNTS;
2454 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
2455 WCG = &WCGGFX12Plus;
2456 } else {
2457 MaxCounter = NUM_NORMAL_INST_CNTS;
2458 WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
2459 WCG = &WCGPreGFX12;
2460 }
2461
2462 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
2463 for (auto T : inst_counter_types())
2464 ForceEmitWaitcnt[T] = false;
2465
2466 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2467
2468 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2469
2470 OptNone = MF.getFunction().hasOptNone() ||
2471 MF.getTarget().getOptLevel() == CodeGenOptLevel::None;
2472
2473 HardwareLimits Limits = {};
2474 if (ST->hasExtendedWaitCounts()) {
2475 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2476 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2477 } else {
2478 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2479 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2480 }
2481 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2482 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2483 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2484 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2485 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2486
2487 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
2488 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2489 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2490 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2491
2492 RegisterEncoding Encoding = {};
2493 Encoding.VGPR0 =
2494 TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2495 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
2496 Encoding.SGPR0 =
2497 TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2498 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
2499
2500 BlockInfos.clear();
2501 bool Modified = false;
2502
2503 MachineBasicBlock &EntryBB = MF.front();
2504 MachineBasicBlock::iterator I = EntryBB.begin();
2505
2506 if (!MFI->isEntryFunction()) {
2507 // Wait for any outstanding memory operations that the input registers may
2508 // depend on. We can't track them and it's better to do the wait after the
2509 // costly call sequence.
2510
2511 // TODO: Could insert earlier and schedule more liberally with operations
2512 // that only use caller preserved registers.
2513 for (MachineBasicBlock::iterator E = EntryBB.end();
2514 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2515 ;
2516
2517 if (ST->hasExtendedWaitCounts()) {
2518 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2519 .addImm(0);
2520 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2521 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
2522 continue;
2523
2524 BuildMI(EntryBB, I, DebugLoc(),
2525 TII->get(instrsForExtendedCounterTypes[CT]))
2526 .addImm(0);
2527 }
2528 } else {
2529 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2530 }
2531
2532 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2533 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2534 SmemAccessCounter);
2535 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2536 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2537
2538 Modified = true;
2539 }
2540
2541 // Keep iterating over the blocks in reverse post order, inserting and
2542 // updating s_waitcnt where needed, until a fix point is reached.
2543 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2544 BlockInfos.insert({MBB, BlockInfo()});
2545
2546 std::unique_ptr<WaitcntBrackets> Brackets;
2547 bool Repeat;
2548 do {
2549 Repeat = false;
2550
2551 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2552 ++BII) {
2553 MachineBasicBlock *MBB = BII->first;
2554 BlockInfo &BI = BII->second;
2555 if (!BI.Dirty)
2556 continue;
2557
2558 if (BI.Incoming) {
2559 if (!Brackets)
2560 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2561 else
2562 *Brackets = *BI.Incoming;
2563 } else {
2564 if (!Brackets)
2565 Brackets = std::make_unique<WaitcntBrackets>(
2566 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2567 SmemAccessCounter);
2568 else
2569 *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
2570 WaitEventMaskForInst, SmemAccessCounter);
2571 }
2572
2573 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2574 BI.Dirty = false;
2575
2576 if (Brackets->hasPendingEvent()) {
2577 BlockInfo *MoveBracketsToSucc = nullptr;
2578 for (MachineBasicBlock *Succ : MBB->successors()) {
2579 auto SuccBII = BlockInfos.find(Succ);
2580 BlockInfo &SuccBI = SuccBII->second;
2581 if (!SuccBI.Incoming) {
2582 SuccBI.Dirty = true;
2583 if (SuccBII <= BII)
2584 Repeat = true;
2585 if (!MoveBracketsToSucc) {
2586 MoveBracketsToSucc = &SuccBI;
2587 } else {
2588 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2589 }
2590 } else if (SuccBI.Incoming->merge(*Brackets)) {
2591 SuccBI.Dirty = true;
2592 if (SuccBII <= BII)
2593 Repeat = true;
2594 }
2595 }
2596 if (MoveBracketsToSucc)
2597 MoveBracketsToSucc->Incoming = std::move(Brackets);
2598 }
2599 }
2600 } while (Repeat);
2601
2602 if (ST->hasScalarStores()) {
2603 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2604 bool HaveScalarStores = false;
2605
2606 for (MachineBasicBlock &MBB : MF) {
2607 for (MachineInstr &MI : MBB) {
2608 if (!HaveScalarStores && TII->isScalarStore(MI))
2609 HaveScalarStores = true;
2610
2611 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2612 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2613 EndPgmBlocks.push_back(&MBB);
2614 }
2615 }
2616
2617 if (HaveScalarStores) {
2618 // If scalar writes are used, the cache must be flushed or else the next
2619 // wave to reuse the same scratch memory can be clobbered.
2620 //
2621 // Insert s_dcache_wb at wave termination points if there were any scalar
2622 // stores, and only if the cache hasn't already been flushed. This could
2623 // be improved by looking across blocks for flushes in postdominating
2624 // blocks from the stores but an explicitly requested flush is probably
2625 // very rare.
2626 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2627 bool SeenDCacheWB = false;
2628
2629 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2630 I != E; ++I) {
2631 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2632 SeenDCacheWB = true;
2633 else if (TII->isScalarStore(*I))
2634 SeenDCacheWB = false;
2635
2636 // FIXME: It would be better to insert this before a waitcnt if any.
2637 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2638 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2639 !SeenDCacheWB) {
2640 Modified = true;
2641 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2642 }
2643 }
2644 }
2645 }
2646 }
2647
2648 // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2649 // instructions.
2650 for (MachineInstr *MI : ReleaseVGPRInsts) {
2651 if (ST->requiresNopBeforeDeallocVGPRs()) {
2652 BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP))
2653 .addImm(0);
2654 }
2655 BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG))
2656 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2657 Modified = true;
2658 }
2659 ReleaseVGPRInsts.clear();
2660
2661 return Modified;
2662}
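For readers less familiar with dataflow passes, the reverse-post-order loop above is an instance of the standard forward fixed-point pattern: process dirty blocks in RPO, propagate each block's outgoing state into its successors, and repeat whenever a back edge makes an already-visited block dirty again. The skeleton below is an illustration only; Block, State, merge() and the dirty flags are simplified stand-ins for BlockInfo, WaitcntBrackets and the logic above, and the per-block transfer function is elided.

#include <cstddef>
#include <vector>

struct State {
  unsigned Pending = 0;
  // Returns true if merging Other grew this state, i.e. the block that owns
  // it must be revisited.
  bool merge(const State &Other) {
    unsigned Old = Pending;
    Pending |= Other.Pending;
    return Pending != Old;
  }
};

struct Block {
  std::vector<std::size_t> Succs; // successor indices in RPO order
};

void runToFixedPoint(const std::vector<Block> &Blocks, std::vector<State> &In,
                     std::vector<bool> &Dirty) {
  bool Repeat;
  do {
    Repeat = false;
    for (std::size_t I = 0; I < Blocks.size(); ++I) {
      if (!Dirty[I])
        continue;
      Dirty[I] = false;
      State Out = In[I]; // a real pass would apply the block's transfer here
      for (std::size_t S : Blocks[I].Succs) {
        if (In[S].merge(Out)) {
          Dirty[S] = true;
          if (S <= I) // back edge: another sweep over the blocks is needed
            Repeat = true;
        }
      }
    }
  } while (Repeat);
}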