SIInsertWaitcnts.cpp
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
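// Editorial illustration (not part of the upstream source): the pass turns
// asynchronous result availability into explicit waits. With hypothetical
// registers and instruction selection:
//
//   buffer_load_dword v0, ...   ; issued asynchronously, vmcnt becomes 1
//   buffer_load_dword v1, ...   ; vmcnt becomes 2
//   s_waitcnt vmcnt(1)          ; wait until at most one load is outstanding;
//                               ; same-type loads complete in order, so v0 is ready
//   v_add_f32 v2, v0, v0        ; now safe to read v0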
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
40using namespace llvm;
41
42#define DEBUG_TYPE "si-insert-waitcnts"
43
44DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
45 "Force emit s_waitcnt expcnt(0) instrs");
46DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
47 "Force emit s_waitcnt lgkmcnt(0) instrs");
48DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
49 "Force emit s_waitcnt vmcnt(0) instrs");
50
52 "amdgpu-waitcnt-forcezero",
53 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
54 cl::init(false), cl::Hidden);
55
56namespace {
57// Class of object that encapsulates the latest instruction counter score
58// associated with the operand. Used for determining whether an s_waitcnt
59// instruction needs to be emitted.
60
61enum InstCounterType {
62 LOAD_CNT = 0, // VMcnt prior to gfx12.
63 DS_CNT, // LKGMcnt prior to gfx12.
64 EXP_CNT, //
65 STORE_CNT, // VScnt in gfx10/gfx11.
66 NUM_NORMAL_INST_CNTS,
67 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
68 BVH_CNT, // gfx12+ only.
69 KM_CNT, // gfx12+ only.
70 NUM_EXTENDED_INST_CNTS,
71 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
72};
73} // namespace
74
75namespace llvm {
76template <> struct enum_iteration_traits<InstCounterType> {
77 static constexpr bool is_iterable = true;
78};
79} // namespace llvm
80
81namespace {
82// Return an iterator over all counters between LOAD_CNT (the first counter)
83// and \c MaxCounter (exclusive, default value yields an enumeration over
84// all counters).
85auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
86 return enum_seq(LOAD_CNT, MaxCounter);
87}
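// Example usage (editorial note): callers that only care about the pre-gfx12
// counters pass an explicit bound, e.g.
//
//   for (InstCounterType T : inst_counter_types(NUM_NORMAL_INST_CNTS)) {
//     // visits LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT
//   }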
88
89using RegInterval = std::pair<int, int>;
90
91struct HardwareLimits {
92 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
93 unsigned ExpcntMax;
94 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
95 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
96 unsigned SamplecntMax; // gfx12+ only.
97 unsigned BvhcntMax; // gfx12+ only.
98 unsigned KmcntMax; // gfx12+ only.
99};
100
101struct RegisterEncoding {
102 unsigned VGPR0;
103 unsigned VGPRL;
104 unsigned SGPR0;
105 unsigned SGPRL;
106};
107
108enum WaitEventType {
109 VMEM_ACCESS, // vector-memory read & write
110 VMEM_READ_ACCESS, // vector-memory read
111 VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
112 VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
113 VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
114 SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
115 LDS_ACCESS, // lds read & write
116 GDS_ACCESS, // gds read & write
117 SQ_MESSAGE, // send message
118 SMEM_ACCESS, // scalar-memory read & write
119 EXP_GPR_LOCK, // export holding on its data src
120 GDS_GPR_LOCK, // GDS holding on its data and addr src
121 EXP_POS_ACCESS, // write to export position
122 EXP_PARAM_ACCESS, // write to export parameter
123 VMW_GPR_LOCK, // vector-memory write holding on its data src
124 EXP_LDS_ACCESS, // read by ldsdir counting as export
125 NUM_WAIT_EVENTS,
126};
127
128// The mapping is:
129// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
130// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
131// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
132// We reserve a fixed number of VGPR slots in the scoring tables for
133// special tokens like SCMEM_LDS (needed for buffer load to LDS).
134enum RegisterMapping {
135 SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
136 AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
137 SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
138 NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
139 // Artificial register slots to track LDS writes into specific LDS locations
140 // if a location is known. When slots are exhausted or location is
141 // unknown use the first slot. The first slot is also always updated in
142 // addition to known location's slot to properly generate waits if dependent
143 // instruction's location is unknown.
144 EXTRA_VGPR_LDS = 0,
145 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
146};
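// Worked example of the mapping above (editorial note): with
// SQ_MAX_PGM_VGPRS = 512, AGPR_OFFSET = 256 and NUM_EXTRA_VGPRS = 9,
//   ArchVGPR v3    -> slot 3
//   AGPR a5        -> slot 256 + 5 = 261
//   LDS DMA slot k -> slot 512 + EXTRA_VGPR_LDS + k (k == 0 if unknown)
//   SGPR s10       -> slot NUM_ALL_VGPRS + 10 = 521 + 10 = 531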
147
148// Enumerate different types of result-returning VMEM operations. Although
149// s_waitcnt orders them all with a single vmcnt counter, in the absence of
150// s_waitcnt only instructions of the same VmemType are guaranteed to write
151// their results in order -- so there is no need to insert an s_waitcnt between
152// two instructions of the same type that write the same vgpr.
153enum VmemType {
154 // BUF instructions and MIMG instructions without a sampler.
155 VMEM_NOSAMPLER,
156 // MIMG instructions with a sampler.
157 VMEM_SAMPLER,
158 // BVH instructions
159 VMEM_BVH,
160 NUM_VMEM_TYPES
161};
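// Example of the in-order guarantee (editorial note): two VMEM_NOSAMPLER loads
// writing the same vgpr need no s_waitcnt between them because they return
// results in order, whereas a VMEM_SAMPLER load followed by a VMEM_NOSAMPLER
// load of the same vgpr may complete out of order and therefore needs a vmcnt
// wait; see hasOtherPendingVmemTypes() below.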
162
163// Maps values of InstCounterType to the instruction that waits on that
164// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
165// returns true.
166static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
167 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
168 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
169 AMDGPU::S_WAIT_KMCNT};
170
171static bool updateVMCntOnly(const MachineInstr &Inst) {
172 return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
173 SIInstrInfo::isFLATScratch(Inst);
174}
175
176#ifndef NDEBUG
177static bool isNormalMode(InstCounterType MaxCounter) {
178 return MaxCounter == NUM_NORMAL_INST_CNTS;
179}
180#endif // NDEBUG
181
182VmemType getVmemType(const MachineInstr &Inst) {
183 assert(updateVMCntOnly(Inst));
184 if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
185 !SIInstrInfo::isVSAMPLE(Inst))
186 return VMEM_NOSAMPLER;
187 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
188 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
189 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
190 // We have to make an additional check for isVSAMPLE here since some
191 // instructions don't have a sampler, but are still classified as sampler
192 // instructions for the purposes of e.g. waitcnt.
193 return BaseInfo->BVH ? VMEM_BVH
194 : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER
195 : VMEM_NOSAMPLER;
196}
197
198unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
199 switch (T) {
200 case LOAD_CNT:
201 return Wait.LoadCnt;
202 case EXP_CNT:
203 return Wait.ExpCnt;
204 case DS_CNT:
205 return Wait.DsCnt;
206 case STORE_CNT:
207 return Wait.StoreCnt;
208 case SAMPLE_CNT:
209 return Wait.SampleCnt;
210 case BVH_CNT:
211 return Wait.BvhCnt;
212 case KM_CNT:
213 return Wait.KmCnt;
214 default:
215 llvm_unreachable("bad InstCounterType");
216 }
217}
218
219void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
220 unsigned &WC = getCounterRef(Wait, T);
221 WC = std::min(WC, Count);
222}
223
224void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
225 getCounterRef(Wait, T) = ~0u;
226}
227
228unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
229 return getCounterRef(Wait, T);
230}
231
232// Mapping from event to counter according to the table masks.
233InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
234 for (auto T : inst_counter_types()) {
235 if (masks[T] & (1 << E))
236 return T;
237 }
238 llvm_unreachable("event type has no associated counter");
239}
240
241// This object maintains the current score brackets of each wait counter, and
242// a per-register scoreboard for each wait counter.
243//
244// We also maintain the latest score for every event type that can change the
245// waitcnt in order to know if there are multiple types of events within
246// the brackets. When multiple event types occur within a bracket, the wait
247// count may be decremented out of order, therefore we need to insert an
248// "s_waitcnt 0" before use.
249class WaitcntBrackets {
250public:
251 WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
252 HardwareLimits Limits, RegisterEncoding Encoding,
253 const unsigned *WaitEventMaskForInst,
254 InstCounterType SmemAccessCounter)
255 : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
256 Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
257 SmemAccessCounter(SmemAccessCounter) {}
258
259 unsigned getWaitCountMax(InstCounterType T) const {
260 switch (T) {
261 case LOAD_CNT:
262 return Limits.LoadcntMax;
263 case DS_CNT:
264 return Limits.DscntMax;
265 case EXP_CNT:
266 return Limits.ExpcntMax;
267 case STORE_CNT:
268 return Limits.StorecntMax;
269 case SAMPLE_CNT:
270 return Limits.SamplecntMax;
271 case BVH_CNT:
272 return Limits.BvhcntMax;
273 case KM_CNT:
274 return Limits.KmcntMax;
275 default:
276 break;
277 }
278 return 0;
279 }
280
281 unsigned getScoreLB(InstCounterType T) const {
282 assert(T < NUM_INST_CNTS);
283 return ScoreLBs[T];
284 }
285
286 unsigned getScoreUB(InstCounterType T) const {
287 assert(T < NUM_INST_CNTS);
288 return ScoreUBs[T];
289 }
290
291 unsigned getScoreRange(InstCounterType T) const {
292 return getScoreUB(T) - getScoreLB(T);
293 }
294
295 unsigned getRegScore(int GprNo, InstCounterType T) const {
296 if (GprNo < NUM_ALL_VGPRS) {
297 return VgprScores[T][GprNo];
298 }
299 assert(T == SmemAccessCounter);
300 return SgprScores[GprNo - NUM_ALL_VGPRS];
301 }
302
303 bool merge(const WaitcntBrackets &Other);
304
305 RegInterval getRegInterval(const MachineInstr *MI,
306 const MachineRegisterInfo *MRI,
307 const SIRegisterInfo *TRI, unsigned OpNo) const;
308
309 bool counterOutOfOrder(InstCounterType T) const;
310 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
311 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
312 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
313 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
314 void applyWaitcnt(InstCounterType T, unsigned Count);
315 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
316 const MachineRegisterInfo *MRI, WaitEventType E,
317 MachineInstr &MI);
318
319 unsigned hasPendingEvent() const { return PendingEvents; }
320 unsigned hasPendingEvent(WaitEventType E) const {
321 return PendingEvents & (1 << E);
322 }
323 unsigned hasPendingEvent(InstCounterType T) const {
324 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
325 assert((HasPending != 0) == (getScoreRange(T) != 0));
326 return HasPending;
327 }
328
329 bool hasMixedPendingEvents(InstCounterType T) const {
330 unsigned Events = hasPendingEvent(T);
331 // Return true if more than one bit is set in Events.
332 return Events & (Events - 1);
333 }
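  // Worked example for the bit trick above (editorial note): if Events ==
  // 0b0110 (two event types pending), Events & (Events - 1) == 0b0100 != 0;
  // if Events == 0b0100 (a single type), the result is 0.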
334
335 bool hasPendingFlat() const {
336 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
337 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
338 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
339 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
340 }
341
342 void setPendingFlat() {
343 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
344 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
345 }
346
347 // Return true if there might be pending writes to the specified vgpr by VMEM
348 // instructions with types different from V.
349 bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
350 assert(GprNo < NUM_ALL_VGPRS);
351 return VgprVmemTypes[GprNo] & ~(1 << V);
352 }
353
354 void clearVgprVmemTypes(int GprNo) {
355 assert(GprNo < NUM_ALL_VGPRS);
356 VgprVmemTypes[GprNo] = 0;
357 }
358
359 void setStateOnFunctionEntryOrReturn() {
360 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
361 PendingEvents |= WaitEventMaskForInst[STORE_CNT];
362 }
363
364 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
365 return LDSDMAStores;
366 }
367
368 void print(raw_ostream &);
369 void dump() { print(dbgs()); }
370
371private:
372 struct MergeInfo {
373 unsigned OldLB;
374 unsigned OtherLB;
375 unsigned MyShift;
376 unsigned OtherShift;
377 };
378 static bool mergeScore(const MergeInfo &M, unsigned &Score,
379 unsigned OtherScore);
380
381 void setScoreLB(InstCounterType T, unsigned Val) {
382 assert(T < NUM_INST_CNTS);
383 ScoreLBs[T] = Val;
384 }
385
386 void setScoreUB(InstCounterType T, unsigned Val) {
387 assert(T < NUM_INST_CNTS);
388 ScoreUBs[T] = Val;
389
390 if (T != EXP_CNT)
391 return;
392
393 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
394 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
395 }
396
397 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
398 if (GprNo < NUM_ALL_VGPRS) {
399 VgprUB = std::max(VgprUB, GprNo);
400 VgprScores[T][GprNo] = Val;
401 } else {
402 assert(T == SmemAccessCounter);
403 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
404 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
405 }
406 }
407
408 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
409 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
410 unsigned OpNo, unsigned Val);
411
412 const GCNSubtarget *ST = nullptr;
413 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
414 HardwareLimits Limits = {};
415 RegisterEncoding Encoding = {};
416 const unsigned *WaitEventMaskForInst;
417 InstCounterType SmemAccessCounter;
418 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
419 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
420 unsigned PendingEvents = 0;
421 // Remember the last flat memory operation.
422 unsigned LastFlat[NUM_INST_CNTS] = {0};
423 // wait_cnt scores for every vgpr.
424 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
425 int VgprUB = -1;
426 int SgprUB = -1;
427 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
428 // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
429 // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
430 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
431 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
432 // write to each vgpr.
433 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
434 // Store representative LDS DMA operations. The only useful info here is
435 // alias info. One store is kept per unique AAInfo.
436 SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
437};
438
439// This abstracts the logic for generating and updating S_WAIT* instructions
440// away from the analysis that determines where they are needed. This was
441// done because the set of counters and instructions for waiting on them
442// underwent a major shift with gfx12, sufficiently so that having this
443// abstraction keeps the main analysis logic simpler than it would
444// otherwise have to be.
445class WaitcntGenerator {
446protected:
447 const GCNSubtarget *ST = nullptr;
448 const SIInstrInfo *TII = nullptr;
449 AMDGPU::IsaVersion IV;
450 InstCounterType MaxCounter;
451 bool OptNone;
452
453public:
454 WaitcntGenerator() {}
455 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
456 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
457 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
458 OptNone(MF.getFunction().hasOptNone() ||
459 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
460
461 // Return true if the current function should be compiled with no
462 // optimization.
463 bool isOptNone() const { return OptNone; }
464
465 // Edits an existing sequence of wait count instructions according
466 // to an incoming Waitcnt value, which is itself updated to reflect
467 // any new wait count instructions which may need to be generated by
468 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
469 // were made.
470 //
471 // This editing will usually merely update operands, but it may also
472 // delete instructions if the incoming Wait value indicates they are not
473 // needed. It may also remove existing instructions for which a wait
474 // is needed if it can be determined that it is better to generate new
475 // instructions later, as can happen on gfx12.
476 virtual bool
477 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
478 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
479 MachineBasicBlock::instr_iterator It) const = 0;
480
481 // Transform a soft waitcnt into a normal one.
482 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
483
484 // Generates new wait count instructions according to the value of
485 // Wait, returning true if any new instructions were created.
486 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
487 MachineBasicBlock::instr_iterator It,
488 AMDGPU::Waitcnt Wait) = 0;
489
490 // Returns an array of bit masks which can be used to map values in
491 // WaitEventType to corresponding counter values in InstCounterType.
492 virtual const unsigned *getWaitEventMask() const = 0;
493
494 // Returns a new waitcnt with all counters except VScnt set to 0. If
495 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
496 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
497
498 virtual ~WaitcntGenerator() = default;
499
500 // Create a mask value from the initializer list of wait event types.
501 static constexpr unsigned
502 eventMask(std::initializer_list<WaitEventType> Events) {
503 unsigned Mask = 0;
504 for (auto &E : Events)
505 Mask |= 1 << E;
506
507 return Mask;
508 }
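  // For example (editorial note), eventMask({SMEM_ACCESS, SQ_MESSAGE})
  // evaluates to (1u << SMEM_ACCESS) | (1u << SQ_MESSAGE), which matches the
  // KM_CNT entry of the gfx12+ table below.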
509};
510
511class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
512public:
513 WaitcntGeneratorPreGFX12() {}
514 WaitcntGeneratorPreGFX12(const MachineFunction &MF)
515 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
516
517 bool
518 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
519 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
520 MachineBasicBlock::instr_iterator It) const override;
521
522 bool createNewWaitcnt(MachineBasicBlock &Block,
523 MachineBasicBlock::instr_iterator It,
524 AMDGPU::Waitcnt Wait) override;
525
526 const unsigned *getWaitEventMask() const override {
527 assert(ST);
528
529 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
530 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
531 VMEM_BVH_READ_ACCESS}),
532 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
533 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
534 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
535 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
536 0,
537 0,
538 0};
539
540 return WaitEventMaskForInstPreGFX12;
541 }
542
543 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
544};
545
546class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
547public:
548 WaitcntGeneratorGFX12Plus() {}
549 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
550 InstCounterType MaxCounter)
551 : WaitcntGenerator(MF, MaxCounter) {}
552
553 bool
554 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
555 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
556 MachineBasicBlock::instr_iterator It) const override;
557
558 bool createNewWaitcnt(MachineBasicBlock &Block,
559 MachineBasicBlock::instr_iterator It,
560 AMDGPU::Waitcnt Wait) override;
561
562 const unsigned *getWaitEventMask() const override {
563 assert(ST);
564
565 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
566 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
567 eventMask({LDS_ACCESS, GDS_ACCESS}),
568 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
569 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
570 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
571 eventMask({VMEM_SAMPLER_READ_ACCESS}),
572 eventMask({VMEM_BVH_READ_ACCESS}),
573 eventMask({SMEM_ACCESS, SQ_MESSAGE})};
574
575 return WaitEventMaskForInstGFX12Plus;
576 }
577
578 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
579};
580
581class SIInsertWaitcnts : public MachineFunctionPass {
582private:
583 const GCNSubtarget *ST = nullptr;
584 const SIInstrInfo *TII = nullptr;
585 const SIRegisterInfo *TRI = nullptr;
586 const MachineRegisterInfo *MRI = nullptr;
587
588 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
589 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
590 MachineLoopInfo *MLI;
591 MachinePostDominatorTree *PDT;
592 AliasAnalysis *AA = nullptr;
593
594 struct BlockInfo {
595 std::unique_ptr<WaitcntBrackets> Incoming;
596 bool Dirty = true;
597 };
598
599 InstCounterType SmemAccessCounter;
600
601 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
602
603 // ForceEmitZeroWaitcnts: force all waitcnt insts to be s_waitcnt 0
604 // because of amdgpu-waitcnt-forcezero flag
605 bool ForceEmitZeroWaitcnts;
606 bool ForceEmitWaitcnt[NUM_INST_CNTS];
607
608 // In any given run of this pass, WCG will point to one of these two
609 // generator objects, which must be re-initialised from a value
610 // constructed with the current subtarget before use.
611 WaitcntGeneratorPreGFX12 WCGPreGFX12;
612 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
613
614 WaitcntGenerator *WCG = nullptr;
615
616 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
617 // message.
618 DenseSet<MachineInstr *> ReleaseVGPRInsts;
619
620 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
621
622public:
623 static char ID;
624
625 SIInsertWaitcnts() : MachineFunctionPass(ID) {
626 (void)ForceExpCounter;
627 (void)ForceLgkmCounter;
628 (void)ForceVMCounter;
629 }
630
631 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
632 bool isPreheaderToFlush(MachineBasicBlock &MBB,
633 WaitcntBrackets &ScoreBrackets);
634 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
635 bool runOnMachineFunction(MachineFunction &MF) override;
636
637 StringRef getPassName() const override {
638 return "SI insert wait instructions";
639 }
640
641 void getAnalysisUsage(AnalysisUsage &AU) const override {
642 AU.setPreservesCFG();
648 }
649
650 bool isForceEmitWaitcnt() const {
651 for (auto T : inst_counter_types())
652 if (ForceEmitWaitcnt[T])
653 return true;
654 return false;
655 }
656
657 void setForceEmitWaitcnt() {
658// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
659// For debug builds, get the debug counter info and adjust if need be
660#ifndef NDEBUG
661 if (DebugCounter::isCounterSet(ForceExpCounter) &&
662 DebugCounter::shouldExecute(ForceExpCounter)) {
663 ForceEmitWaitcnt[EXP_CNT] = true;
664 } else {
665 ForceEmitWaitcnt[EXP_CNT] = false;
666 }
667
668 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
669 DebugCounter::shouldExecute(ForceLgkmCounter)) {
670 ForceEmitWaitcnt[DS_CNT] = true;
671 ForceEmitWaitcnt[KM_CNT] = true;
672 } else {
673 ForceEmitWaitcnt[DS_CNT] = false;
674 ForceEmitWaitcnt[KM_CNT] = false;
675 }
676
677 if (DebugCounter::isCounterSet(ForceVMCounter) &&
678 DebugCounter::shouldExecute(ForceVMCounter)) {
679 ForceEmitWaitcnt[LOAD_CNT] = true;
680 ForceEmitWaitcnt[SAMPLE_CNT] = true;
681 ForceEmitWaitcnt[BVH_CNT] = true;
682 } else {
683 ForceEmitWaitcnt[LOAD_CNT] = false;
684 ForceEmitWaitcnt[SAMPLE_CNT] = false;
685 ForceEmitWaitcnt[BVH_CNT] = false;
686 }
687#endif // NDEBUG
688 }
689
690 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
691 // FLAT instruction.
692 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
693 // Maps VMEM access types to their corresponding WaitEventType.
694 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
695 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
696
697 assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
698 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
699 // these should use VM_CNT.
700 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
701 return VMEM_ACCESS;
702 if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
703 // FLAT and SCRATCH instructions may access scratch. Other VMEM
704 // instructions do not.
705 if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
706 return SCRATCH_WRITE_ACCESS;
707 return VMEM_WRITE_ACCESS;
708 }
709 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
710 return VMEM_READ_ACCESS;
711 return VmemReadMapping[getVmemType(Inst)];
712 }
713
714 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
715 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
716 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
717 bool generateWaitcntInstBefore(MachineInstr &MI,
718 WaitcntBrackets &ScoreBrackets,
719 MachineInstr *OldWaitcntInstr,
720 bool FlushVmCnt);
721 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
723 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
724 MachineInstr *OldWaitcntInstr);
725 void updateEventWaitcntAfter(MachineInstr &Inst,
726 WaitcntBrackets *ScoreBrackets);
727 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
728 WaitcntBrackets &ScoreBrackets);
729};
730
731} // end anonymous namespace
732
733RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
734 const MachineRegisterInfo *MRI,
735 const SIRegisterInfo *TRI,
736 unsigned OpNo) const {
737 const MachineOperand &Op = MI->getOperand(OpNo);
738 if (!TRI->isInAllocatableClass(Op.getReg()))
739 return {-1, -1};
740
741 // A use via a partial-write (PW) operand does not need a waitcnt.
742 // A partial write is not a WAW.
743 assert(!Op.getSubReg() || !Op.isUndef());
744
745 RegInterval Result;
746
747 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
748 AMDGPU::HWEncoding::REG_IDX_MASK;
749
750 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
751 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
752 Result.first = Reg - Encoding.VGPR0;
753 if (TRI->isAGPR(*MRI, Op.getReg()))
754 Result.first += AGPR_OFFSET;
755 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
756 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
757 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
758 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
759 assert(Result.first >= NUM_ALL_VGPRS &&
760 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
761 }
762 // TODO: Handle TTMP
763 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
764 else
765 return {-1, -1};
766
767 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
768 unsigned Size = TRI->getRegSizeInBits(*RC);
769 Result.second = Result.first + ((Size + 16) / 32);
770
771 return Result;
772}
773
774void WaitcntBrackets::setExpScore(const MachineInstr *MI,
775 const SIInstrInfo *TII,
776 const SIRegisterInfo *TRI,
777 const MachineRegisterInfo *MRI, unsigned OpNo,
778 unsigned Val) {
779 RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
780 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
781 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
782 setRegScore(RegNo, EXP_CNT, Val);
783 }
784}
785
786void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
787 const SIRegisterInfo *TRI,
788 const MachineRegisterInfo *MRI,
789 WaitEventType E, MachineInstr &Inst) {
790 InstCounterType T = eventCounter(WaitEventMaskForInst, E);
791
792 unsigned UB = getScoreUB(T);
793 unsigned CurrScore = UB + 1;
794 if (CurrScore == 0)
795 report_fatal_error("InsertWaitcnt score wraparound");
796 // PendingEvents and ScoreUB need to be updated regardless of whether this
797 // event changes the score of a register or not.
798 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
799 PendingEvents |= 1 << E;
800 setScoreUB(T, CurrScore);
801
802 if (T == EXP_CNT) {
803 // Put score on the source vgprs. If this is a store, just use those
804 // specific register(s).
805 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
806 int AddrOpIdx =
807 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
808 // All GDS operations must protect their address register (same as
809 // export.)
810 if (AddrOpIdx != -1) {
811 setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
812 }
813
814 if (Inst.mayStore()) {
815 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
816 setExpScore(
817 &Inst, TII, TRI, MRI,
818 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
819 CurrScore);
820 }
821 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
822 setExpScore(&Inst, TII, TRI, MRI,
823 AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
824 AMDGPU::OpName::data1),
825 CurrScore);
826 }
827 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
828 Inst.getOpcode() != AMDGPU::DS_APPEND &&
829 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
830 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
831 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
832 const MachineOperand &Op = Inst.getOperand(I);
833 if (Op.isReg() && !Op.isDef() &&
834 TRI->isVectorRegister(*MRI, Op.getReg())) {
835 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
836 }
837 }
838 }
839 } else if (TII->isFLAT(Inst)) {
840 if (Inst.mayStore()) {
841 setExpScore(
842 &Inst, TII, TRI, MRI,
843 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
844 CurrScore);
845 } else if (SIInstrInfo::isAtomicRet(Inst)) {
846 setExpScore(
847 &Inst, TII, TRI, MRI,
848 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
849 CurrScore);
850 }
851 } else if (TII->isMIMG(Inst)) {
852 if (Inst.mayStore()) {
853 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
854 } else if (SIInstrInfo::isAtomicRet(Inst)) {
855 setExpScore(
856 &Inst, TII, TRI, MRI,
857 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
858 CurrScore);
859 }
860 } else if (TII->isMTBUF(Inst)) {
861 if (Inst.mayStore()) {
862 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
863 }
864 } else if (TII->isMUBUF(Inst)) {
865 if (Inst.mayStore()) {
866 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
867 } else if (SIInstrInfo::isAtomicRet(Inst)) {
868 setExpScore(
869 &Inst, TII, TRI, MRI,
870 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
871 CurrScore);
872 }
873 } else if (TII->isLDSDIR(Inst)) {
874 // LDSDIR instructions attach the score to the destination.
875 setExpScore(
876 &Inst, TII, TRI, MRI,
877 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
878 CurrScore);
879 } else {
880 if (TII->isEXP(Inst)) {
881 // For export the destination registers are really temps that
882 // can be used as the actual source after export patching, so
883 // we need to treat them like sources and set the EXP_CNT
884 // score.
885 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
886 MachineOperand &DefMO = Inst.getOperand(I);
887 if (DefMO.isReg() && DefMO.isDef() &&
888 TRI->isVGPR(*MRI, DefMO.getReg())) {
889 setRegScore(
890 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
891 EXP_CNT, CurrScore);
892 }
893 }
894 }
895 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
896 MachineOperand &MO = Inst.getOperand(I);
897 if (MO.isReg() && !MO.isDef() &&
898 TRI->isVectorRegister(*MRI, MO.getReg())) {
899 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
900 }
901 }
902 }
903#if 0 // TODO: check if this is handled by MUBUF code above.
904 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
905 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
906 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
907 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
908 unsigned OpNo;//TODO: find the OpNo for this operand;
909 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, OpNo);
910 for (int RegNo = Interval.first; RegNo < Interval.second;
911 ++RegNo) {
912 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
913 }
914#endif
915 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
916 // Match the score to the destination registers.
917 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
918 auto &Op = Inst.getOperand(I);
919 if (!Op.isReg() || !Op.isDef())
920 continue;
921 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
922 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
923 if (Interval.first >= NUM_ALL_VGPRS)
924 continue;
925 if (updateVMCntOnly(Inst)) {
926 // updateVMCntOnly should only leave us with VGPRs.
927 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
928 // defs. That's required for a sane index into `VgprVmemTypes` below.
929 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
930 VmemType V = getVmemType(Inst);
931 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
932 VgprVmemTypes[RegNo] |= 1 << V;
933 }
934 }
935 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
936 setRegScore(RegNo, T, CurrScore);
937 }
938 }
939 if (Inst.mayStore() &&
940 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
941 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS
942 // they write can be accessed. A load from LDS to VMEM does not need a wait.
943 unsigned Slot = 0;
944 for (const auto *MemOp : Inst.memoperands()) {
945 if (!MemOp->isStore() ||
946 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
947 continue;
948 // Comparing just AA info does not guarantee memoperands are equal
949 // in general, but this is so for LDS DMA in practice.
950 auto AAI = MemOp->getAAInfo();
951 // Alias scope information gives a way to definitively identify an
952 // original memory object and is in practice produced by the module LDS
953 // lowering pass. If there is no scope available we will not be able
954 // to disambiguate LDS aliasing, as after the module lowering all LDS
955 // is squashed into a single big object. Do not attempt to use one of
956 // the limited LDSDMAStores for something we will not be able to use
957 // anyway.
958 if (!AAI || !AAI.Scope)
959 break;
960 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
961 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
962 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
963 Slot = I + 1;
964 break;
965 }
966 }
967 }
968 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
969 break;
970 LDSDMAStores.push_back(&Inst);
971 Slot = LDSDMAStores.size();
972 break;
973 }
974 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
975 if (Slot)
976 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
977 }
978 }
979}
980
981void WaitcntBrackets::print(raw_ostream &OS) {
982 OS << '\n';
983 for (auto T : inst_counter_types(MaxCounter)) {
984 unsigned SR = getScoreRange(T);
985
986 switch (T) {
987 case LOAD_CNT:
988 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
989 << SR << "): ";
990 break;
991 case DS_CNT:
992 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
993 << SR << "): ";
994 break;
995 case EXP_CNT:
996 OS << " EXP_CNT(" << SR << "): ";
997 break;
998 case STORE_CNT:
999 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1000 << SR << "): ";
1001 break;
1002 case SAMPLE_CNT:
1003 OS << " SAMPLE_CNT(" << SR << "): ";
1004 break;
1005 case BVH_CNT:
1006 OS << " BVH_CNT(" << SR << "): ";
1007 break;
1008 case KM_CNT:
1009 OS << " KM_CNT(" << SR << "): ";
1010 break;
1011 default:
1012 OS << " UNKNOWN(" << SR << "): ";
1013 break;
1014 }
1015
1016 if (SR != 0) {
1017 // Print vgpr scores.
1018 unsigned LB = getScoreLB(T);
1019
1020 for (int J = 0; J <= VgprUB; J++) {
1021 unsigned RegScore = getRegScore(J, T);
1022 if (RegScore <= LB)
1023 continue;
1024 unsigned RelScore = RegScore - LB - 1;
1025 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
1026 OS << RelScore << ":v" << J << " ";
1027 } else {
1028 OS << RelScore << ":ds ";
1029 }
1030 }
1031 // Also need to print sgpr scores for lgkm_cnt.
1032 if (T == SmemAccessCounter) {
1033 for (int J = 0; J <= SgprUB; J++) {
1034 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
1035 if (RegScore <= LB)
1036 continue;
1037 unsigned RelScore = RegScore - LB - 1;
1038 OS << RelScore << ":s" << J << " ";
1039 }
1040 }
1041 }
1042 OS << '\n';
1043 }
1044 OS << '\n';
1045}
1046
1047/// Simplify the waitcnt, in the sense of removing redundant counts: any count
1048/// that cannot trigger an actual wait is reset to ~0u (no wait needed).
1049void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1050 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1051 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1052 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1053 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1054 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1055 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1056 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1057}
1058
1059void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1060 unsigned &Count) const {
1061 // The number of outstanding events for this type, T, can be calculated
1062 // as (UB - LB). If the current Count is greater than or equal to the number
1063 // of outstanding events, then the wait for this counter is redundant.
1064 if (Count >= getScoreRange(T))
1065 Count = ~0u;
1066}
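// Worked example (editorial note): if getScoreRange(LOAD_CNT) == 3, at most
// three operations tracked by this bracket are still outstanding, so a
// requested wait of LoadCnt >= 3 is already satisfied and is reset to ~0u
// (no wait); a requested LoadCnt of 2 is kept.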
1067
1068void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
1069 AMDGPU::Waitcnt &Wait) const {
1070 unsigned ScoreToWait = getRegScore(RegNo, T);
1071
1072 // If the score of src_operand falls within the bracket, we need an
1073 // s_waitcnt instruction.
1074 const unsigned LB = getScoreLB(T);
1075 const unsigned UB = getScoreUB(T);
1076 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1077 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1078 !ST->hasFlatLgkmVMemCountInOrder()) {
1079 // If there is a pending FLAT operation, and this is a VMem or LGKM
1080 // waitcnt and the target can report early completion, then we need
1081 // to force a waitcnt 0.
1082 addWait(Wait, T, 0);
1083 } else if (counterOutOfOrder(T)) {
1084 // The counter can get decremented out-of-order when there
1085 // are multiple event types in the bracket, so emit an s_wait
1086 // with a conservative value of 0 for this counter.
1087 addWait(Wait, T, 0);
1088 } else {
1089 // If a counter has been maxed out, avoid overflow by waiting for
1090 // MAX(CounterType) - 1 instead.
1091 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
1092 addWait(Wait, T, NeededWait);
1093 }
1094 }
1095}
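// Worked example for the in-order case above (editorial note): with UB == 10
// and ScoreToWait == 8, two operations were issued after the producer, so
// NeededWait == min(10 - 8, getWaitCountMax(T) - 1) == 2; waiting until at
// most two operations remain outstanding guarantees the producer completed.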
1096
1097void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1098 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1099 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1100 applyWaitcnt(DS_CNT, Wait.DsCnt);
1101 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1102 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1103 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1104 applyWaitcnt(KM_CNT, Wait.KmCnt);
1105}
1106
1107void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1108 const unsigned UB = getScoreUB(T);
1109 if (Count >= UB)
1110 return;
1111 if (Count != 0) {
1112 if (counterOutOfOrder(T))
1113 return;
1114 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1115 } else {
1116 setScoreLB(T, UB);
1117 PendingEvents &= ~WaitEventMaskForInst[T];
1118 }
1119}
1120
1121// Where there are multiple types of event in the bracket of a counter,
1122// the decrement may go out of order.
1123bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1124 // Scalar memory read always can go out of order.
1125 if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
1126 return true;
1127 return hasMixedPendingEvents(T);
1128}
1129
1130INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1131 false)
1134INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1135 false)
1136
1137char SIInsertWaitcnts::ID = 0;
1138
1139char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
1140
1141FunctionPass *llvm::createSIInsertWaitcntsPass() {
1142 return new SIInsertWaitcnts();
1143}
1144
1145static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
1146 unsigned NewEnc) {
1147 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1148 assert(OpIdx >= 0);
1149
1150 MachineOperand &MO = MI.getOperand(OpIdx);
1151
1152 if (NewEnc == MO.getImm())
1153 return false;
1154
1155 MO.setImm(NewEnc);
1156 return true;
1157}
1158
1159/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1160/// and if so, which counter it is waiting on.
1161static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1162 switch (Opcode) {
1163 case AMDGPU::S_WAIT_LOADCNT:
1164 return LOAD_CNT;
1165 case AMDGPU::S_WAIT_EXPCNT:
1166 return EXP_CNT;
1167 case AMDGPU::S_WAIT_STORECNT:
1168 return STORE_CNT;
1169 case AMDGPU::S_WAIT_SAMPLECNT:
1170 return SAMPLE_CNT;
1171 case AMDGPU::S_WAIT_BVHCNT:
1172 return BVH_CNT;
1173 case AMDGPU::S_WAIT_DSCNT:
1174 return DS_CNT;
1175 case AMDGPU::S_WAIT_KMCNT:
1176 return KM_CNT;
1177 default:
1178 return {};
1179 }
1180}
1181
1182bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1183 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1184 if (Opcode == Waitcnt->getOpcode())
1185 return false;
1186
1187 Waitcnt->setDesc(TII->get(Opcode));
1188 return true;
1189}
1190
1191/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1192/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1193/// from \p Wait that were added by previous passes. Currently this pass
1194/// conservatively assumes that these preexisting waits are required for
1195/// correctness.
1196bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1197 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1198 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1199 assert(ST);
1200 assert(isNormalMode(MaxCounter));
1201
1202 bool Modified = false;
1203 MachineInstr *WaitcntInstr = nullptr;
1204 MachineInstr *WaitcntVsCntInstr = nullptr;
1205
1206 for (auto &II :
1207 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1208 if (II.isMetaInstruction())
1209 continue;
1210
1211 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1212 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1213
1214 // Update required wait count. If this is a soft waitcnt (= it was added
1215 // by an earlier pass), it may be entirely removed.
1216 if (Opcode == AMDGPU::S_WAITCNT) {
1217 unsigned IEnc = II.getOperand(0).getImm();
1218 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1219 if (TrySimplify)
1220 ScoreBrackets.simplifyWaitcnt(OldWait);
1221 Wait = Wait.combined(OldWait);
1222
1223 // Merge consecutive waitcnt of the same type by erasing multiples.
1224 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1225 II.eraseFromParent();
1226 Modified = true;
1227 } else
1228 WaitcntInstr = &II;
1229 } else {
1230 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1231 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1232
1233 unsigned OldVSCnt =
1234 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1235 if (TrySimplify)
1236 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1237 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1238
1239 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1240 II.eraseFromParent();
1241 Modified = true;
1242 } else
1243 WaitcntVsCntInstr = &II;
1244 }
1245 }
1246
1247 if (WaitcntInstr) {
1248 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1249 AMDGPU::encodeWaitcnt(IV, Wait));
1250 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1251
1252 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1253 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1254 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1255 Wait.LoadCnt = ~0u;
1256 Wait.ExpCnt = ~0u;
1257 Wait.DsCnt = ~0u;
1258
1259 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1260 ? dbgs()
1261 << "applyPreexistingWaitcnt\n"
1262 << "New Instr at block end: " << *WaitcntInstr << '\n'
1263 : dbgs() << "applyPreexistingWaitcnt\n"
1264 << "Old Instr: " << *It
1265 << "New Instr: " << *WaitcntInstr << '\n');
1266 }
1267
1268 if (WaitcntVsCntInstr) {
1269 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1270 AMDGPU::OpName::simm16, Wait.StoreCnt);
1271 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1272
1273 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1274 Wait.StoreCnt = ~0u;
1275
1276 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1277 ? dbgs() << "applyPreexistingWaitcnt\n"
1278 << "New Instr at block end: " << *WaitcntVsCntInstr
1279 << '\n'
1280 : dbgs() << "applyPreexistingWaitcnt\n"
1281 << "Old Instr: " << *It
1282 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1283 }
1284
1285 return Modified;
1286}
1287
1288/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1289/// required counters in \p Wait
1290bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1291 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1292 AMDGPU::Waitcnt Wait) {
1293 assert(ST);
1294 assert(isNormalMode(MaxCounter));
1295
1296 bool Modified = false;
1297 const DebugLoc &DL = Block.findDebugLoc(It);
1298
1299 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1300 // single instruction while VScnt has its own instruction.
1301 if (Wait.hasWaitExceptStoreCnt()) {
1302 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1303 [[maybe_unused]] auto SWaitInst =
1304 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1305 Modified = true;
1306
1307 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1308 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1309 dbgs() << "New Instr: " << *SWaitInst << '\n');
1310 }
1311
1312 if (Wait.hasWaitStoreCnt()) {
1313 assert(ST->hasVscnt());
1314
1315 [[maybe_unused]] auto SWaitInst =
1316 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1317 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1318 .addImm(Wait.StoreCnt);
1319 Modified = true;
1320
1321 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1322 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1323 dbgs() << "New Instr: " << *SWaitInst << '\n');
1324 }
1325
1326 return Modified;
1327}
1328
1329AMDGPU::Waitcnt
1330WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1331 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1332}
1333
1334AMDGPU::Waitcnt
1335WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1336 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
1337}
1338
1339/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1340/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1341/// were added by previous passes. Currently this pass conservatively
1342/// assumes that these preexisting waits are required for correctness.
1343bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1344 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1345 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1346 assert(ST);
1347 assert(!isNormalMode(MaxCounter));
1348
1349 bool Modified = false;
1350 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1351 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1352 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1353
1354 for (auto &II :
1355 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1356 if (II.isMetaInstruction())
1357 continue;
1358
1359 MachineInstr **UpdatableInstr;
1360
1361 // Update required wait count. If this is a soft waitcnt (= it was added
1362 // by an earlier pass), it may be entirely removed.
1363
1364 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1365 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1366
1367 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1368 // attempt to do more than that either.
1369 if (Opcode == AMDGPU::S_WAITCNT)
1370 continue;
1371
1372 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1373 unsigned OldEnc =
1374 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1375 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1376 if (TrySimplify)
1377 ScoreBrackets.simplifyWaitcnt(OldWait);
1378 Wait = Wait.combined(OldWait);
1379 UpdatableInstr = &CombinedLoadDsCntInstr;
1380 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1381 unsigned OldEnc =
1382 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1383 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1384 if (TrySimplify)
1385 ScoreBrackets.simplifyWaitcnt(OldWait);
1386 Wait = Wait.combined(OldWait);
1387 UpdatableInstr = &CombinedStoreDsCntInstr;
1388 } else {
1389 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1390 assert(CT.has_value());
1391 unsigned OldCnt =
1392 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1393 if (TrySimplify)
1394 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1395 addWait(Wait, CT.value(), OldCnt);
1396 UpdatableInstr = &WaitInstrs[CT.value()];
1397 }
1398
1399 // Merge consecutive waitcnt of the same type by erasing multiples.
1400 if (!*UpdatableInstr) {
1401 *UpdatableInstr = &II;
1402 } else {
1403 II.eraseFromParent();
1404 Modified = true;
1405 }
1406 }
1407
1408 if (CombinedLoadDsCntInstr) {
1409 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1410 // to be waited for. Otherwise, let the instruction be deleted so
1411 // the appropriate single counter wait instruction can be inserted
1412 // instead, when new S_WAIT_*CNT instructions are inserted by
1413 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1414 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1415 // the loop below that deals with single counter instructions.
1416 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1417 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1418 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1419 AMDGPU::OpName::simm16, NewEnc);
1420 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1421 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1422 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1423 Wait.LoadCnt = ~0u;
1424 Wait.DsCnt = ~0u;
1425
1426 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1427 ? dbgs() << "applyPreexistingWaitcnt\n"
1428 << "New Instr at block end: "
1429 << *CombinedLoadDsCntInstr << '\n'
1430 : dbgs() << "applyPreexistingWaitcnt\n"
1431 << "Old Instr: " << *It << "New Instr: "
1432 << *CombinedLoadDsCntInstr << '\n');
1433 } else {
1434 CombinedLoadDsCntInstr->eraseFromParent();
1435 Modified = true;
1436 }
1437 }
1438
1439 if (CombinedStoreDsCntInstr) {
1440 // Similarly for S_WAIT_STORECNT_DSCNT.
1441 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1442 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1443 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1444 AMDGPU::OpName::simm16, NewEnc);
1445 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1446 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1447 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1448 Wait.StoreCnt = ~0u;
1449 Wait.DsCnt = ~0u;
1450
1451 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1452 ? dbgs() << "applyPreexistingWaitcnt\n"
1453 << "New Instr at block end: "
1454 << *CombinedStoreDsCntInstr << '\n'
1455 : dbgs() << "applyPreexistingWaitcnt\n"
1456 << "Old Instr: " << *It << "New Instr: "
1457 << *CombinedStoreDsCntInstr << '\n');
1458 } else {
1459 CombinedStoreDsCntInstr->eraseFromParent();
1460 Modified = true;
1461 }
1462 }
1463
1464 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1465 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1466 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1467 // instructions so that createNewWaitcnt() will create new combined
1468 // instructions to replace them.
1469
1470 if (Wait.DsCnt != ~0u) {
1471 // This is a vector of addresses in WaitInstrs pointing to instructions
1472 // that should be removed if they are present.
1473 SmallVector<MachineInstr **, 2> WaitsToErase;
1474
1475 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1476 // both) need to be waited for, ensure that there are no existing
1477 // individual wait count instructions for these.
1478
1479 if (Wait.LoadCnt != ~0u) {
1480 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1481 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1482 } else if (Wait.StoreCnt != ~0u) {
1483 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1484 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1485 }
1486
1487 for (MachineInstr **WI : WaitsToErase) {
1488 if (!*WI)
1489 continue;
1490
1491 (*WI)->eraseFromParent();
1492 *WI = nullptr;
1493 Modified = true;
1494 }
1495 }
1496
1497 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1498 if (!WaitInstrs[CT])
1499 continue;
1500
1501 unsigned NewCnt = getWait(Wait, CT);
1502 if (NewCnt != ~0u) {
1503 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1504 AMDGPU::OpName::simm16, NewCnt);
1505 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1506
1507 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1508 setNoWait(Wait, CT);
1509
1510 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1511 ? dbgs() << "applyPreexistingWaitcnt\n"
1512 << "New Instr at block end: " << *WaitInstrs[CT]
1513 << '\n'
1514 : dbgs() << "applyPreexistingWaitcnt\n"
1515 << "Old Instr: " << *It
1516 << "New Instr: " << *WaitInstrs[CT] << '\n');
1517 } else {
1518 WaitInstrs[CT]->eraseFromParent();
1519 Modified = true;
1520 }
1521 }
1522
1523 return Modified;
1524}
1525
1526/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1527bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1528 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1529 AMDGPU::Waitcnt Wait) {
1530 assert(ST);
1531 assert(!isNormalMode(MaxCounter));
1532
1533 bool Modified = false;
1534 const DebugLoc &DL = Block.findDebugLoc(It);
1535
1536 // Check for opportunities to use combined wait instructions.
1537 if (Wait.DsCnt != ~0u) {
1538 MachineInstr *SWaitInst = nullptr;
1539
1540 if (Wait.LoadCnt != ~0u) {
1541 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1542
1543 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1544 .addImm(Enc);
1545
1546 Wait.LoadCnt = ~0u;
1547 Wait.DsCnt = ~0u;
1548 } else if (Wait.StoreCnt != ~0u) {
1549 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1550
1551 SWaitInst =
1552 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1553 .addImm(Enc);
1554
1555 Wait.StoreCnt = ~0u;
1556 Wait.DsCnt = ~0u;
1557 }
1558
1559 if (SWaitInst) {
1560 Modified = true;
1561
1562 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1563 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1564 dbgs() << "New Instr: " << *SWaitInst << '\n');
1565 }
1566 }
1567
1568 // Generate an instruction for any remaining counter that needs
1569 // waiting for.
1570
1571 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1572 unsigned Count = getWait(Wait, CT);
1573 if (Count == ~0u)
1574 continue;
1575
1576 [[maybe_unused]] auto SWaitInst =
1577 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1578 .addImm(Count);
1579
1580 Modified = true;
1581
1582 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1583 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1584 dbgs() << "New Instr: " << *SWaitInst << '\n');
1585 }
1586
1587 return Modified;
1588}
1589
1590static bool readsVCCZ(const MachineInstr &MI) {
1591 unsigned Opc = MI.getOpcode();
1592 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1593 !MI.getOperand(1).isUndef();
1594}
1595
1596/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1597static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1598 // Currently all conventions wait, but this may not always be the case.
1599 //
1600 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1601 // sense to omit the wait and do it in the caller.
1602 return true;
1603}
1604
1605/// \returns true if the callee is expected to wait for any outstanding waits
1606/// before returning.
1607static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
1608 return true;
1609}
1610
1611/// Generate an s_waitcnt instruction to be placed before cur_Inst.
1612/// Instructions of a given type are returned in order,
1613/// but instructions of different types can complete out of order.
1614/// We rely on this in-order completion
1615/// and simply assign a score to the memory access instructions.
1616/// We keep track of the active "score bracket" to determine
1617/// if an access of a memory read requires an s_waitcnt
1618/// and if so what the value of each counter is.
1619/// The "score bracket" is bound by the lower bound and upper bound
1620/// scores (*_score_LB and *_score_ub respectively).
1621/// If FlushVmCnt is true, that means that we want to generate an s_waitcnt to
1622/// flush the vmcnt counter here.
1623bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1624 WaitcntBrackets &ScoreBrackets,
1625 MachineInstr *OldWaitcntInstr,
1626 bool FlushVmCnt) {
1627 setForceEmitWaitcnt();
1628
1629 if (MI.isMetaInstruction())
1630 return false;
1631
1632 AMDGPU::Waitcnt Wait;
1633
1634 // FIXME: This should have already been handled by the memory legalizer.
1635 // Removing this currently doesn't affect any lit tests, but we need to
1636 // verify that nothing was relying on this. The number of buffer invalidates
1637 // being handled here should not be expanded.
1638 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1639 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1640 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1641 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1642 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1643 Wait.LoadCnt = 0;
1644 }
1645
1646 // All waits must be resolved at call return.
1647 // NOTE: this could be improved with knowledge of all call sites or
1648 // with knowledge of the called routines.
1649 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1650 MI.getOpcode() == AMDGPU::SI_RETURN ||
1651 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1652 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1653 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1654 }
1655 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1656 // stores. In this case it can be useful to send a message to explicitly
1657 // release all VGPRs before the stores have completed, but it is only safe to
1658 // do this if:
1659 // * there are no outstanding scratch stores
1660 // * we are not in Dynamic VGPR mode
1661 else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1662 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1663 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
1664 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1665 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
1666 ReleaseVGPRInsts.insert(&MI);
1667 }
1668 // Resolve vm waits before gs-done.
1669 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1670 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1671 ST->hasLegacyGeometry() &&
1672 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1673 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1674 Wait.LoadCnt = 0;
1675 }
1676#if 0 // TODO: the following blocks of logic when we have fence.
1677 else if (MI.getOpcode() == SC_FENCE) {
1678 const unsigned int group_size =
1679 context->shader_info->GetMaxThreadGroupSize();
1680 // group_size == 0 means thread group size is unknown at compile time
1681 const bool group_is_multi_wave =
1682 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
1683 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
1684
1685 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
1686 SCRegType src_type = Inst->GetSrcType(i);
1687 switch (src_type) {
1688 case SCMEM_LDS:
1689 if (group_is_multi_wave ||
1690 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
1691 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
1692 ScoreBrackets->getScoreUB(DS_CNT));
1693 // LDS may have to wait for VMcnt after buffer load to LDS
1694 if (target_info->HasBufferLoadToLDS()) {
1695 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
1696 ScoreBrackets->getScoreUB(LOAD_CNT));
1697 }
1698 }
1699 break;
1700
1701 case SCMEM_GDS:
1702 if (group_is_multi_wave || fence_is_global) {
1703 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1704 ScoreBrackets->getScoreUB(EXP_CNT));
1705 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
1706 ScoreBrackets->getScoreUB(DS_CNT));
1707 }
1708 break;
1709
1710 case SCMEM_UAV:
1711 case SCMEM_TFBUF:
1712 case SCMEM_RING:
1713 case SCMEM_SCATTER:
1714 if (group_is_multi_wave || fence_is_global) {
1715 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1716 ScoreBrackets->getScoreUB(EXP_CNT));
1717 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
1718 ScoreBrackets->getScoreUB(LOAD_CNT));
1719 }
1720 break;
1721
1722 case SCMEM_SCRATCH:
1723 default:
1724 break;
1725 }
1726 }
1727 }
1728#endif
1729
1730 // Export & GDS instructions do not read the EXEC mask until after the export
1731 // is granted (which can occur well after the instruction is issued).
1732 // The shader program must flush all EXP operations on the export-count
1733 // before overwriting the EXEC mask.
1734 else {
1735 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1736 // Export and GDS are tracked individually, either may trigger a waitcnt
1737 // for EXEC.
1738 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1739 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1740 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1741 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1742 Wait.ExpCnt = 0;
1743 }
1744 }
1745
1746 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1747 // The function is going to insert a wait on everything in its prolog.
1748 // This still needs to be careful if the call target is a load (e.g. a GOT
1749 // load). We also need to check WAW dependency with saved PC.
1750 Wait = AMDGPU::Waitcnt();
1751
1752 int CallAddrOpIdx =
1753 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1754
1755 if (MI.getOperand(CallAddrOpIdx).isReg()) {
1756 RegInterval CallAddrOpInterval =
1757 ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);
1758
1759 for (int RegNo = CallAddrOpInterval.first;
1760 RegNo < CallAddrOpInterval.second; ++RegNo)
1761 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1762
1763 int RtnAddrOpIdx =
1764 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1765 if (RtnAddrOpIdx != -1) {
1766 RegInterval RtnAddrOpInterval =
1767 ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);
1768
1769 for (int RegNo = RtnAddrOpInterval.first;
1770 RegNo < RtnAddrOpInterval.second; ++RegNo)
1771 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1772 }
1773 }
1774 } else {
1775 // FIXME: Should not be relying on memoperands.
1776 // Look at the source operands of every instruction to see if
1777 // any of them results from a previous memory operation that affects
1778 // its current usage. If so, an s_waitcnt instruction needs to be
1779 // emitted.
1780 // If the source operand was defined by a load, add the s_waitcnt
1781 // instruction.
1782 //
1783 // Two cases are handled for destination operands:
1784 // 1) If the destination operand was defined by a load, add the s_waitcnt
1785 // instruction to guarantee the right WAW order.
1786 // 2) If a destination operand was used by a recent export/store instruction,
1787 // add an s_waitcnt on exp_cnt to guarantee the WAR order.
1788
1789 for (const MachineMemOperand *Memop : MI.memoperands()) {
1790 const Value *Ptr = Memop->getValue();
1791 if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1792 addWait(Wait, SmemAccessCounter, 0);
1793 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1794 SLoadAddresses.erase(Ptr);
1795 }
1796 unsigned AS = Memop->getAddrSpace();
1797 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1798 continue;
1799 // No need to wait before load from VMEM to LDS.
1800 if (TII->mayWriteLDSThroughDMA(MI))
1801 continue;
1802
1803 // LOAD_CNT is only relevant to vgpr or LDS.
1804 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1805 bool FoundAliasingStore = false;
1806 // Only objects with alias scope info were added to LDSDMAScopes array.
1807 // In the absence of the scope info we will not be able to disambiguate
1808 // aliasing here. There is no need to try searching for a corresponding
1809 // store slot. This is conservatively correct because in that case we
1810 // will produce a wait using the first (general) LDS DMA wait slot which
1811 // will wait on all of them anyway.
1812 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1813 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1814 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1815 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
1816 FoundAliasingStore = true;
1817 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1818 }
1819 }
1820 }
1821 if (!FoundAliasingStore)
1822 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1823 if (Memop->isStore()) {
1824 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1825 }
1826 }
1827
1828 // Loop over use and def operands.
1829 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1830 MachineOperand &Op = MI.getOperand(I);
1831 if (!Op.isReg())
1832 continue;
1833
1834 // If the instruction does not read tied source, skip the operand.
1835 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1836 continue;
1837
1838 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);
1839
1840 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1841 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1842 if (IsVGPR) {
1843 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1844 // previous write and this write are the same type of VMEM
1845 // instruction, in which case they're guaranteed to write their
1846 // results in order anyway.
1847 if (Op.isUse() || !updateVMCntOnly(MI) ||
1848 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1849 getVmemType(MI))) {
1850 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1851 ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
1852 ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
1853 ScoreBrackets.clearVgprVmemTypes(RegNo);
1854 }
1855 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1856 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1857 }
1858 ScoreBrackets.determineWait(DS_CNT, RegNo, Wait);
1859 } else {
1860 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1861 }
1862 }
1863 }
1864 }
1865 }
1866
1867 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1868 // not, we need to ensure the subtarget is capable of backing off barrier
1869 // instructions in case there are any outstanding memory operations that may
1870 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1871 if (TII->isBarrierStart(MI.getOpcode()) &&
1872 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1873 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
1874 }
1875
1876 // TODO: Remove this work-around, enable the assert for Bug 457939
1877 // after fixing the scheduler. Also, the Shader Compiler code is
1878 // independent of target.
1879 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1880 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1881 Wait.DsCnt = 0;
1882 }
1883 }
1884
1885 // Verify that the wait is actually needed.
1886 ScoreBrackets.simplifyWaitcnt(Wait);
1887
1888 if (ForceEmitZeroWaitcnts)
1889 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
1890
1891 if (ForceEmitWaitcnt[LOAD_CNT])
1892 Wait.LoadCnt = 0;
1893 if (ForceEmitWaitcnt[EXP_CNT])
1894 Wait.ExpCnt = 0;
1895 if (ForceEmitWaitcnt[DS_CNT])
1896 Wait.DsCnt = 0;
1897 if (ForceEmitWaitcnt[SAMPLE_CNT])
1898 Wait.SampleCnt = 0;
1899 if (ForceEmitWaitcnt[BVH_CNT])
1900 Wait.BvhCnt = 0;
1901 if (ForceEmitWaitcnt[KM_CNT])
1902 Wait.KmCnt = 0;
1903
1904 if (FlushVmCnt) {
1905 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
1906 Wait.LoadCnt = 0;
1907 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
1908 Wait.SampleCnt = 0;
1909 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
1910 Wait.BvhCnt = 0;
1911 }
1912
1913 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1914 OldWaitcntInstr);
1915}
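// Illustrative example (hypothetical counts): the per-register scores let
// determineWait() request the weakest sufficient count. If the register being
// read was produced by the third-most-recent outstanding VMEM load, a wait of
// loadcnt(2) suffices, because operations tracked by the same counter retire
// in issue order; registers with no pending producer need no wait at all.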
1916
1917bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1918 MachineBasicBlock::iterator It,
1919 MachineBasicBlock &Block,
1920 WaitcntBrackets &ScoreBrackets,
1921 MachineInstr *OldWaitcntInstr) {
1922 bool Modified = false;
1923
1924 if (OldWaitcntInstr)
1925 // Try to merge the required wait with preexisting waitcnt instructions.
1926 // Also erase redundant waitcnt.
1927 Modified =
1928 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1929
1930 // Any counts that could have been applied to any existing waitcnt
1931 // instructions will have been done so, now deal with any remaining.
1932 ScoreBrackets.applyWaitcnt(Wait);
1933
1934 // ExpCnt can be merged into VINTERP.
1935 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1936 SIInstrInfo::isVINTERP(*It)) {
1937 MachineOperand *WaitExp =
1938 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1939 if (Wait.ExpCnt < WaitExp->getImm()) {
1940 WaitExp->setImm(Wait.ExpCnt);
1941 Modified = true;
1942 }
1943 Wait.ExpCnt = ~0u;
1944
1945 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
1946 << "Update Instr: " << *It);
1947 }
1948
1949 if (WCG->createNewWaitcnt(Block, It, Wait))
1950 Modified = true;
1951
1952 return Modified;
1953}
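// Illustrative example (hypothetical counts): if the required wait is
// expcnt(1) and the instruction at It is a VINTERP whose waitexp operand is
// currently 7, the code above lowers waitexp to 1 instead of emitting a
// separate wait, clears ExpCnt, and then asks the generator to create
// instructions for whatever counters remain.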
1954
1955// This is a flat memory operation. Check to see if it has memory tokens other
1956// than LDS. Other address spaces supported by flat memory operations involve
1957// global memory.
1958bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1959 assert(TII->isFLAT(MI));
1960
1961 // All flat instructions use the VMEM counter.
1962 assert(TII->usesVM_CNT(MI));
1963
1964 // If there are no memory operands then conservatively assume the flat
1965 // operation may access VMEM.
1966 if (MI.memoperands_empty())
1967 return true;
1968
1969 // See if any memory operand specifies an address space that involves VMEM.
1971 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
1971 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1972 // (GDS) address space is not supported by flat operations. Therefore, simply
1973 // return true unless only the LDS address space is found.
1974 for (const MachineMemOperand *Memop : MI.memoperands()) {
1975 unsigned AS = Memop->getAddrSpace();
1976 assert(AS != AMDGPUAS::REGION_ADDRESS);
1977 if (AS != AMDGPUAS::LOCAL_ADDRESS)
1978 return true;
1979 }
1980
1981 return false;
1982}
1983
1984// This is a flat memory operation. Check to see if it has memory tokens for
1985// either LDS or FLAT.
1986bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1987 assert(TII->isFLAT(MI));
1988
1989 // Flat instructions such as SCRATCH and GLOBAL do not use the LGKM counter.
1990 if (!TII->usesLGKM_CNT(MI))
1991 return false;
1992
1993 // If in tgsplit mode then there can be no use of LDS.
1994 if (ST->isTgSplitEnabled())
1995 return false;
1996
1997 // If there are no memory operands then conservatively assume the flat
1998 // operation may access LDS.
1999 if (MI.memoperands_empty())
2000 return true;
2001
2002 // See if any memory operand specifies an address space that involves LDS.
2003 for (const MachineMemOperand *Memop : MI.memoperands()) {
2004 unsigned AS = Memop->getAddrSpace();
2005 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
2006 return true;
2007 }
2008
2009 return false;
2010}
2011
2012// This is a flat memory operation. Check to see if it has memory tokens for
2013// either scratch or FLAT.
2014bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
2015 const MachineInstr &MI) const {
2016 assert(TII->isFLAT(MI));
2017
2018 // SCRATCH instructions always access scratch.
2019 if (TII->isFLATScratch(MI))
2020 return true;
2021
2022 // GLOBAL instructions never access scratch.
2023 if (TII->isFLATGlobal(MI))
2024 return false;
2025
2026 // If there are no memory operands then conservatively assume the flat
2027 // operation may access scratch.
2028 if (MI.memoperands_empty())
2029 return true;
2030
2031 // See if any memory operand specifies an address space that involves scratch.
2032 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
2033 unsigned AS = Memop->getAddrSpace();
2034 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
2035 });
2036}
2037
2038static bool isCacheInvOrWBInst(MachineInstr &Inst) {
2039 auto Opc = Inst.getOpcode();
2040 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
2041 Opc == AMDGPU::GLOBAL_WBINV;
2042}
2043
2044void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2045 WaitcntBrackets *ScoreBrackets) {
2046 // Now look at the instruction opcode. If it is a memory access
2047 // instruction, update the upper-bound of the appropriate counter's
2048 // bracket and the destination operand scores.
2049 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2050
2051 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2052 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2053 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2054 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
2055 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
2056 } else {
2057 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2058 }
2059 } else if (TII->isFLAT(Inst)) {
2060 // TODO: Track this properly.
2061 if (isCacheInvOrWBInst(Inst))
2062 return;
2063
2064 assert(Inst.mayLoadOrStore());
2065
2066 int FlatASCount = 0;
2067
2068 if (mayAccessVMEMThroughFlat(Inst)) {
2069 ++FlatASCount;
2070 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2071 Inst);
2072 }
2073
2074 if (mayAccessLDSThroughFlat(Inst)) {
2075 ++FlatASCount;
2076 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2077 }
2078
2079 // A Flat memory operation must access at least one address space.
2080 assert(FlatASCount);
2081
2082 // This is a flat memory operation that accesses both VMEM and LDS, so note it
2083 // - it will require that both the VM and LGKM counters be flushed to zero if
2084 // it is pending when a VM or LGKM dependency occurs.
2085 if (FlatASCount > 1)
2086 ScoreBrackets->setPendingFlat();
2087 } else if (SIInstrInfo::isVMEM(Inst) &&
2088 !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2089 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2090 Inst);
2091
2092 if (ST->vmemWriteNeedsExpWaitcnt() &&
2093 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2094 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
2095 }
2096 } else if (TII->isSMRD(Inst)) {
2097 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2098 } else if (Inst.isCall()) {
2099 if (callWaitsOnFunctionReturn(Inst)) {
2100 // Act as a wait on everything
2101 ScoreBrackets->applyWaitcnt(
2102 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2103 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2104 } else {
2105 // May need to wait for anything.
2106 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2107 }
2108 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2109 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
2110 } else if (TII->isVINTERP(Inst)) {
2111 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2112 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2113 } else if (SIInstrInfo::isEXP(Inst)) {
2114 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2115 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2116 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
2117 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2118 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
2119 else
2120 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
2121 } else {
2122 switch (Inst.getOpcode()) {
2123 case AMDGPU::S_SENDMSG:
2124 case AMDGPU::S_SENDMSG_RTN_B32:
2125 case AMDGPU::S_SENDMSG_RTN_B64:
2126 case AMDGPU::S_SENDMSGHALT:
2127 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
2128 break;
2129 case AMDGPU::S_MEMTIME:
2130 case AMDGPU::S_MEMREALTIME:
2131 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2132 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2133 case AMDGPU::S_BARRIER_LEAVE:
2134 case AMDGPU::S_GET_BARRIER_STATE_M0:
2135 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2136 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2137 break;
2138 }
2139 }
2140}
2141
2142bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2143 unsigned OtherScore) {
2144 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2145 unsigned OtherShifted =
2146 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2147 Score = std::max(MyShifted, OtherShifted);
2148 return OtherShifted > MyShifted;
2149}
2150
2151/// Merge the pending events and associated score brackets of \p Other into
2152/// this bracket's status.
2153///
2154/// Returns whether the merge resulted in a change that requires tighter waits
2155/// (i.e. the merged brackets strictly dominate the original brackets).
2156bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2157 bool StrictDom = false;
2158
2159 VgprUB = std::max(VgprUB, Other.VgprUB);
2160 SgprUB = std::max(SgprUB, Other.SgprUB);
2161
2162 for (auto T : inst_counter_types(MaxCounter)) {
2163 // Merge event flags for this counter
2164 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2165 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2166 if (OtherEvents & ~OldEvents)
2167 StrictDom = true;
2168 PendingEvents |= OtherEvents;
2169
2170 // Merge scores for this counter
2171 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2172 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2173 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2174 if (NewUB < ScoreLBs[T])
2175 report_fatal_error("waitcnt score overflow");
2176
2177 MergeInfo M;
2178 M.OldLB = ScoreLBs[T];
2179 M.OtherLB = Other.ScoreLBs[T];
2180 M.MyShift = NewUB - ScoreUBs[T];
2181 M.OtherShift = NewUB - Other.ScoreUBs[T];
2182
2183 ScoreUBs[T] = NewUB;
2184
2185 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2186
2187 for (int J = 0; J <= VgprUB; J++)
2188 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
2189
2190 if (T == SmemAccessCounter) {
2191 for (int J = 0; J <= SgprUB; J++)
2192 StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
2193 }
2194 }
2195
2196 for (int J = 0; J <= VgprUB; J++) {
2197 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2198 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2199 VgprVmemTypes[J] = NewVmemTypes;
2200 }
2201
2202 return StrictDom;
2203}
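// Worked example (hypothetical values): suppose for one counter this bracket
// spans [LB=4, UB=7] and Other's spans [LB=1, UB=6]. The pending ranges are 3
// and 5, so NewUB = 4 + max(3, 5) = 9, MyShift = 9 - 7 = 2 and
// OtherShift = 9 - 6 = 3. mergeScore() then maps a score of 6 from this
// bracket to 8, a score of 6 from Other to 9, and any score at or below its
// lower bound to 0, so both timelines share one rebased bracket.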
2204
2205static bool isWaitInstr(MachineInstr &Inst) {
2206 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2207 return Opcode == AMDGPU::S_WAITCNT ||
2208 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2209 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2210 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2211 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2212 counterTypeForInstr(Opcode).has_value();
2213}
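// Note: the check above covers the pre-gfx12 forms (S_WAITCNT, and
// S_WAITCNT_VSCNT with a null SGPR destination), the gfx12+ combined forms
// (S_WAIT_LOADCNT_DSCNT, S_WAIT_STORECNT_DSCNT) and, via counterTypeForInstr,
// the single-counter S_WAIT_*CNT family, so waits added by earlier passes or
// iterations can be merged instead of duplicated.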
2214
2215// Generate s_waitcnt instructions where needed.
2216bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2217 MachineBasicBlock &Block,
2218 WaitcntBrackets &ScoreBrackets) {
2219 bool Modified = false;
2220
2221 LLVM_DEBUG({
2222 dbgs() << "*** Block" << Block.getNumber() << " ***";
2223 ScoreBrackets.dump();
2224 });
2225
2226 // Track the correctness of vccz through this basic block. There are two
2227 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2228 // ST->partialVCCWritesUpdateVCCZ().
2229 bool VCCZCorrect = true;
2230 if (ST->hasReadVCCZBug()) {
2231 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2232 // to vcc and then issued an smem load.
2233 VCCZCorrect = false;
2234 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2235 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2236 // to vcc_lo or vcc_hi.
2237 VCCZCorrect = false;
2238 }
2239
2240 // Walk over the instructions.
2241 MachineInstr *OldWaitcntInstr = nullptr;
2242
2243 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2244 E = Block.instr_end();
2245 Iter != E;) {
2246 MachineInstr &Inst = *Iter;
2247
2248 // Track pre-existing waitcnts that were added in earlier iterations or by
2249 // the memory legalizer.
2250 if (isWaitInstr(Inst)) {
2251 if (!OldWaitcntInstr)
2252 OldWaitcntInstr = &Inst;
2253 ++Iter;
2254 continue;
2255 }
2256
2257 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2258 isPreheaderToFlush(Block, ScoreBrackets);
2259
2260 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2261 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2262 FlushVmCnt);
2263 OldWaitcntInstr = nullptr;
2264
2265 // Restore vccz if it's not known to be correct already.
2266 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
2267
2268 // Don't examine operands unless we need to track vccz correctness.
2269 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2270 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2271 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2272 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2273 if (!ST->partialVCCWritesUpdateVCCZ())
2274 VCCZCorrect = false;
2275 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2276 // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
2277 // the vccz bit, so when we detect that an instruction may read from a
2278 // corrupt vccz bit, we need to:
2279 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2280 // operations to complete.
2281 // 2. Restore the correct value of vccz by writing the current value
2282 // of vcc back to vcc.
2283 if (ST->hasReadVCCZBug() &&
2284 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2285 // Writes to vcc while there's an outstanding smem read may get
2286 // clobbered as soon as any read completes.
2287 VCCZCorrect = false;
2288 } else {
2289 // Writes to vcc will fix any incorrect value in vccz.
2290 VCCZCorrect = true;
2291 }
2292 }
2293 }
2294
2295 if (TII->isSMRD(Inst)) {
2296 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2297 // No need to handle invariant loads when avoiding WAR conflicts, as
2298 // there cannot be a vector store to the same memory location.
2299 if (!Memop->isInvariant()) {
2300 const Value *Ptr = Memop->getValue();
2301 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2302 }
2303 }
2304 if (ST->hasReadVCCZBug()) {
2305 // This smem read could complete and clobber vccz at any time.
2306 VCCZCorrect = false;
2307 }
2308 }
2309
2310 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2311
2312#if 0 // TODO: implement resource type check controlled by options with ub = LB.
2313 // If this instruction generates a S_SETVSKIP because it is an
2314 // indexed resource, and we are on Tahiti, then it will also force
2315 // an S_WAITCNT vmcnt(0)
2316 if (RequireCheckResourceType(Inst, context)) {
2317 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
2318 ScoreBrackets->setScoreLB(LOAD_CNT,
2319 ScoreBrackets->getScoreUB(LOAD_CNT));
2320 }
2321#endif
2322
2323 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
2324 AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
2325 Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
2326 ScoreBrackets.simplifyWaitcnt(Wait);
2327 Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
2328 ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
2329 }
2330
2331 LLVM_DEBUG({
2332 Inst.print(dbgs());
2333 ScoreBrackets.dump();
2334 });
2335
2336 // TODO: Remove this work-around after fixing the scheduler and enable the
2337 // assert above.
2338 if (RestoreVCCZ) {
2339 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2340 // bit is updated, so we can restore the bit by reading the value of
2341 // vcc and then writing it back to the register.
2342 BuildMI(Block, Inst, Inst.getDebugLoc(),
2343 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2344 TRI->getVCC())
2345 .addReg(TRI->getVCC());
2346 VCCZCorrect = true;
2347 Modified = true;
2348 }
2349
2350 ++Iter;
2351 }
2352
2353 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2354 // needed.
2355 AMDGPU::Waitcnt Wait;
2356 if (Block.getFirstTerminator() == Block.end() &&
2357 isPreheaderToFlush(Block, ScoreBrackets)) {
2358 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2359 Wait.LoadCnt = 0;
2360 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2361 Wait.SampleCnt = 0;
2362 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2363 Wait.BvhCnt = 0;
2364 }
2365
2366 // Combine or remove any redundant waitcnts at the end of the block.
2367 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2368 OldWaitcntInstr);
2369
2370 return Modified;
2371}
2372
2373// Return true if the given machine basic block is a preheader of a loop in
2374// which we want to flush the vmcnt counter, and false otherwise.
2375bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
2376 WaitcntBrackets &ScoreBrackets) {
2377 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2378 if (!IsInserted)
2379 return Iterator->second;
2380
2380
2381 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2382 if (!Succ)
2383 return false;
2384
2385 MachineLoop *Loop = MLI->getLoopFor(Succ);
2386 if (!Loop)
2387 return false;
2388
2389 if (Loop->getLoopPreheader() == &MBB &&
2390 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2391 Iterator->second = true;
2392 return true;
2393 }
2394
2395 return false;
2396}
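// Note: the result is cached per block in PreheadersToFlush (the try_emplace
// above), so even though this query is made both at the block's first
// terminator and again at the end of the block, the loop scan in
// shouldFlushVmCnt runs at most once per candidate preheader.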
2397
2398bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2399 return SIInstrInfo::isVMEM(MI) ||
2400 (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
2401}
2402
2403// Return true if it is better to flush the vmcnt counter in the preheader of
2404// the given loop. We currently decide to flush in two situations:
2405// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2406// vgpr containing a value that is loaded outside of the loop. (Only on
2407// targets with no vscnt counter).
2408// 2. The loop contains vmem load(s), but the loaded values are not used in the
2409// loop, and at least one use of a vgpr containing a value that is loaded
2410// outside of the loop.
2411bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2412 WaitcntBrackets &Brackets) {
2413 bool HasVMemLoad = false;
2414 bool HasVMemStore = false;
2415 bool UsesVgprLoadedOutside = false;
2416 DenseSet<Register> VgprUse;
2417 DenseSet<Register> VgprDef;
2418
2419 for (MachineBasicBlock *MBB : ML->blocks()) {
2420 for (MachineInstr &MI : *MBB) {
2421 if (isVMEMOrFlatVMEM(MI)) {
2422 if (MI.mayLoad())
2423 HasVMemLoad = true;
2424 if (MI.mayStore())
2425 HasVMemStore = true;
2426 }
2427 for (unsigned I = 0; I < MI.getNumOperands(); I++) {
2428 MachineOperand &Op = MI.getOperand(I);
2429 if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
2430 continue;
2431 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
2432 // Vgpr use
2433 if (Op.isUse()) {
2434 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2435 // If we find a register that is loaded inside the loop, 1. and 2.
2436 // are invalidated and we can exit.
2437 if (VgprDef.contains(RegNo))
2438 return false;
2439 VgprUse.insert(RegNo);
2440 // If at least one of Op's registers is in the score brackets, the
2441 // value is likely loaded outside of the loop.
2442 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2443 Brackets.getScoreLB(LOAD_CNT) ||
2444 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2445 Brackets.getScoreLB(SAMPLE_CNT) ||
2446 Brackets.getRegScore(RegNo, BVH_CNT) >
2447 Brackets.getScoreLB(BVH_CNT)) {
2448 UsesVgprLoadedOutside = true;
2449 break;
2450 }
2451 }
2452 }
2453 // VMem load vgpr def
2454 else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
2455 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2456 // If we find a register that is loaded inside the loop, 1. and 2.
2457 // are invalidated and we can exit.
2458 if (VgprUse.contains(RegNo))
2459 return false;
2460 VgprDef.insert(RegNo);
2461 }
2462 }
2463 }
2464 }
2465 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2466 return true;
2467 return HasVMemLoad && UsesVgprLoadedOutside;
2468}
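// Illustrative scenario (hypothetical shader): a loop that keeps reading a
// VGPR filled by a global_load issued in the preheader, while the VMEM
// operations inside the loop are either stores only (case 1, on targets
// without vscnt) or loads whose results the loop never reads (case 2),
// reports true here so the preheader flushes vmcnt once instead of the loop
// body waiting on that value every iteration.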
2469
2470bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
2471 ST = &MF.getSubtarget<GCNSubtarget>();
2472 TII = ST->getInstrInfo();
2473 TRI = &TII->getRegisterInfo();
2474 MRI = &MF.getRegInfo();
2475 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2476 MLI = &getAnalysis<MachineLoopInfo>();
2477 PDT = &getAnalysis<MachinePostDominatorTree>();
2478 if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2479 AA = &AAR->getAAResults();
2480
2482
2483 if (ST->hasExtendedWaitCounts()) {
2484 MaxCounter = NUM_EXTENDED_INST_CNTS;
2485 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2486 WCG = &WCGGFX12Plus;
2487 } else {
2488 MaxCounter = NUM_NORMAL_INST_CNTS;
2489 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
2490 WCG = &WCGPreGFX12;
2491 }
2492
2493 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
2494 for (auto T : inst_counter_types())
2495 ForceEmitWaitcnt[T] = false;
2496
2497 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2498
2499 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2500
2501 HardwareLimits Limits = {};
2502 if (ST->hasExtendedWaitCounts()) {
2503 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2504 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2505 } else {
2506 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2507 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2508 }
2509 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2510 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2511 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2512 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2513 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2514
2515 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
2516 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2517 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2518 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2519
2520 RegisterEncoding Encoding = {};
2521 Encoding.VGPR0 =
2522 TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2523 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
2524 Encoding.SGPR0 =
2525 TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2526 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
2527
2528 BlockInfos.clear();
2529 bool Modified = false;
2530
2531 MachineBasicBlock &EntryBB = MF.front();
2532 MachineBasicBlock::iterator I = EntryBB.begin();
2533
2534 if (!MFI->isEntryFunction()) {
2535 // Wait for any outstanding memory operations that the input registers may
2536 // depend on. We can't track them and it's better to do the wait after the
2537 // costly call sequence.
2538
2539 // TODO: Could insert earlier and schedule more liberally with operations
2540 // that only use caller preserved registers.
2541 for (MachineBasicBlock::iterator E = EntryBB.end();
2542 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2543 ;
2544
2545 if (ST->hasExtendedWaitCounts()) {
2546 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2547 .addImm(0);
2548 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2549 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
2550 continue;
2551
2552 BuildMI(EntryBB, I, DebugLoc(),
2553 TII->get(instrsForExtendedCounterTypes[CT]))
2554 .addImm(0);
2555 }
2556 } else {
2557 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2558 }
2559
2560 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2561 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2562 SmemAccessCounter);
2563 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2564 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2565
2566 Modified = true;
2567 }
2568
2569 // Keep iterating over the blocks in reverse post order, inserting and
2570 // updating s_waitcnt where needed, until a fix point is reached.
2571 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2572 BlockInfos.insert({MBB, BlockInfo()});
2573
2574 std::unique_ptr<WaitcntBrackets> Brackets;
2575 bool Repeat;
2576 do {
2577 Repeat = false;
2578
2579 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2580 ++BII) {
2581 MachineBasicBlock *MBB = BII->first;
2582 BlockInfo &BI = BII->second;
2583 if (!BI.Dirty)
2584 continue;
2585
2586 if (BI.Incoming) {
2587 if (!Brackets)
2588 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2589 else
2590 *Brackets = *BI.Incoming;
2591 } else {
2592 if (!Brackets)
2593 Brackets = std::make_unique<WaitcntBrackets>(
2594 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2595 SmemAccessCounter);
2596 else
2597 *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
2598 WaitEventMaskForInst, SmemAccessCounter);
2599 }
2600
2601 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2602 BI.Dirty = false;
2603
2604 if (Brackets->hasPendingEvent()) {
2605 BlockInfo *MoveBracketsToSucc = nullptr;
2606 for (MachineBasicBlock *Succ : MBB->successors()) {
2607 auto SuccBII = BlockInfos.find(Succ);
2608 BlockInfo &SuccBI = SuccBII->second;
2609 if (!SuccBI.Incoming) {
2610 SuccBI.Dirty = true;
2611 if (SuccBII <= BII)
2612 Repeat = true;
2613 if (!MoveBracketsToSucc) {
2614 MoveBracketsToSucc = &SuccBI;
2615 } else {
2616 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2617 }
2618 } else if (SuccBI.Incoming->merge(*Brackets)) {
2619 SuccBI.Dirty = true;
2620 if (SuccBII <= BII)
2621 Repeat = true;
2622 }
2623 }
2624 if (MoveBracketsToSucc)
2625 MoveBracketsToSucc->Incoming = std::move(Brackets);
2626 }
2627 }
2628 } while (Repeat);
2629
2630 if (ST->hasScalarStores()) {
2631 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2632 bool HaveScalarStores = false;
2633
2634 for (MachineBasicBlock &MBB : MF) {
2635 for (MachineInstr &MI : MBB) {
2636 if (!HaveScalarStores && TII->isScalarStore(MI))
2637 HaveScalarStores = true;
2638
2639 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2640 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2641 EndPgmBlocks.push_back(&MBB);
2642 }
2643 }
2644
2645 if (HaveScalarStores) {
2646 // If scalar writes are used, the cache must be flushed or else the next
2647 // wave to reuse the same scratch memory can be clobbered.
2648 //
2649 // Insert s_dcache_wb at wave termination points if there were any scalar
2650 // stores, and only if the cache hasn't already been flushed. This could
2651 // be improved by looking across blocks for flushes in postdominating
2652 // blocks from the stores but an explicitly requested flush is probably
2653 // very rare.
2654 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2655 bool SeenDCacheWB = false;
2656
2657 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2658 I != E; ++I) {
2659 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2660 SeenDCacheWB = true;
2661 else if (TII->isScalarStore(*I))
2662 SeenDCacheWB = false;
2663
2664 // FIXME: It would be better to insert this before a waitcnt if any.
2665 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2666 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2667 !SeenDCacheWB) {
2668 Modified = true;
2669 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2670 }
2671 }
2672 }
2673 }
2674 }
2675
2676 // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2677 // instructions.
2678 for (MachineInstr *MI : ReleaseVGPRInsts) {
2679 if (ST->requiresNopBeforeDeallocVGPRs()) {
2680 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP))
2681 .addImm(0);
2682 }
2683 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2684 TII->get(AMDGPU::S_SENDMSG))
2685 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2686 Modified = true;
2687 }
2688 ReleaseVGPRInsts.clear();
2689
2690 return Modified;
2691}
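// Note on convergence: a block is re-marked Dirty only when merging this
// block's outgoing bracket state into a successor actually changes the
// successor's incoming state (or provides one for the first time), and Repeat
// is set only when such a successor precedes the current block in the
// traversal order, so the do/while loop above reaches a fixed point.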