LLVM 23.0.0git
SIInsertWaitcnts.cpp
Go to the documentation of this file.
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "AMDGPUHWEvents.h"
28#include "AMDGPUWaitcntUtils.h"
29#include "GCNSubtarget.h"
33#include "llvm/ADT/MapVector.h"
35#include "llvm/ADT/Sequence.h"
41#include "llvm/IR/Dominators.h"
44
45using namespace llvm;
46
49
50#define DEBUG_TYPE "si-insert-waitcnts"
51
52static cl::opt<bool>
53 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
54 cl::desc("Force all waitcnt instrs to be emitted as "
55 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
56 cl::init(false), cl::Hidden);
57
59 "amdgpu-waitcnt-load-forcezero",
60 cl::desc("Force all waitcnt load counters to wait until 0"),
61 cl::init(false), cl::Hidden);
62
64 "amdgpu-expert-scheduling-mode",
65 cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
66 cl::init(false), cl::Hidden);
67
68namespace {
69
/// Invoke \p EmitWaitcnt once for every count from (Outstanding - 1) down to
/// \p Target, in decreasing order, always finishing with \p Target itself.
///
/// When Outstanding is zero, or (Outstanding - 1) is not greater than
/// \p Target, only the final call with \p Target is made.
template <typename EmitWaitcntFn>
static void EmitExpandedWaitcnt(unsigned Outstanding, unsigned Target,
                                EmitWaitcntFn &&EmitWaitcnt) {
  // Nothing outstanding means there are no intermediate counts to step
  // through; this also avoids computing 0 - 1 on an unsigned value.
  if (Outstanding != 0) {
    unsigned Count = Outstanding - 1;
    while (Count > Target) {
      EmitWaitcnt(Count);
      --Count;
    }
  }
  EmitWaitcnt(Target);
}
78
/// Integer IDs used to track vector memory locations we may have to wait on.
/// Encoded as u16 chunks:
///
///   [0,            REGUNITS_END ) : MCRegUnit
///   [LDSDMA_BEGIN, LDSDMA_END   ) : LDS DMA IDs
///
/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
/// It gives (1 << 16) entries per category, which is more than enough
/// for all register units. MCPhysReg is u16 so we don't even support >u16
/// physical register numbers at this time, let alone >u16 register units.
/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
/// is enough for all register units.
using VMEMID = uint32_t;

enum : VMEMID {
  // Number of IDs reserved for each tracking category.
  TRACKINGID_RANGE_LEN = (1 << 16),

  // Register units occupy the first range. They must start at zero so that a
  // MCRegUnit and a VMEMID can be converted into one another freely.
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,

  // LDS DMA IDs follow directly after the register units. LDSDMA_BEGIN is the
  // "common" entry, which is updated for all LDS DMA operations encountered;
  // specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
  LDSDMA_BEGIN = REGUNITS_END,
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};
108
109/// Convert a MCRegUnit to a VMEMID.
110static constexpr VMEMID toVMEMID(MCRegUnit RU) {
111 return static_cast<unsigned>(RU);
112}
113
114} // namespace
115
116namespace {
117
118// Enumerate different types of result-returning VMEM operations. Although
119// s_waitcnt orders them all with a single vmcnt counter, in the absence of
120// s_waitcnt only instructions of the same VmemType are guaranteed to write
121// their results in order -- so there is no need to insert an s_waitcnt between
122// two instructions of the same type that write the same vgpr.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER = 0,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER = 1,
  // BVH instructions
  VMEM_BVH = 2,
  // Number of distinct types above; not a type itself.
  NUM_VMEM_TYPES = 3
};
132
133// Maps values of InstCounterType to the instruction that waits on that
134// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
135// returns true, and does not cover VA_VDST or VM_VSRC.
136static const unsigned
137 instrsForExtendedCounterTypes[AMDGPU::NUM_EXTENDED_INST_CNTS] = {
138 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,
139 AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
140 AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
141 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
142 AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT};
143
144// ASYNCMARK and WAIT_ASYNCMARK are meta instructions that emit no hardware
145// code but still need to be processed by this pass for async vmcnt tracking.
146static bool isNonWaitcntMetaInst(const MachineInstr &MI) {
147 switch (MI.getOpcode()) {
148 case AMDGPU::ASYNCMARK:
149 case AMDGPU::WAIT_ASYNCMARK:
150 return false;
151 default:
152 return MI.isMetaInstruction();
153 }
154}
155
156static bool updateVMCntOnly(const MachineInstr &Inst) {
157 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
159}
160
161#ifndef NDEBUG
162static bool isNormalMode(AMDGPU::InstCounterType MaxCounter) {
163 return MaxCounter == AMDGPU::NUM_NORMAL_INST_CNTS;
164}
165#endif // NDEBUG
166
167VmemType getVmemType(const MachineInstr &Inst) {
168 assert(updateVMCntOnly(Inst));
169 if (!SIInstrInfo::isImage(Inst))
170 return VMEM_NOSAMPLER;
171 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
172 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
173 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
174
175 if (BaseInfo->BVH)
176 return VMEM_BVH;
177
178 // We have to make an additional check for isVSAMPLE here since some
179 // instructions don't have a sampler, but are still classified as sampler
180 // instructions for the purposes of e.g. waitcnt.
181 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
182 return VMEM_SAMPLER;
183
184 return VMEM_NOSAMPLER;
185}
186
187class WaitcntBrackets;
188
189// This abstracts the logic for generating and updating S_WAIT* instructions
190// away from the analysis that determines where they are needed. This was
191// done because the set of counters and instructions for waiting on them
192// underwent a major shift with gfx12, sufficiently so that having this
193// abstraction allows the main analysis logic to be simpler than it would
194// otherwise have had to become.
195class WaitcntGenerator {
196protected:
197 const GCNSubtarget &ST;
198 const SIInstrInfo &TII;
199 AMDGPU::IsaVersion IV;
200 AMDGPU::InstCounterType MaxCounter;
201 bool OptNone;
202 bool ExpandWaitcntProfiling = false;
203 const AMDGPU::HardwareLimits &Limits;
204
205public:
206 WaitcntGenerator() = delete;
207 WaitcntGenerator(const WaitcntGenerator &) = delete;
208 WaitcntGenerator(const MachineFunction &MF,
209 AMDGPU::InstCounterType MaxCounter,
210 const AMDGPU::HardwareLimits &Limits)
211 : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
212 IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
213 OptNone(MF.getFunction().hasOptNone() ||
214 MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
215 ExpandWaitcntProfiling(
216 MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
217 Limits(Limits) {}
218
219 // Return true if the current function should be compiled with no
220 // optimization.
221 bool isOptNone() const { return OptNone; }
222
223 unsigned getLimit(AMDGPU::InstCounterType E) const { return Limits.get(E); }
224
225 // Edits an existing sequence of wait count instructions according
226 // to an incoming Waitcnt value, which is itself updated to reflect
227 // any new wait count instructions which may need to be generated by
228 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
229 // were made.
230 //
231 // This editing will usually be merely updated operands, but it may also
232 // delete instructions if the incoming Wait value indicates they are not
233 // needed. It may also remove existing instructions for which a wait
234 // is needed if it can be determined that it is better to generate new
235 // instructions later, as can happen on gfx12.
236 virtual bool
237 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
238 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
240
241 // Transform a soft waitcnt into a normal one.
242 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
243
244 // Generates new wait count instructions according to the value of
245 // Wait, returning true if any new instructions were created.
246 // ScoreBrackets is used for profiling expansion.
247 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
249 AMDGPU::Waitcnt Wait,
250 const WaitcntBrackets &ScoreBrackets) = 0;
251
252 // Returns the HWEventSet that corresponds to counter \p T.
253 virtual const HWEventSet &getWaitEvents(AMDGPU::InstCounterType T) const = 0;
254
255 /// \returns the counter that corresponds to event \p E.
256 AMDGPU::InstCounterType getCounterFromEvent(HWEvent E) const {
257 for (auto T : AMDGPU::inst_counter_types()) {
258 if (getWaitEvents(T).contains(E))
259 return T;
260 }
261 llvm_unreachable("event type has no associated counter");
262 }
263
264 // Returns a new waitcnt with all counters except VScnt set to 0. If
265 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
266 // AsyncCnt and TensorCnt always default to ~0u (don't wait for it). They
267 // are only updated when a call to @llvm.amdgcn.wait.asyncmark() is
268 // processed.
269 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
270
271 virtual ~WaitcntGenerator() = default;
272};
273
274class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
275 static constexpr const HWEventSet
276 WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
277 HWEventSet({HWEvent::VMEM_ACCESS, HWEvent::VMEM_SAMPLER_READ_ACCESS,
278 HWEvent::VMEM_BVH_READ_ACCESS}),
279 HWEventSet({HWEvent::SMEM_ACCESS, HWEvent::LDS_ACCESS,
280 HWEvent::GDS_ACCESS, HWEvent::SQ_MESSAGE}),
281 HWEventSet({HWEvent::EXP_GPR_LOCK, HWEvent::GDS_GPR_LOCK,
282 HWEvent::VMW_GPR_LOCK, HWEvent::EXP_PARAM_ACCESS,
283 HWEvent::EXP_POS_ACCESS, HWEvent::EXP_LDS_ACCESS}),
285 {HWEvent::VMEM_WRITE_ACCESS, HWEvent::SCRATCH_WRITE_ACCESS}),
286 HWEventSet(),
287 HWEventSet(),
288 HWEventSet(),
289 HWEventSet(),
290 HWEventSet(),
291 HWEventSet(),
292 HWEventSet(),
293 HWEventSet()};
294
295public:
296 using WaitcntGenerator::WaitcntGenerator;
297 bool
298 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
299 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
300 MachineBasicBlock::instr_iterator It) const override;
301
302 bool createNewWaitcnt(MachineBasicBlock &Block,
304 AMDGPU::Waitcnt Wait,
305 const WaitcntBrackets &ScoreBrackets) override;
306
307 const HWEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
308 return WaitEventMaskForInstPreGFX12[T];
309 }
310
311 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
312};
313
314class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
315protected:
316 bool IsExpertMode;
317 static constexpr const HWEventSet
318 WaitEventMaskForInstGFX12Plus[AMDGPU::NUM_INST_CNTS] = {
319 HWEventSet({HWEvent::VMEM_ACCESS, HWEvent::GLOBAL_INV_ACCESS}),
320 HWEventSet({HWEvent::LDS_ACCESS, HWEvent::GDS_ACCESS}),
321 HWEventSet({HWEvent::EXP_GPR_LOCK, HWEvent::GDS_GPR_LOCK,
322 HWEvent::VMW_GPR_LOCK, HWEvent::EXP_PARAM_ACCESS,
323 HWEvent::EXP_POS_ACCESS, HWEvent::EXP_LDS_ACCESS}),
325 {HWEvent::VMEM_WRITE_ACCESS, HWEvent::SCRATCH_WRITE_ACCESS}),
326 HWEventSet({HWEvent::VMEM_SAMPLER_READ_ACCESS}),
327 HWEventSet({HWEvent::VMEM_BVH_READ_ACCESS}),
329 {HWEvent::SMEM_ACCESS, HWEvent::SQ_MESSAGE, HWEvent::SCC_WRITE}),
330 HWEventSet({HWEvent::VMEM_GROUP, HWEvent::SMEM_GROUP}),
331 HWEventSet({HWEvent::ASYNC_ACCESS}),
332 HWEventSet({HWEvent::TENSOR_ACCESS}),
333 HWEventSet({HWEvent::VGPR_CSMACC_WRITE, HWEvent::VGPR_DPMACC_WRITE,
334 HWEvent::VGPR_TRANS_WRITE, HWEvent::VGPR_XDL_WRITE}),
335 HWEventSet({HWEvent::VGPR_LDS_READ, HWEvent::VGPR_FLAT_READ,
336 HWEvent::VGPR_VMEM_READ})};
337
338public:
339 WaitcntGeneratorGFX12Plus() = delete;
340 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
341 AMDGPU::InstCounterType MaxCounter,
342 const AMDGPU::HardwareLimits &Limits,
343 bool IsExpertMode)
344 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
345
346 bool
347 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
348 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
349 MachineBasicBlock::instr_iterator It) const override;
350
351 bool createNewWaitcnt(MachineBasicBlock &Block,
353 AMDGPU::Waitcnt Wait,
354 const WaitcntBrackets &ScoreBrackets) override;
355
356 const HWEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
357 return WaitEventMaskForInstGFX12Plus[T];
358 }
359
360 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
361};
362
// Records, per counter, whether that counter should be flushed in a loop
// preheader. Both flags start out cleared.
struct PreheaderFlushFlags {
  bool FlushVmCnt{false};
  bool FlushDsCnt{false};
};
368
369class SIInsertWaitcnts {
370 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
371 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
372 MachineLoopInfo &MLI;
373 MachinePostDominatorTree &PDT;
374 AliasAnalysis *AA = nullptr;
375 MachineFunction &MF;
376
377 struct BlockInfo {
378 std::unique_ptr<WaitcntBrackets> Incoming;
379 bool Dirty = true;
380 BlockInfo() = default;
381 BlockInfo(BlockInfo &&) = default;
382 BlockInfo &operator=(BlockInfo &&) = default;
383 ~BlockInfo();
384 };
385
386 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
387
388 bool ForceEmitWaitcnt[AMDGPU::NUM_INST_CNTS] = {};
389
390 std::unique_ptr<WaitcntGenerator> WCG;
391
392 // Remember call and return instructions in the function.
393 DenseSet<MachineInstr *> CallInsts;
394 DenseSet<MachineInstr *> ReturnInsts;
395
396 // Remember all S_ENDPGM instructions. The boolean flag is true if there might
397 // be outstanding stores but definitely no outstanding scratch stores, to help
398 // with insertion of DEALLOC_VGPRS messages.
399 DenseMap<MachineInstr *, bool> EndPgmInsts;
400
401 AMDGPU::HardwareLimits Limits;
402
403public:
404 const GCNSubtarget &ST;
405 const SIInstrInfo &TII;
406 const SIRegisterInfo &TRI;
407 const MachineRegisterInfo &MRI;
408 AMDGPU::InstCounterType SmemAccessCounter;
409 AMDGPU::InstCounterType MaxCounter;
410 bool IsExpertMode = false;
411
412 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
413 AliasAnalysis *AA, MachineFunction &MF)
414 : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
415 TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
416 MRI(MF.getRegInfo()) {}
417
418 const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
419
420 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
421 const WaitcntBrackets &Brackets);
422 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
423 const WaitcntBrackets &ScoreBrackets);
424 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
425 bool isDSRead(const MachineInstr &MI) const;
426 bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
427 bool run();
428
429 bool isAsync(const MachineInstr &MI) const {
431 return false;
433 return true;
434 const MachineOperand *Async =
435 TII.getNamedOperand(MI, AMDGPU::OpName::IsAsync);
436 return Async && (Async->getImm());
437 }
438
439 bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
440 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
441 }
442
443 bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
444 return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
445 }
446
447 bool shouldUpdateAsyncMark(const MachineInstr &MI,
450 return T == AMDGPU::TENSOR_CNT;
451 if (!isAsyncLdsDmaWrite(MI))
452 return false;
454 return T == AMDGPU::ASYNC_CNT;
455 return T == AMDGPU::LOAD_CNT;
456 }
457
458 bool isVmemAccess(const MachineInstr &MI) const;
459 bool generateWaitcntInstBefore(MachineInstr &MI,
460 WaitcntBrackets &ScoreBrackets,
461 MachineInstr *OldWaitcntInstr,
462 PreheaderFlushFlags FlushFlags);
463 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
465 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
466 MachineInstr *OldWaitcntInstr);
467 void updateEventWaitcntAfter(MachineInstr &Inst,
468 WaitcntBrackets *ScoreBrackets);
469 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
470 MachineBasicBlock *Block) const;
471 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
472 WaitcntBrackets &ScoreBrackets);
473 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
474 WaitcntBrackets &ScoreBrackets);
475 /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
476 /// Legalizer. Returns true if block was modified.
477 bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
478 void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
479 bool ExpertMode) const;
480 const HWEventSet &getWaitEvents(AMDGPU::InstCounterType T) const {
481 return WCG->getWaitEvents(T);
482 }
483 AMDGPU::InstCounterType getCounterFromEvent(HWEvent E) const {
484 return WCG->getCounterFromEvent(E);
485 }
486};
487
// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of events happen in the bracket, the
// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
496class WaitcntBrackets {
497public:
498 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
499 assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
500 }
501
502#ifndef NDEBUG
503 ~WaitcntBrackets() {
504 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
505 for (auto &[ID, Val] : VMem) {
506 if (Val.empty())
507 ++NumUnusedVmem;
508 }
509 for (auto &[ID, Val] : SGPRs) {
510 if (Val.empty())
511 ++NumUnusedSGPRs;
512 }
513
514 if (NumUnusedVmem || NumUnusedSGPRs) {
515 errs() << "WaitcntBracket had unused entries at destruction time: "
516 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
517 << " SGPR unused entries\n";
518 std::abort();
519 }
520 }
521#endif
522
523 bool isSmemCounter(AMDGPU::InstCounterType T) const {
524 return T == Context->SmemAccessCounter || T == AMDGPU::X_CNT;
525 }
526
527 unsigned getOutstanding(AMDGPU::InstCounterType T) const {
528 return ScoreUBs[T] - ScoreLBs[T];
529 }
530
531 bool hasPendingVMEM(VMEMID ID, AMDGPU::InstCounterType T) const {
532 return getVMemScore(ID, T) > getScoreLB(T);
533 }
534
  /// \returns true if we have no score entries for counter \p T.
536 bool empty(AMDGPU::InstCounterType T) const { return getScoreRange(T) == 0; }
537
538private:
539 unsigned getScoreLB(AMDGPU::InstCounterType T) const {
541 return ScoreLBs[T];
542 }
543
544 unsigned getScoreUB(AMDGPU::InstCounterType T) const {
546 return ScoreUBs[T];
547 }
548
549 unsigned getScoreRange(AMDGPU::InstCounterType T) const {
550 return getScoreUB(T) - getScoreLB(T);
551 }
552
553 unsigned getSGPRScore(MCRegUnit RU, AMDGPU::InstCounterType T) const {
554 auto It = SGPRs.find(RU);
555 return It != SGPRs.end() ? It->second.get(T) : 0;
556 }
557
558 unsigned getVMemScore(VMEMID TID, AMDGPU::InstCounterType T) const {
559 auto It = VMem.find(TID);
560 return It != VMem.end() ? It->second.Scores[T] : 0;
561 }
562
563public:
564 bool merge(const WaitcntBrackets &Other);
565
566 bool counterOutOfOrder(AMDGPU::InstCounterType T) const;
567 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
568 simplifyWaitcnt(Wait, Wait);
569 }
570 void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
571 AMDGPU::Waitcnt &UpdateWait) const;
572 void simplifyWaitcnt(AMDGPU::InstCounterType T, unsigned &Count) const;
573 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T) const;
574 void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
575 AMDGPU::Waitcnt &UpdateWait) const;
576 void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
577 AMDGPU::Waitcnt &UpdateWait) const;
578
579 void determineWaitForPhysReg(AMDGPU::InstCounterType T, MCPhysReg Reg,
580 AMDGPU::Waitcnt &Wait,
581 const MachineInstr &MI) const;
582 MCPhysReg determineVGPR16Dependency(const MachineInstr &MI,
584 MCPhysReg Reg) const;
585 void determineWaitForLDSDMA(AMDGPU::InstCounterType T, VMEMID TID,
586 AMDGPU::Waitcnt &Wait) const;
587 AMDGPU::Waitcnt determineAsyncWait(unsigned N);
588 void tryClearSCCWriteEvent(MachineInstr *Inst);
589
590 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
591 void applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count);
592 void applyWaitcnt(const AMDGPU::Waitcnt &Wait, AMDGPU::InstCounterType T);
593 void updateByEvent(HWEvent E, MachineInstr &MI);
594 void recordAsyncMark(MachineInstr &MI);
595
596 bool hasPendingEvent() const { return !PendingEvents.empty(); }
597 bool hasPendingEvent(HWEvent E) const { return PendingEvents.contains(E); }
598 bool hasPendingEvent(AMDGPU::InstCounterType T) const {
599 bool HasPending = PendingEvents & Context->getWaitEvents(T);
600 assert(HasPending == !empty(T) &&
601 "Expected pending events iff scoreboard is not empty");
602 return HasPending;
603 }
604
605 bool hasMixedPendingEvents(AMDGPU::InstCounterType T) const {
606 HWEventSet Events = PendingEvents & Context->getWaitEvents(T);
607 // Return true if more than one bit is set in Events.
608 return Events.twoOrMore();
609 }
610
611 bool hasPendingFlat() const {
612 return ((LastFlatDsCnt > ScoreLBs[AMDGPU::DS_CNT] &&
613 LastFlatDsCnt <= ScoreUBs[AMDGPU::DS_CNT]) ||
614 (LastFlatLoadCnt > ScoreLBs[AMDGPU::LOAD_CNT] &&
615 LastFlatLoadCnt <= ScoreUBs[AMDGPU::LOAD_CNT]));
616 }
617
618 void setPendingFlat() {
619 LastFlatLoadCnt = ScoreUBs[AMDGPU::LOAD_CNT];
620 LastFlatDsCnt = ScoreUBs[AMDGPU::DS_CNT];
621 }
622
623 bool hasPendingGDS() const {
624 return LastGDS > ScoreLBs[AMDGPU::DS_CNT] &&
625 LastGDS <= ScoreUBs[AMDGPU::DS_CNT];
626 }
627
628 unsigned getPendingGDSWait() const {
629 return std::min(getScoreUB(AMDGPU::DS_CNT) - LastGDS,
630 getLimit(AMDGPU::DS_CNT) - 1);
631 }
632
633 void setPendingGDS() { LastGDS = ScoreUBs[AMDGPU::DS_CNT]; }
634
635 // Return true if there might be pending writes to the vgpr-interval by VMEM
636 // instructions with types different from V.
637 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
638 for (MCRegUnit RU : regunits(Reg)) {
639 auto It = VMem.find(toVMEMID(RU));
640 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
641 return true;
642 }
643 return false;
644 }
645
646 void clearVgprVmemTypes(MCPhysReg Reg) {
647 for (MCRegUnit RU : regunits(Reg)) {
648 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
649 It->second.VMEMTypes = 0;
650 if (It->second.empty())
651 VMem.erase(It);
652 }
653 }
654 }
655
656 void setStateOnFunctionEntryOrReturn() {
657 setScoreUB(AMDGPU::STORE_CNT,
658 getScoreUB(AMDGPU::STORE_CNT) + getLimit(AMDGPU::STORE_CNT));
659 PendingEvents |= Context->getWaitEvents(AMDGPU::STORE_CNT);
660 }
661
662 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
663 return LDSDMAStores;
664 }
665
666 bool hasPointSampleAccel(const MachineInstr &MI) const;
667 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
668 MCPhysReg RU) const;
669
670 void print(raw_ostream &) const;
671 void dump() const { print(dbgs()); }
672
673 // Free up memory by removing empty entries from the DenseMap that track event
674 // scores.
675 void purgeEmptyTrackingData();
676
677private:
678 unsigned getLimit(AMDGPU::InstCounterType T) const {
679 return Context->getLimits().get(T);
680 }
681
682 struct MergeInfo {
683 unsigned OldLB;
684 unsigned OtherLB;
685 unsigned MyShift;
686 unsigned OtherShift;
687 };
688
689 using CounterValueArray = std::array<unsigned, AMDGPU::NUM_INST_CNTS>;
690
691 void determineWaitForScore(AMDGPU::InstCounterType T, unsigned Score,
692 AMDGPU::Waitcnt &Wait) const;
693
694 static bool mergeScore(const MergeInfo &M, unsigned &Score,
695 unsigned OtherScore);
696 bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
697 ArrayRef<CounterValueArray> OtherMarks);
698
700 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
701 if (!Context->TRI.isInAllocatableClass(Reg))
702 return {{}, {}};
703 return Context->TRI.regunits(Reg);
704 }
705
706 void setScoreLB(AMDGPU::InstCounterType T, unsigned Val) {
708 ScoreLBs[T] = Val;
709 }
710
711 void setScoreUB(AMDGPU::InstCounterType T, unsigned Val) {
713 ScoreUBs[T] = Val;
714
715 if (T != AMDGPU::EXP_CNT)
716 return;
717
718 if (getScoreRange(AMDGPU::EXP_CNT) > getLimit(AMDGPU::EXP_CNT))
719 ScoreLBs[AMDGPU::EXP_CNT] =
720 ScoreUBs[AMDGPU::EXP_CNT] - getLimit(AMDGPU::EXP_CNT);
721 }
722
723 void setRegScore(MCPhysReg Reg, AMDGPU::InstCounterType T, unsigned Val) {
724 const SIRegisterInfo &TRI = Context->TRI;
725 if (Reg == AMDGPU::SCC) {
726 SCCScore = Val;
727 } else if (TRI.isVectorRegister(Context->MRI, Reg)) {
728 for (MCRegUnit RU : regunits(Reg))
729 VMem[toVMEMID(RU)].Scores[T] = Val;
730 } else if (TRI.isSGPRReg(Context->MRI, Reg)) {
731 for (MCRegUnit RU : regunits(Reg))
732 SGPRs[RU].get(T) = Val;
733 } else {
734 llvm_unreachable("Register cannot be tracked/unknown register!");
735 }
736 }
737
738 void setVMemScore(VMEMID TID, AMDGPU::InstCounterType T, unsigned Val) {
739 VMem[TID].Scores[T] = Val;
740 }
741
742 void setScoreByOperand(const MachineOperand &Op,
743 AMDGPU::InstCounterType CntTy, unsigned Val);
744
745 const SIInsertWaitcnts *Context;
746
747 unsigned ScoreLBs[AMDGPU::NUM_INST_CNTS] = {0};
748 unsigned ScoreUBs[AMDGPU::NUM_INST_CNTS] = {0};
749 HWEventSet PendingEvents;
750 // Remember the last flat memory operation.
751 unsigned LastFlatDsCnt = 0;
752 unsigned LastFlatLoadCnt = 0;
753 // Remember the last GDS operation.
754 unsigned LastGDS = 0;
755
756 // The score tracking logic is fragmented as follows:
757 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
758 // - SGPRs: SGPR RegUnits
759 // - SCC: Non-allocatable and not general purpose: not a SGPR.
760 //
761 // For the VMem case, if the key is within the range of LDS DMA IDs,
762 // then the corresponding index into the `LDSDMAStores` vector below is:
763 // Key - LDSDMA_BEGIN - 1
764 // This is because LDSDMA_BEGIN is a generic entry and does not have an
765 // associated MachineInstr.
766 //
  // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
768
769 struct VMEMInfo {
770 // Scores for all instruction counters. Zero-initialized.
771 CounterValueArray Scores{};
772 // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
773 unsigned VMEMTypes = 0;
774
775 bool empty() const { return all_of(Scores, equal_to(0)) && !VMEMTypes; }
776 };
777
778 /// Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
779 /// pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
780 class SGPRInfo {
781 /// Either DS_CNT or KM_CNT score.
782 unsigned ScoreDsKmCnt = 0;
783 unsigned ScoreXCnt = 0;
784
785 public:
786 unsigned get(AMDGPU::InstCounterType T) const {
787 assert(
788 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
789 "Invalid counter");
790 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
791 }
792 unsigned &get(AMDGPU::InstCounterType T) {
793 assert(
794 (T == AMDGPU::DS_CNT || T == AMDGPU::KM_CNT || T == AMDGPU::X_CNT) &&
795 "Invalid counter");
796 return T == AMDGPU::X_CNT ? ScoreXCnt : ScoreDsKmCnt;
797 }
798
799 bool empty() const { return !ScoreDsKmCnt && !ScoreXCnt; }
800 };
801
802 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
803 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
804
805 // Reg score for SCC.
806 unsigned SCCScore = 0;
807 // The unique instruction that has an SCC write pending, if there is one.
808 const MachineInstr *PendingSCCWrite = nullptr;
809
810 // Store representative LDS DMA operations. The only useful info here is
811 // alias info. One store is kept per unique AAInfo.
812 SmallVector<const MachineInstr *> LDSDMAStores;
813
814 // State of all counters at each async mark encountered so far.
816
817 // But in the rare pathological case, a nest of loops that pushes marks
818 // without waiting on any mark can cause AsyncMarks to grow very large. We cap
819 // it to a reasonable limit. We can tune this later or potentially introduce a
820 // user option to control the value.
821 static constexpr unsigned MaxAsyncMarks = 16;
822
823 // Track the upper bound score for async operations that are not part of a
824 // mark yet. Initialized to all zeros.
825 CounterValueArray AsyncScore{};
826};
827
828SIInsertWaitcnts::BlockInfo::~BlockInfo() = default;
829
830class SIInsertWaitcntsLegacy : public MachineFunctionPass {
831public:
832 static char ID;
833 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
834
835 bool runOnMachineFunction(MachineFunction &MF) override;
836
837 StringRef getPassName() const override {
838 return "SI insert wait instructions";
839 }
840
841 void getAnalysisUsage(AnalysisUsage &AU) const override {
842 AU.setPreservesCFG();
843 AU.addRequired<MachineLoopInfoWrapperPass>();
844 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
845 AU.addUsedIfAvailable<AAResultsWrapperPass>();
846 AU.addPreserved<AAResultsWrapperPass>();
848 }
849};
850
851} // end anonymous namespace
852
853void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
855 unsigned Score) {
856 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
857}
858
859// Return true if the subtarget is one that enables Point Sample Acceleration
860// and the MachineInstr passed in is one to which it might be applied (the
861// hardware makes this decision based on several factors, but we can't determine
862// this at compile time, so we have to assume it might be applied if the
863// instruction supports it).
864bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
865 if (!Context->ST.hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
866 return false;
867
868 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
869 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
871 return BaseInfo->PointSampleAccel;
872}
873
874// Return true if the subtarget enables Point Sample Acceleration, the supplied
875// MachineInstr is one to which it might be applied and the supplied interval is
876// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
877// (this is the type that a point sample accelerated instruction effectively
878// becomes)
879bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
880 MCPhysReg Reg) const {
881 if (!hasPointSampleAccel(MI))
882 return false;
883
884 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
885}
886
// Record that \p Inst issued hardware event \p E: mark E as pending, advance
// the upper-bound score of the counter that tracks E, and attach the new score
// to the registers (or LDS-DMA slots) selected for that counter below.
// NOTE(review): this listing appears to have lost source lines 893 and 1087
// (the opening `if` conditions of two blocks below) -- restore them from the
// upstream file before relying on this text.
887void WaitcntBrackets::updateByEvent(HWEvent E, MachineInstr &Inst) {
888 AMDGPU::InstCounterType T = Context->getCounterFromEvent(E);
889 assert(T < Context->MaxCounter);
890
891 unsigned UB = getScoreUB(T);
892 unsigned Increment = 1;
894 Context->ST.hasVOP3PX2IncrementsVaVdstTwice()) {
895 // V_WMMA_SCALE instructions use VOP3PX2 encoding. Hardware treats this as
896 // two VOP3P instructions and increments VA_VDST twice.
897 Increment = 2;
898 }
899 unsigned CurrScore = UB + Increment;
// NOTE(review): only a wrap to exactly 0 is detected here; with Increment == 2
// the sum could wrap to 1 instead -- confirm whether that can happen.
900 if (CurrScore == 0)
901 report_fatal_error("InsertWaitcnt score wraparound");
902 // PendingEvents and ScoreUB need to be updated regardless of whether this
903 // event changes the score of a register or not.
904 // Examples include vm_cnt when buffer-store or lgkm_cnt when send-message.
905 PendingEvents.insert(E);
906 setScoreUB(T, CurrScore);
907
908 const SIRegisterInfo &TRI = Context->TRI;
909 const MachineRegisterInfo &MRI = Context->MRI;
910 const SIInstrInfo &TII = Context->TII;
911
912 if (T == AMDGPU::EXP_CNT) {
913 // Put score on the source vgprs. If this is a store, just use those
914 // specific register(s).
915 if (TII.isDS(Inst) && Inst.mayLoadOrStore()) {
916 // All GDS operations must protect their address register (same as
917 // export.)
918 if (const auto *AddrOp = TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
919 setScoreByOperand(*AddrOp, AMDGPU::EXP_CNT, CurrScore);
920
921 if (Inst.mayStore()) {
922 if (const auto *Data0 =
923 TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
924 setScoreByOperand(*Data0, AMDGPU::EXP_CNT, CurrScore);
925 if (const auto *Data1 =
926 TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
927 setScoreByOperand(*Data1, AMDGPU::EXP_CNT, CurrScore);
928 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
929 Inst.getOpcode() != AMDGPU::DS_APPEND &&
930 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
931 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
932 for (const MachineOperand &Op : Inst.all_uses()) {
933 if (TRI.isVectorRegister(MRI, Op.getReg()))
934 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
935 }
936 }
937 } else if (TII.isFLAT(Inst)) {
938 if (Inst.mayStore()) {
939 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
940 AMDGPU::EXP_CNT, CurrScore);
941 } else if (SIInstrInfo::isAtomicRet(Inst)) {
942 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
943 AMDGPU::EXP_CNT, CurrScore);
944 }
945 } else if (TII.isMIMG(Inst)) {
946 if (Inst.mayStore()) {
947 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
948 } else if (SIInstrInfo::isAtomicRet(Inst)) {
949 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
950 AMDGPU::EXP_CNT, CurrScore);
951 }
952 } else if (TII.isMTBUF(Inst)) {
953 if (Inst.mayStore())
954 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
955 } else if (TII.isMUBUF(Inst)) {
956 if (Inst.mayStore()) {
957 setScoreByOperand(Inst.getOperand(0), AMDGPU::EXP_CNT, CurrScore);
958 } else if (SIInstrInfo::isAtomicRet(Inst)) {
959 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
960 AMDGPU::EXP_CNT, CurrScore);
961 }
962 } else if (TII.isLDSDIR(Inst)) {
963 // LDSDIR instructions attach the score to the destination.
964 setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
965 AMDGPU::EXP_CNT, CurrScore);
966 } else {
967 if (TII.isEXP(Inst)) {
968 // For export the destination registers are really temps that
969 // can be used as the actual source after export patching, so
970 // we need to treat them like sources and set the EXP_CNT
971 // score.
972 for (MachineOperand &DefMO : Inst.all_defs()) {
973 if (TRI.isVGPR(MRI, DefMO.getReg())) {
974 setScoreByOperand(DefMO, AMDGPU::EXP_CNT, CurrScore);
975 }
976 }
977 }
978 for (const MachineOperand &Op : Inst.all_uses()) {
979 if (TRI.isVectorRegister(MRI, Op.getReg()))
980 setScoreByOperand(Op, AMDGPU::EXP_CNT, CurrScore);
981 }
982 }
983 } else if (T == AMDGPU::X_CNT) {
984 HWEvent OtherEvent =
985 E == HWEvent::SMEM_GROUP ? HWEvent::VMEM_GROUP : HWEvent::SMEM_GROUP;
986 if (PendingEvents.contains(OtherEvent)) {
987 // Hardware inserts an implicit xcnt between interleaved
988 // SMEM and VMEM operations. So there will never be
989 // outstanding address translations for both SMEM and
990 // VMEM at the same time.
991 setScoreLB(T, getScoreUB(T) - 1);
992 PendingEvents.remove(OtherEvent);
993 }
994 for (const MachineOperand &Op : Inst.all_uses())
995 setScoreByOperand(Op, T, CurrScore);
996 } else if (T == AMDGPU::VA_VDST || T == AMDGPU::VM_VSRC) {
997 // Match the score to the VGPR destination or source registers as
998 // appropriate: VA_VDST scores the defs, VM_VSRC scores the uses.
999 for (const MachineOperand &Op : Inst.operands()) {
1000 if (!Op.isReg() || (T == AMDGPU::VA_VDST && Op.isUse()) ||
1001 (T == AMDGPU::VM_VSRC && Op.isDef()))
1002 continue;
1003 if (TRI.isVectorRegister(Context->MRI, Op.getReg()))
1004 setScoreByOperand(Op, T, CurrScore);
1005 }
// NOTE(review): the counter list in the inline comment below looks stale --
// EXP_CNT is handled by the first branch, and the body tests LOAD_CNT,
// SAMPLE_CNT and BVH_CNT.
1006 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1007 // Match the score to the destination registers.
1008 //
1009 // Check only explicit operands. Stores, especially spill stores, include
1010 // implicit uses and defs of their super registers which would create an
1011 // artificial dependency, while these are there only for register liveness
1012 // accounting purposes.
1013 //
1014 // Special cases where implicit register defs exists, such as M0 or VCC,
1015 // but none with memory instructions.
1016 for (const MachineOperand &Op : Inst.defs()) {
1017 if (T == AMDGPU::LOAD_CNT || T == AMDGPU::SAMPLE_CNT ||
1018 T == AMDGPU::BVH_CNT) {
1019 if (!TRI.isVectorRegister(MRI, Op.getReg())) // TODO: add wrapper
1020 continue;
1021 if (updateVMCntOnly(Inst)) {
1022 // updateVMCntOnly should only leave us with VGPRs
1023 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1024 // defs. That's required for a sane index into `VgprMemTypes` below
1025 assert(TRI.isVectorRegister(MRI, Op.getReg()));
1026 VmemType V = getVmemType(Inst);
1027 unsigned char TypesMask = 1 << V;
1028 // If instruction can have Point Sample Accel applied, we have to flag
1029 // this with another potential dependency
1030 if (hasPointSampleAccel(Inst))
1031 TypesMask |= 1 << VMEM_NOSAMPLER;
1032 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1033 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1034 }
1035 }
1036 setScoreByOperand(Op, T, CurrScore);
1037 }
1038 if (Inst.mayStore() &&
1039 (TII.isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
1040 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1041 // written can be accessed. A load from LDS to VMEM does not need a wait.
1042 //
1043 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1044 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1045 // store. The "Slot" is the index into LDSDMAStores + 1.
1046 unsigned Slot = 0;
1047 for (const auto *MemOp : Inst.memoperands()) {
1048 if (!MemOp->isStore() ||
1049 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1050 continue;
1051 // Comparing just AA info does not guarantee memoperands are equal
1052 // in general, but this is so for LDS DMA in practice.
1053 auto AAI = MemOp->getAAInfo();
1054 // Alias scope information gives a way to definitely identify an
1055 // original memory object and practically produced in the module LDS
1056 // lowering pass. If there is no scope available we will not be able
1057 // to disambiguate LDS aliasing as after the module lowering all LDS
1058 // is squashed into a single big object.
1059 if (!AAI || !AAI.Scope)
1060 break;
// Look for an already tracked LDS DMA store with the same AA info. Note that
// the inner `MemOp` below shadows the outer loop variable of the same name.
1061 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1062 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1063 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1064 Slot = I + 1;
1065 break;
1066 }
1067 }
1068 }
1069 if (Slot)
1070 break;
1071 // The slot may not be valid because it can be >= NUM_LDSDMA which
1072 // means the scoreboard cannot track it. We still want to preserve the
1073 // MI in order to check alias information, though.
1074 LDSDMAStores.push_back(&Inst);
1075 Slot = LDSDMAStores.size();
1076 break;
1077 }
// The generic LDSDMA_BEGIN entry is always updated; the per-store slot is
// updated only when one was found or allocated within the trackable range.
1078 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1079 if (Slot && Slot < NUM_LDSDMA)
1080 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1081 }
1082
1083 if (Context->shouldUpdateAsyncMark(Inst, T)) {
1084 AsyncScore[T] = CurrScore;
1085 }
1086
// NOTE(review): source line 1087 (the condition guarding the SCC update below)
// is absent from this listing.
1088 setRegScore(AMDGPU::SCC, T, CurrScore);
1089 PendingSCCWrite = &Inst;
1090 }
1091 }
1092}
1093
1094void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1095 // In the absence of loops, AsyncMarks can grow linearly with the program
1096 // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a
1097 // limit every time we push a new mark, but that seems like unnecessary work
1098 // in practical cases. We do separately truncate the array when processing a
1099 // loop, which should be sufficient.
1100 AsyncMarks.push_back(AsyncScore);
1101 AsyncScore = {};
1102 LLVM_DEBUG({
1103 dbgs() << "recordAsyncMark:\n" << Inst;
1104 for (const auto &Mark : AsyncMarks) {
1105 llvm::interleaveComma(Mark, dbgs());
1106 dbgs() << '\n';
1107 }
1108 });
1109}
1110
1111void WaitcntBrackets::print(raw_ostream &OS) const {
1112 const GCNSubtarget &ST = Context->ST;
1113
1114 for (auto T : inst_counter_types(Context->MaxCounter)) {
1115 unsigned SR = getScoreRange(T);
1116 switch (T) {
1117 case AMDGPU::LOAD_CNT:
1118 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1119 << SR << "):";
1120 break;
1121 case AMDGPU::DS_CNT:
1122 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1123 << SR << "):";
1124 break;
1125 case AMDGPU::EXP_CNT:
1126 OS << " EXP_CNT(" << SR << "):";
1127 break;
1128 case AMDGPU::STORE_CNT:
1129 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1130 << SR << "):";
1131 break;
1132 case AMDGPU::SAMPLE_CNT:
1133 OS << " SAMPLE_CNT(" << SR << "):";
1134 break;
1135 case AMDGPU::BVH_CNT:
1136 OS << " BVH_CNT(" << SR << "):";
1137 break;
1138 case AMDGPU::KM_CNT:
1139 OS << " KM_CNT(" << SR << "):";
1140 break;
1141 case AMDGPU::X_CNT:
1142 OS << " X_CNT(" << SR << "):";
1143 break;
1144 case AMDGPU::ASYNC_CNT:
1145 OS << " ASYNC_CNT(" << SR << "):";
1146 break;
1147 case AMDGPU::VA_VDST:
1148 OS << " VA_VDST(" << SR << "): ";
1149 break;
1150 case AMDGPU::VM_VSRC:
1151 OS << " VM_VSRC(" << SR << "): ";
1152 break;
1153 default:
1154 OS << " UNKNOWN(" << SR << "):";
1155 break;
1156 }
1157
1158 if (SR != 0) {
1159 // Print vgpr scores.
1160 unsigned LB = getScoreLB(T);
1161
1162 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1163 sort(SortedVMEMIDs);
1164
1165 for (auto ID : SortedVMEMIDs) {
1166 unsigned RegScore = VMem.at(ID).Scores[T];
1167 if (RegScore <= LB)
1168 continue;
1169 unsigned RelScore = RegScore - LB - 1;
1170 if (ID < REGUNITS_END) {
1171 OS << ' ' << RelScore << ":vRU" << ID;
1172 } else {
1173 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1174 "Unhandled/unexpected ID value!");
1175 OS << ' ' << RelScore << ":LDSDMA" << ID;
1176 }
1177 }
1178
1179 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1180 if (isSmemCounter(T)) {
1181 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1182 sort(SortedSMEMIDs);
1183 for (auto ID : SortedSMEMIDs) {
1184 unsigned RegScore = SGPRs.at(ID).get(T);
1185 if (RegScore <= LB)
1186 continue;
1187 unsigned RelScore = RegScore - LB - 1;
1188 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1189 }
1190 }
1191
1192 if (T == AMDGPU::KM_CNT && SCCScore > 0)
1193 OS << ' ' << SCCScore << ":scc";
1194 }
1195 OS << '\n';
1196 }
1197
1198 OS << "Pending Events: ";
1199 if (hasPendingEvent()) {
1200 ListSeparator LS;
1201 for (auto E : AMDGPU::hw_events()) {
1202 if (hasPendingEvent(E)) {
1203 OS << LS << AMDGPU::toString(E);
1204 }
1205 }
1206 } else {
1207 OS << "none";
1208 }
1209 OS << '\n';
1210
1211 OS << "Async score: ";
1212 if (AsyncScore.empty())
1213 OS << "none";
1214 else
1215 llvm::interleaveComma(AsyncScore, OS);
1216 OS << '\n';
1217
1218 OS << "Async marks: " << AsyncMarks.size() << '\n';
1219
1220 for (const auto &Mark : AsyncMarks) {
1221 for (auto T : AMDGPU::inst_counter_types()) {
1222 unsigned MarkedScore = Mark[T];
1223 switch (T) {
1224 case AMDGPU::LOAD_CNT:
1225 OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM")
1226 << "_CNT: " << MarkedScore;
1227 break;
1228 case AMDGPU::DS_CNT:
1229 OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM")
1230 << "_CNT: " << MarkedScore;
1231 break;
1232 case AMDGPU::EXP_CNT:
1233 OS << " EXP_CNT: " << MarkedScore;
1234 break;
1235 case AMDGPU::STORE_CNT:
1236 OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS")
1237 << "_CNT: " << MarkedScore;
1238 break;
1239 case AMDGPU::SAMPLE_CNT:
1240 OS << " SAMPLE_CNT: " << MarkedScore;
1241 break;
1242 case AMDGPU::BVH_CNT:
1243 OS << " BVH_CNT: " << MarkedScore;
1244 break;
1245 case AMDGPU::KM_CNT:
1246 OS << " KM_CNT: " << MarkedScore;
1247 break;
1248 case AMDGPU::X_CNT:
1249 OS << " X_CNT: " << MarkedScore;
1250 break;
1251 case AMDGPU::ASYNC_CNT:
1252 OS << " ASYNC_CNT: " << MarkedScore;
1253 break;
1254 default:
1255 OS << " UNKNOWN: " << MarkedScore;
1256 break;
1257 }
1258 }
1259 OS << '\n';
1260 }
1261 OS << '\n';
1262}
1263
1264/// Simplify \p UpdateWait by removing waits that are redundant based on the
1265/// current WaitcntBrackets and any other waits specified in \p CheckWait.
1266void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
1267 AMDGPU::Waitcnt &UpdateWait) const {
1268 simplifyWaitcnt(UpdateWait, AMDGPU::LOAD_CNT);
1269 simplifyWaitcnt(UpdateWait, AMDGPU::EXP_CNT);
1270 simplifyWaitcnt(UpdateWait, AMDGPU::DS_CNT);
1271 simplifyWaitcnt(UpdateWait, AMDGPU::STORE_CNT);
1272 simplifyWaitcnt(UpdateWait, AMDGPU::SAMPLE_CNT);
1273 simplifyWaitcnt(UpdateWait, AMDGPU::BVH_CNT);
1274 simplifyWaitcnt(UpdateWait, AMDGPU::KM_CNT);
1275 simplifyXcnt(CheckWait, UpdateWait);
1276 simplifyWaitcnt(UpdateWait, AMDGPU::VA_VDST);
1277 simplifyVmVsrc(CheckWait, UpdateWait);
1278 simplifyWaitcnt(UpdateWait, AMDGPU::ASYNC_CNT);
1279}
1280
1281void WaitcntBrackets::simplifyWaitcnt(AMDGPU::InstCounterType T,
1282 unsigned &Count) const {
1283 // The number of outstanding events for this type, T, can be calculated
1284 // as (UB - LB). If the current Count is greater than or equal to the number
1285 // of outstanding events, then the wait for this counter is redundant.
1286 if (Count >= getScoreRange(T))
1287 Count = ~0u;
1288}
1289
1290void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait,
1291 AMDGPU::InstCounterType T) const {
1292 unsigned Cnt = Wait.get(T);
1293 simplifyWaitcnt(T, Cnt);
1294 Wait.set(T, Cnt);
1295}
1296
1297void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
1298 AMDGPU::Waitcnt &UpdateWait) const {
1299 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1300 // optimizations. On entry to a block with multiple predescessors, there may
1301 // be pending SMEM and VMEM events active at the same time.
1302 // In such cases, only clear one active event at a time.
1303 // TODO: Revisit xcnt optimizations for gfx1250.
1304 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1305 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1306 // zero.
1307 if (CheckWait.get(AMDGPU::KM_CNT) == 0 &&
1308 hasPendingEvent(HWEvent::SMEM_GROUP))
1309 UpdateWait.set(AMDGPU::X_CNT, ~0u);
1310 // If we have pending store we cannot optimize XCnt because we do not wait for
1311 // stores. VMEM loads retun in order, so if we only have loads XCnt is
1312 // decremented to the same number as LOADCnt.
1313 if (CheckWait.get(AMDGPU::LOAD_CNT) != ~0u &&
1314 hasPendingEvent(HWEvent::VMEM_GROUP) &&
1315 !hasPendingEvent(AMDGPU::STORE_CNT) &&
1316 CheckWait.get(AMDGPU::X_CNT) >= CheckWait.get(AMDGPU::LOAD_CNT))
1317 UpdateWait.set(AMDGPU::X_CNT, ~0u);
1318 simplifyWaitcnt(UpdateWait, AMDGPU::X_CNT);
1319}
1320
1321void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
1322 AMDGPU::Waitcnt &UpdateWait) const {
1323 // Waiting for some counters implies waiting for VM_VSRC, since an
1324 // instruction that decrements a counter on completion would have
1325 // decremented VM_VSRC once its VGPR operands had been read.
1326 if (CheckWait.get(AMDGPU::VM_VSRC) >=
1327 std::min({CheckWait.get(AMDGPU::LOAD_CNT),
1328 CheckWait.get(AMDGPU::STORE_CNT),
1329 CheckWait.get(AMDGPU::SAMPLE_CNT),
1330 CheckWait.get(AMDGPU::BVH_CNT), CheckWait.get(AMDGPU::DS_CNT)}))
1331 UpdateWait.set(AMDGPU::VM_VSRC, ~0u);
1332 simplifyWaitcnt(UpdateWait, AMDGPU::VM_VSRC);
1333}
1334
1335void WaitcntBrackets::purgeEmptyTrackingData() {
1336 VMem.remove_if([](const auto &P) { return P.second.empty(); });
1337 SGPRs.remove_if([](const auto &P) { return P.second.empty(); });
1338}
1339
1340void WaitcntBrackets::determineWaitForScore(AMDGPU::InstCounterType T,
1341 unsigned ScoreToWait,
1342 AMDGPU::Waitcnt &Wait) const {
1343 const unsigned LB = getScoreLB(T);
1344 const unsigned UB = getScoreUB(T);
1345
1346 // If the score falls within the bracket, we need a waitcnt.
1347 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1348 if ((T == AMDGPU::LOAD_CNT || T == AMDGPU::DS_CNT) && hasPendingFlat() &&
1349 !Context->ST.hasFlatLgkmVMemCountInOrder()) {
1350 // If there is a pending FLAT operation, and this is a VMem or LGKM
1351 // waitcnt and the target can report early completion, then we need
1352 // to force a waitcnt 0.
1353 Wait.add(T, 0);
1354 } else if (counterOutOfOrder(T)) {
1355 // Counter can get decremented out-of-order when there
1356 // are multiple types event in the bracket. Also emit an s_wait counter
1357 // with a conservative value of 0 for the counter.
1358 Wait.add(T, 0);
1359 } else {
1360 // If a counter has been maxed out avoid overflow by waiting for
1361 // MAX(CounterType) - 1 instead.
1362 unsigned NeededWait = std::min(UB - ScoreToWait, getLimit(T) - 1);
1363 Wait.add(T, NeededWait);
1364 }
1365 }
1366}
1367
// Compute the wait needed so that at most \p N recorded async marks remain
// outstanding, and drop the mark that is waited on together with all older
// marks. Returns an empty Waitcnt when no more than N marks are recorded.
// NOTE(review): source line 1395 (presumably the loop header that iterates T
// over the counter types) is absent from this listing -- confirm upstream.
1368AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
1369 LLVM_DEBUG({
1370 dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
1371 << ":\n";
1372 for (const auto &Mark : AsyncMarks) {
1373 llvm::interleaveComma(Mark, dbgs());
1374 dbgs() << '\n';
1375 }
1376 });
1377
1378 if (AsyncMarks.size() == MaxAsyncMarks) {
1379 // Enforcing MaxAsyncMarks here is unnecessary work because the size of
1380 // AsyncMarks is linear when traversing straightline code. But we do
1381 // need to check if truncation may have occurred at a merge, and adjust N
1382 // to ensure that a wait is generated.
1383 LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
1384 N = std::min(N, (unsigned)MaxAsyncMarks - 1);
1385 }
1386
1387 AMDGPU::Waitcnt Wait;
1388 if (AsyncMarks.size() <= N) {
1389 LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
1390 return Wait;
1391 }
1392
// The mark to wait for is the (N+1)-th newest one; AsyncMarks.size() > N holds
// here, so the index cannot underflow.
1393 size_t MarkIndex = AsyncMarks.size() - N - 1;
1394 const auto &RequiredMark = AsyncMarks[MarkIndex];
1396 determineWaitForScore(T, RequiredMark[T], Wait);
1397
1398 // Immediately remove the waited mark and all older ones
1399 // This happens BEFORE the wait is actually inserted, which is fine
1400 // because we've already extracted the wait requirements
1401 LLVM_DEBUG({
1402 dbgs() << "Removing " << (MarkIndex + 1)
1403 << " async marks after determining wait\n";
1404 });
1405 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1406
1407 LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
1408 return Wait;
1409}
1410
1411// With D16Write32BitVgpr, D16 inst might be clobbered by events running on the
1412// other half 16bit.
1413//
1414// Replace VGPR16 to VGPR32 for wait check if:
1415// 1. MI is a VALU, and there is a wait event on the other half
1416// 2. MI is a LdSt, and there is a wait event on the other half from different
1417// order group
1418MCPhysReg WaitcntBrackets::determineVGPR16Dependency(const MachineInstr &MI,
1420 MCPhysReg Reg) const {
1421 const TargetRegisterClass *RC = Context->TRI.getPhysRegBaseClass(Reg);
1422 unsigned Size = Context->TRI.getRegSizeInBits(*RC);
1423
1424 if (Size != 16 || !Context->ST.hasD16Writes32BitVgpr())
1425 return Reg;
1426
1427 // With D16Writes32BitVgpr, D16 Inst might clobber the whole vgpr32
1428 // check dependency on the other half
1429 Register Reg32 = Context->TRI.get32BitRegister(Reg);
1430 Register OtherHalf = Context->TRI.getSubReg(
1431 Reg32,
1432 AMDGPU::isHi16Reg(Reg, Context->TRI) ? AMDGPU::lo16 : AMDGPU::hi16);
1433
1434 AMDGPU::Waitcnt Wait;
1435 for (MCRegUnit RU : regunits(OtherHalf))
1436 determineWaitForScore(T, getVMemScore(toVMEMID(RU), T), Wait);
1437
1438 // No wait on otherhalf
1439 if (!Wait.hasWait())
1440 return Reg;
1441
1442 if (Context->TII.isVALU(MI, /*AllowLDSDMA=*/true))
1443 return Reg32;
1444
1445 // If hi/lo16 mixed events
1446 HWEventSet MIEvents =
1447 AMDGPU::getEventsFor(MI, Context->ST, Context->IsExpertMode);
1448 HWEventSet OtherHalfEvents = Context->getWaitEvents(T);
1449 HWEventSet Events = MIEvents & OtherHalfEvents;
1450 if (Events.twoOrMore())
1451 return Reg32;
1452 return Reg;
1453}
1454
1455void WaitcntBrackets::determineWaitForPhysReg(AMDGPU::InstCounterType T,
1456 MCPhysReg Reg,
1457 AMDGPU::Waitcnt &Wait,
1458 const MachineInstr &MI) const {
1459 if (Reg == AMDGPU::SCC) {
1460 determineWaitForScore(T, SCCScore, Wait);
1461 } else {
1462 bool IsVGPR = Context->TRI.isVectorRegister(Context->MRI, Reg);
1463 if (IsVGPR)
1464 Reg = determineVGPR16Dependency(MI, T, Reg);
1465 for (MCRegUnit RU : regunits(Reg))
1466 determineWaitForScore(
1467 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1468 Wait);
1469 }
1470}
1471
1472void WaitcntBrackets::determineWaitForLDSDMA(AMDGPU::InstCounterType T,
1473 VMEMID TID,
1474 AMDGPU::Waitcnt &Wait) const {
1475 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1476 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1477}
1478
1479void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1480 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1481 // SCC has landed
1482 if (PendingSCCWrite &&
1483 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1484 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1485 HWEventSet SCC_WRITE_PendingEvent(HWEvent::SCC_WRITE);
1486 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1487 if ((PendingEvents & Context->getWaitEvents(AMDGPU::KM_CNT)) ==
1488 SCC_WRITE_PendingEvent) {
1489 setScoreLB(AMDGPU::KM_CNT, getScoreUB(AMDGPU::KM_CNT));
1490 }
1491
1492 PendingEvents.remove(SCC_WRITE_PendingEvent);
1493 PendingSCCWrite = nullptr;
1494 }
1495}
1496
// Apply the counter values carried by \p Wait to the scoreboard by forwarding
// to applyWaitcnt(Wait, T).
// NOTE(review): source line 1498 is absent from this listing; presumably it is
// the loop header that iterates T over the counter types -- confirm upstream.
1497void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1499 applyWaitcnt(Wait, T);
1500}
1501
1502void WaitcntBrackets::applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count) {
1503 const unsigned UB = getScoreUB(T);
1504 if (Count >= UB)
1505 return;
1506 if (Count != 0) {
1507 if (counterOutOfOrder(T))
1508 return;
1509 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1510 } else {
1511 setScoreLB(T, UB);
1512 PendingEvents.remove(Context->getWaitEvents(T));
1513 }
1514
1515 if (T == AMDGPU::KM_CNT && Count == 0 &&
1516 hasPendingEvent(HWEvent::SMEM_GROUP)) {
1517 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1518 applyWaitcnt(AMDGPU::X_CNT, 0);
1519 else
1520 PendingEvents.remove(HWEvent::SMEM_GROUP);
1521 }
1522 if (T == AMDGPU::LOAD_CNT && hasPendingEvent(HWEvent::VMEM_GROUP) &&
1523 !hasPendingEvent(AMDGPU::STORE_CNT)) {
1524 if (!hasMixedPendingEvents(AMDGPU::X_CNT))
1525 applyWaitcnt(AMDGPU::X_CNT, Count);
1526 else if (Count == 0)
1527 PendingEvents.remove(HWEvent::VMEM_GROUP);
1528 }
1529}
1530
1531void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait,
1533 unsigned Cnt = Wait.get(T);
1534 applyWaitcnt(T, Cnt);
1535}
1536
1537// Where there are multiple types of event in the bracket of a counter,
1538// the decrement may go out of order.
1539bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
1540 // Scalar memory read always can go out of order.
1541 if ((T == Context->SmemAccessCounter &&
1542 hasPendingEvent(HWEvent::SMEM_ACCESS)) ||
1543 (T == AMDGPU::X_CNT && hasPendingEvent(HWEvent::SMEM_GROUP)))
1544 return true;
1545
1546 // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
1547 // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
1548 // out-of-order completion.
1549 if (T == AMDGPU::LOAD_CNT) {
1550 HWEventSet Events = PendingEvents & Context->getWaitEvents(T);
1551 // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
1552 // events
1553 Events.remove(HWEvent::GLOBAL_INV_ACCESS);
1554 // Return true only if there are still multiple event types after removing
1555 // GLOBAL_INV
1556 return Events.twoOrMore();
1557 }
1558
1559 return hasMixedPendingEvents(T);
1560}
1561
1562INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1563 false, false)
1566INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1568
1569char SIInsertWaitcntsLegacy::ID = 0;
1570
1571char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1572
1574 return new SIInsertWaitcntsLegacy();
1575}
1576
1577static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1578 unsigned NewEnc) {
1579 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1580 assert(OpIdx >= 0);
1581
1582 MachineOperand &MO = MI.getOperand(OpIdx);
1583
1584 if (NewEnc == MO.getImm())
1585 return false;
1586
1587 MO.setImm(NewEnc);
1588 return true;
1589}
1590
1591bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1592 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1593 if (Opcode == Waitcnt->getOpcode())
1594 return false;
1595
1596 Waitcnt->setDesc(TII.get(Opcode));
1597 return true;
1598}
1599
1600/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1601/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1602/// from \p Wait that were added by previous passes. Currently this pass
1603/// conservatively assumes that these preexisting waits are required for
1604/// correctness.
1605bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1606 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1607 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1608 assert(isNormalMode(MaxCounter));
1609
1610 bool Modified = false;
1611 MachineInstr *WaitcntInstr = nullptr;
1612 MachineInstr *WaitcntVsCntInstr = nullptr;
1613
1614 LLVM_DEBUG({
1615 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1616 if (It.isEnd())
1617 dbgs() << "end of block\n";
1618 else
1619 dbgs() << *It;
1620 });
1621
1622 for (auto &II :
1623 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1624 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1625 if (isNonWaitcntMetaInst(II)) {
1626 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1627 continue;
1628 }
1629
1630 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1631 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1632
1633 // Update required wait count. If this is a soft waitcnt (= it was added
1634 // by an earlier pass), it may be entirely removed.
1635 if (Opcode == AMDGPU::S_WAITCNT) {
1636 unsigned IEnc = II.getOperand(0).getImm();
1637 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1638 if (TrySimplify)
1639 ScoreBrackets.simplifyWaitcnt(OldWait);
1640 Wait = Wait.combined(OldWait);
1641
1642 // Merge consecutive waitcnt of the same type by erasing multiples.
1643 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1644 II.eraseFromParent();
1645 Modified = true;
1646 } else
1647 WaitcntInstr = &II;
1648 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1649 assert(ST.hasVMemToLDSLoad());
1650 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1651 << "Before: " << Wait << '\n';);
1652 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, LDSDMA_BEGIN,
1653 Wait);
1654 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1655
1656 // It is possible (but unlikely) that this is the only wait instruction,
1657 // in which case, we exit this loop without a WaitcntInstr to consume
1658 // `Wait`. But that works because `Wait` was passed in by reference, and
1659 // the callee eventually calls createNewWaitcnt on it. We test this
1660 // possibility in an articial MIR test since such a situation cannot be
1661 // recreated by running the memory legalizer.
1662 II.eraseFromParent();
1663 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1664 unsigned N = II.getOperand(0).getImm();
1665 LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
1666 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1667 Wait = Wait.combined(OldWait);
1668 } else {
1669 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1670 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1671
1672 unsigned OldVSCnt =
1673 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1674 if (TrySimplify)
1675 ScoreBrackets.simplifyWaitcnt(AMDGPU::STORE_CNT, OldVSCnt);
1677 std::min(Wait.get(AMDGPU::STORE_CNT), OldVSCnt));
1678
1679 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1680 II.eraseFromParent();
1681 Modified = true;
1682 } else
1683 WaitcntVsCntInstr = &II;
1684 }
1685 }
1686
1687 if (WaitcntInstr) {
1688 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1690 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1691
1692 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::LOAD_CNT);
1693 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
1694 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
1695 Wait.set(AMDGPU::LOAD_CNT, ~0u);
1696 Wait.set(AMDGPU::EXP_CNT, ~0u);
1697 Wait.set(AMDGPU::DS_CNT, ~0u);
1698
1699 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
1700 << "New Instr at block end: "
1701 << *WaitcntInstr << '\n'
1702 : dbgs() << "applied pre-existing waitcnt\n"
1703 << "Old Instr: " << *It
1704 << "New Instr: " << *WaitcntInstr << '\n');
1705 }
1706
1707 if (WaitcntVsCntInstr) {
1708 Modified |=
1709 updateOperandIfDifferent(*WaitcntVsCntInstr, AMDGPU::OpName::simm16,
1710 Wait.get(AMDGPU::STORE_CNT));
1711 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1712
1713 ScoreBrackets.applyWaitcnt(AMDGPU::STORE_CNT, Wait.get(AMDGPU::STORE_CNT));
1714 Wait.set(AMDGPU::STORE_CNT, ~0u);
1715
1716 LLVM_DEBUG(It.isEnd()
1717 ? dbgs() << "applied pre-existing waitcnt\n"
1718 << "New Instr at block end: " << *WaitcntVsCntInstr
1719 << '\n'
1720 : dbgs() << "applied pre-existing waitcnt\n"
1721 << "Old Instr: " << *It
1722 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1723 }
1724
1725 return Modified;
1726}
1727
1728/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1729/// required counters in \p Wait
/// The new instructions are built in \p Block immediately before \p It.
/// \p ScoreBrackets is only consulted on the ExpandWaitcntProfiling paths
/// (out-of-order check and outstanding-event count).
/// \returns true if at least one instruction was emitted.
1730bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1731 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1732 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
1733 assert(isNormalMode(MaxCounter));
1734
1735 bool Modified = false;
1736 const DebugLoc &DL = Block.findDebugLoc(It);
1737
1738 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1739 // single instruction while VScnt has its own instruction.
1740 if (Wait.hasWaitExceptStoreCnt()) {
1741 // If profiling expansion is enabled, emit an expanded sequence
1742 if (ExpandWaitcntProfiling) {
1743 // Check if any of the counters to be waited on are out-of-order.
1744 // If so, fall back to normal (non-expanded) behavior since expansion
1745 // would provide misleading profiling information.
1746 bool AnyOutOfOrder = false;
1747 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
1748 unsigned WaitCnt = Wait.get(CT);
     // ~0u is the "no wait requested" value; such counters are ignored here.
1749 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
1750 AnyOutOfOrder = true;
1751 break;
1752 }
1753 }
1754
1755 if (AnyOutOfOrder) {
1756 // Fall back to non-expanded wait
1757 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1758 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
1759 Modified = true;
1760 } else {
1761 // All counters are in-order, safe to expand
1762 for (auto CT : {AMDGPU::LOAD_CNT, AMDGPU::DS_CNT, AMDGPU::EXP_CNT}) {
1763 unsigned WaitCnt = Wait.get(CT);
1764 if (WaitCnt == ~0u)
1765 continue;
1766
     // The starting point of the expansion is capped at one below the
     // counter's limit.
1767 unsigned Outstanding =
1768 std::min(ScoreBrackets.getOutstanding(CT), getLimit(CT) - 1);
1769 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
     // Each callback builds a wait that names only this one counter.
1770 AMDGPU::Waitcnt W;
1771 W.set(CT, Count);
     // NOTE(review): the BuildMI statement below is truncated in this
     // listing (its immediate operand and terminating ';' are missing);
     // presumably it adds the encoding of W -- confirm against upstream.
1772 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT))
1774 });
1775 Modified = true;
1776 }
1777 }
1778 } else {
1779 // Normal behavior: emit single combined waitcnt
1780 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1781 [[maybe_unused]] auto SWaitInst =
1782 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc);
1783 Modified = true;
1784
1785 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1786 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1787 dbgs() << "New Instr: " << *SWaitInst << '\n');
1788 }
1789 }
1790
1791 if (Wait.hasWaitStoreCnt()) {
1792 assert(ST.hasVscnt());
1793
1794 if (ExpandWaitcntProfiling && Wait.get(AMDGPU::STORE_CNT) != ~0u &&
1795 !ScoreBrackets.counterOutOfOrder(AMDGPU::STORE_CNT)) {
1796 // Only expand if counter is not out-of-order
1797 unsigned Outstanding =
1798 std::min(ScoreBrackets.getOutstanding(AMDGPU::STORE_CNT),
1799 getLimit(AMDGPU::STORE_CNT) - 1);
1800 EmitExpandedWaitcnt(
1801 Outstanding, Wait.get(AMDGPU::STORE_CNT), [&](unsigned Count) {
1802 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1803 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1804 .addImm(Count);
1805 });
1806 Modified = true;
1807 } else {
     // NOTE(review): the BuildMI statement below is truncated in this
     // listing (no immediate operand / ';' after addReg); presumably the
     // STORE_CNT value of Wait is added -- confirm against upstream.
1808 [[maybe_unused]] auto SWaitInst =
1809 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1810 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1812 Modified = true;
1813
1814 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1815 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1816 dbgs() << "New Instr: " << *SWaitInst << '\n');
1817 }
1818 }
1819
1820 return Modified;
1821}
1822
1823AMDGPU::Waitcnt
1824WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1825 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u);
1826}
1827
1828AMDGPU::Waitcnt
1829WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1830 unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
1831 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1832 ~0u /* XCNT */, ~0u /* ASYNC_CNT */,
1833 ~0u /* TENSOR_CNT */, ExpertVal, ExpertVal);
1834}
1835
1836/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1837/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1838/// were added by previous passes. Currently this pass conservatively
1839/// assumes that these preexisting waits are required for correctness.
/// Counters that end up encoded in a kept instruction are applied to
/// \p ScoreBrackets and reset in \p Wait, so that only the counters still
/// needing a new instruction remain set on return.
/// \returns true if any instruction was erased, rewritten or promoted.
1840bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1841 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1842 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1843 assert(!isNormalMode(MaxCounter));
1844
1845 bool Modified = false;
     // First instruction seen of each kind; later duplicates are erased.
1846 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1847 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1848 MachineInstr *WaitcntDepctrInstr = nullptr;
1849 MachineInstr *WaitInstrs[AMDGPU::NUM_EXTENDED_INST_CNTS] = {};
1850
1851 LLVM_DEBUG({
1852 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1853 if (It.isEnd())
1854 dbgs() << "end of block\n";
1855 else
1856 dbgs() << *It;
1857 });
1858
1859 // Accumulate waits that should not be simplified.
1860 AMDGPU::Waitcnt RequiredWait;
1861
     // make_early_inc_range lets the loop body erase II while iterating.
1862 for (auto &II :
1863 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1864 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1865 if (isNonWaitcntMetaInst(II)) {
1866 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1867 continue;
1868 }
1869
1870 // Update required wait count. If this is a soft waitcnt (= it was added
1871 // by an earlier pass), it may be entirely removed.
1872
     // A soft waitcnt is recognised by its opcode differing from the
     // non-soft form; simplification is disabled when OptNone is set.
1873 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1874 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1875
1876 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1877 // attempt to do more than that either.
1878 if (Opcode == AMDGPU::S_WAITCNT)
1879 continue;
1880
1881 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1882 unsigned OldEnc =
1883 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1884 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1885 if (TrySimplify)
1886 Wait = Wait.combined(OldWait);
1887 else
1888 RequiredWait = RequiredWait.combined(OldWait);
1889 // Keep the first wait_loadcnt, erase the rest.
1890 if (CombinedLoadDsCntInstr == nullptr) {
1891 CombinedLoadDsCntInstr = &II;
1892 } else {
1893 II.eraseFromParent();
1894 Modified = true;
1895 }
1896 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1897 unsigned OldEnc =
1898 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1899 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1900 if (TrySimplify)
1901 Wait = Wait.combined(OldWait);
1902 else
1903 RequiredWait = RequiredWait.combined(OldWait);
1904 // Keep the first wait_storecnt, erase the rest.
1905 if (CombinedStoreDsCntInstr == nullptr) {
1906 CombinedStoreDsCntInstr = &II;
1907 } else {
1908 II.eraseFromParent();
1909 Modified = true;
1910 }
1911 } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1912 unsigned OldEnc =
1913 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
     // NOTE(review): this listing appears to be missing the lines that
     // populate OldWait from OldEnc (presumably the VA_VDST / VM_VSRC
     // subfields) -- confirm against upstream.
1914 AMDGPU::Waitcnt OldWait;
1917 if (TrySimplify)
1918 ScoreBrackets.simplifyWaitcnt(OldWait);
1919 Wait = Wait.combined(OldWait);
1920 if (WaitcntDepctrInstr == nullptr) {
1921 WaitcntDepctrInstr = &II;
1922 } else {
1923 // S_WAITCNT_DEPCTR requires special care. Don't remove a
1924 // duplicate if it is waiting on things other than VA_VDST or
1925 // VM_VSRC. If that is the case, just make sure the VA_VDST and
1926 // VM_VSRC subfields of the operand are set to the "no wait"
1927 // values.
1928
1929 unsigned Enc =
1930 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1931 Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u);
1932 Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u);
1933
1934 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
1935 Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc);
1936 Modified |= promoteSoftWaitCnt(&II);
1937 } else {
1938 II.eraseFromParent();
1939 Modified = true;
1940 }
1941 }
1942 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1943 // Architectures higher than GFX10 do not have direct loads to
1944 // LDS, so no work required here yet.
1945 II.eraseFromParent();
1946 Modified = true;
1947 } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1948 // Update the Waitcnt, but don't erase the wait.asyncmark() itself. It
1949 // shows up in the assembly as a comment with the original parameter N.
1950 unsigned N = II.getOperand(0).getImm();
1951 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
1952 Wait = Wait.combined(OldWait);
1953 } else {
     // NOTE(review): the initializer of CT is missing in this listing
     // (presumably a lookup of the counter type for Opcode) -- confirm
     // against upstream.
1954 std::optional<AMDGPU::InstCounterType> CT =
1956 assert(CT.has_value());
1957 unsigned OldCnt =
1958 TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1959 if (TrySimplify)
1960 Wait.add(CT.value(), OldCnt);
1961 else
1962 RequiredWait.add(CT.value(), OldCnt);
1963 // Keep the first wait of its kind, erase the rest.
1964 if (WaitInstrs[CT.value()] == nullptr) {
1965 WaitInstrs[CT.value()] = &II;
1966 } else {
1967 II.eraseFromParent();
1968 Modified = true;
1969 }
1970 }
1971 }
1972
     // Simplify the optional part of the wait, then fold the waits that
     // must be kept back in.
1973 ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
1974 Wait = Wait.combined(RequiredWait);
1975
1976 if (CombinedLoadDsCntInstr) {
1977 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1978 // to be waited for. Otherwise, let the instruction be deleted so
1979 // the appropriate single counter wait instruction can be inserted
1980 // instead, when new S_WAIT_*CNT instructions are inserted by
1981 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1982 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1983 // the loop below that deals with single counter instructions.
1984 //
1985 // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since
1986 // instructions that have decremented LOAD_CNT or DS_CNT on completion
1987 // will have needed to wait for their register sources to be available
1988 // first.
1989 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
1990 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1991 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1992 AMDGPU::OpName::simm16, NewEnc);
1993 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1994 ScoreBrackets.applyWaitcnt(AMDGPU::LOAD_CNT, Wait.get(AMDGPU::LOAD_CNT));
1995 ScoreBrackets.applyWaitcnt(AMDGPU::DS_CNT, Wait.get(AMDGPU::DS_CNT));
1996 Wait.set(AMDGPU::LOAD_CNT, ~0u);
1997 Wait.set(AMDGPU::DS_CNT, ~0u);
1998
1999 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2000 << "New Instr at block end: "
2001 << *CombinedLoadDsCntInstr << '\n'
2002 : dbgs() << "applied pre-existing waitcnt\n"
2003 << "Old Instr: " << *It << "New Instr: "
2004 << *CombinedLoadDsCntInstr << '\n');
2005 } else {
2006 CombinedLoadDsCntInstr->eraseFromParent();
2007 Modified = true;
2008 }
2009 }
2010
2011 if (CombinedStoreDsCntInstr) {
2012 // Similarly for S_WAIT_STORECNT_DSCNT.
2013 if (Wait.get(AMDGPU::STORE_CNT) != ~0u && Wait.get(AMDGPU::DS_CNT) != ~0u) {
2014 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2015 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
2016 AMDGPU::OpName::simm16, NewEnc);
2017 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2018 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::STORE_CNT);
2019 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::DS_CNT);
2020 Wait.set(AMDGPU::STORE_CNT, ~0u);
2021 Wait.set(AMDGPU::DS_CNT, ~0u);
2022
2023 LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
2024 << "New Instr at block end: "
2025 << *CombinedStoreDsCntInstr << '\n'
2026 : dbgs() << "applied pre-existing waitcnt\n"
2027 << "Old Instr: " << *It << "New Instr: "
2028 << *CombinedStoreDsCntInstr << '\n');
2029 } else {
2030 CombinedStoreDsCntInstr->eraseFromParent();
2031 Modified = true;
2032 }
2033 }
2034
2035 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
2036 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
2037 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
2038 // instructions so that createNewWaitcnt() will create new combined
2039 // instructions to replace them.
2040
2041 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2042 // This is a vector of addresses in WaitInstrs pointing to instructions
2043 // that should be removed if they are present.
     // NOTE(review): the declaration of WaitsToErase is missing in this
     // listing; from its use below it holds MachineInstr** elements --
     // confirm against upstream.
2045
2046 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
2047 // both) need to be waited for, ensure that there are no existing
2048 // individual wait count instructions for these.
2049
2050 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2051 WaitsToErase.push_back(&WaitInstrs[AMDGPU::LOAD_CNT]);
2052 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2053 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2054 WaitsToErase.push_back(&WaitInstrs[AMDGPU::STORE_CNT]);
2055 WaitsToErase.push_back(&WaitInstrs[AMDGPU::DS_CNT]);
2056 }
2057
2058 for (MachineInstr **WI : WaitsToErase) {
2059 if (!*WI)
2060 continue;
2061
     // Null the WaitInstrs slot too so the per-counter loop below does
     // not touch the erased instruction.
2062 (*WI)->eraseFromParent();
2063 *WI = nullptr;
2064 Modified = true;
2065 }
2066 }
2067
     // NOTE(review): the header of the per-counter loop (the one that
     // introduces CT and is closed at listing line 2092) is missing in
     // this listing -- confirm against upstream.
2069 if (!WaitInstrs[CT])
2070 continue;
2071
2072 unsigned NewCnt = Wait.get(CT);
2073 if (NewCnt != ~0u) {
2074 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
2075 AMDGPU::OpName::simm16, NewCnt);
2076 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2077
2078 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2079 Wait.clear(CT);
2080
2081 LLVM_DEBUG(It.isEnd()
2082 ? dbgs() << "applied pre-existing waitcnt\n"
2083 << "New Instr at block end: " << *WaitInstrs[CT]
2084 << '\n'
2085 : dbgs() << "applied pre-existing waitcnt\n"
2086 << "Old Instr: " << *It
2087 << "New Instr: " << *WaitInstrs[CT] << '\n');
2088 } else {
2089 WaitInstrs[CT]->eraseFromParent();
2090 Modified = true;
2091 }
2092 }
2093
2094 if (WaitcntDepctrInstr) {
2095 // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC
2096 // subfields with the new required values.
2097 unsigned Enc =
2098 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2099 ->getImm();
     // NOTE(review): the lines that write the new VA_VDST / VM_VSRC values
     // into Enc (as described by the comment above) are missing in this
     // listing -- confirm against upstream.
2102
2103 ScoreBrackets.applyWaitcnt(AMDGPU::VA_VDST, Wait.get(AMDGPU::VA_VDST));
2104 ScoreBrackets.applyWaitcnt(AMDGPU::VM_VSRC, Wait.get(AMDGPU::VM_VSRC));
2105 Wait.set(AMDGPU::VA_VDST, ~0u);
2106 Wait.set(AMDGPU::VM_VSRC, ~0u);
2107
2108 // If that new encoded Depctr immediate would actually still wait
2109 // for anything, update the instruction's operand. Otherwise it can
2110 // just be deleted.
2111 if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) {
2112 Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr,
2113 AMDGPU::OpName::simm16, Enc);
2114 LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n"
2115 << "New Instr at block end: "
2116 << *WaitcntDepctrInstr << '\n'
2117 : dbgs() << "applyPreexistingWaitcnt\n"
2118 << "Old Instr: " << *It << "New Instr: "
2119 << *WaitcntDepctrInstr << '\n');
2120 } else {
2121 WaitcntDepctrInstr->eraseFromParent();
2122 Modified = true;
2123 }
2124 }
2125
2126 return Modified;
2127}
2128
2129/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
/// The new instructions are built in \p Block immediately before \p It.
/// \p ScoreBrackets is only consulted on the ExpandWaitcntProfiling path.
/// \returns true if at least one instruction was emitted.
2130bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2131 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
2132 AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) {
2133 assert(!isNormalMode(MaxCounter));
2134
2135 bool Modified = false;
2136 const DebugLoc &DL = Block.findDebugLoc(It);
2137
2138 // For GFX12+, we use separate wait instructions, which makes expansion
2139 // simpler
2140 if (ExpandWaitcntProfiling) {
     // NOTE(review): the header of the loop over counter types (the one
     // that introduces CT and is closed at listing line 2161) is missing
     // in this listing -- confirm against upstream.
2142 unsigned Count = Wait.get(CT);
2143 if (Count == ~0u)
2144 continue;
2145
2146 // Skip expansion for out-of-order counters - emit normal wait instead
2147 if (ScoreBrackets.counterOutOfOrder(CT)) {
2148 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2149 .addImm(Count);
2150 Modified = true;
2151 continue;
2152 }
2153
     // The starting point of the expansion is capped at one below the
     // counter's limit.
2154 unsigned Outstanding =
2155 std::min(ScoreBrackets.getOutstanding(CT), getLimit(CT) - 1);
2156 EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
2157 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2158 .addImm(Val);
2159 });
2160 Modified = true;
2161 }
     // The expansion path returns here, so the combined-instruction and
     // S_WAITCNT_DEPCTR code below is not reached when expanding.
2162 return Modified;
2163 }
2164
2165 // Normal behavior (no expansion)
2166 // Check for opportunities to use combined wait instructions.
2167 if (Wait.get(AMDGPU::DS_CNT) != ~0u) {
2168 MachineInstr *SWaitInst = nullptr;
2169
     // LOAD_CNT is checked first; if both LOAD_CNT and STORE_CNT are
     // pending, STORE_CNT is left for the per-counter loop below.
2170 if (Wait.get(AMDGPU::LOAD_CNT) != ~0u) {
2171 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
2172
2173 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2174 .addImm(Enc);
2175
2176 Wait.set(AMDGPU::LOAD_CNT, ~0u);
2177 Wait.set(AMDGPU::DS_CNT, ~0u);
2178 } else if (Wait.get(AMDGPU::STORE_CNT) != ~0u) {
2179 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
2180
2181 SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_STORECNT_DSCNT))
2182 .addImm(Enc);
2183
2184 Wait.set(AMDGPU::STORE_CNT, ~0u);
2185 Wait.set(AMDGPU::DS_CNT, ~0u);
2186 }
2187
2188 if (SWaitInst) {
2189 Modified = true;
2190
2191 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2192 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2193 dbgs() << "New Instr: " << *SWaitInst << '\n');
2194 }
2195 }
2196
2197 // Generate an instruction for any remaining counter that needs
2198 // waiting for.
2199
     // NOTE(review): the header of the loop over counter types (the one
     // that introduces CT and is closed at listing line 2214) is missing
     // in this listing -- confirm against upstream.
2201 unsigned Count = Wait.get(CT);
2202 if (Count == ~0u)
2203 continue;
2204
2205 [[maybe_unused]] auto SWaitInst =
2206 BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT]))
2207 .addImm(Count);
2208
2209 Modified = true;
2210
2211 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
2212 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2213 dbgs() << "New Instr: " << *SWaitInst << '\n');
2214 }
2215
2216 if (Wait.hasWaitDepctr()) {
2217 assert(IsExpertMode);
     // NOTE(review): the initializer of Enc is missing in this listing
     // (presumably the depctr encoding of the VA_VDST / VM_VSRC waits) --
     // confirm against upstream.
2218 unsigned Enc =
2221
2222 [[maybe_unused]] auto SWaitInst =
2223 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc);
2224
2225 Modified = true;
2226
2227 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
2228 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
2229 dbgs() << "New Instr: " << *SWaitInst << '\n');
2230 }
2231
2232 return Modified;
2233}
2234
2235/// Generate s_waitcnt instruction to be placed before cur_Inst.
2236/// Instructions of a given type are returned in order,
2237/// but instructions of different types can complete out of order.
2238/// We rely on this in-order completion
2239/// and simply assign a score to the memory access instructions.
2240/// We keep track of the active "score bracket" to determine
2241/// if an access of a memory read requires an s_waitcnt
2242/// and if so what the value of each counter is.
2243/// The "score bracket" is bound by the lower bound and upper bound
2244/// scores (*_score_LB and *_score_ub respectively).
2245/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
2246/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
2247/// (GFX12+ only, where DS_CNT is a separate counter).
/// \p MI is the instruction the wait is computed for; \p OldWaitcntInstr,
/// if non-null, is forwarded to generateWaitcnt() so that pre-existing waits
/// can be merged.
/// \returns the result of generateWaitcnt() for the computed wait.
2248bool SIInsertWaitcnts::generateWaitcntInstBefore(
2249 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
2250 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2251 LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
2252
2253 assert(!isNonWaitcntMetaInst(MI));
2254
     // Starts as the default-constructed wait and is tightened below.
2255 AMDGPU::Waitcnt Wait;
2256 const unsigned Opc = MI.getOpcode();
2257
2258 switch (Opc) {
2259 case AMDGPU::BUFFER_WBINVL1:
2260 case AMDGPU::BUFFER_WBINVL1_SC:
2261 case AMDGPU::BUFFER_WBINVL1_VOL:
2262 case AMDGPU::BUFFER_GL0_INV:
2263 case AMDGPU::BUFFER_GL1_INV: {
2264 // FIXME: This should have already been handled by the memory legalizer.
2265 // Removing this currently doesn't affect any lit tests, but we need to
2266 // verify that nothing was relying on this. The number of buffer invalidates
2267 // being handled here should not be expanded.
2268 Wait.set(AMDGPU::LOAD_CNT, 0);
2269 break;
2270 }
2271 case AMDGPU::SI_RETURN_TO_EPILOG:
2272 case AMDGPU::SI_RETURN:
2273 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2274 case AMDGPU::S_SETPC_B64_return: {
2275 // All waits must be resolved at call return.
2276 // NOTE: this could be improved with knowledge of all call sites or
2277 // with knowledge of the called routines.
2278 ReturnInsts.insert(&MI);
2279 AMDGPU::Waitcnt AllZeroWait =
2280 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2281 // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
2282 // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
2283 // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
2284 // no need to wait for it at function boundaries.
2285 if (ST.hasExtendedWaitCounts() &&
2286 !ScoreBrackets.hasPendingEvent(HWEvent::VMEM_ACCESS))
2287 AllZeroWait.set(AMDGPU::LOAD_CNT, ~0u);
2288 Wait = AllZeroWait;
2289 break;
2290 }
2291 case AMDGPU::S_ENDPGM:
2292 case AMDGPU::S_ENDPGM_SAVED: {
2293 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
2294 // Technically the hardware will do this on its own if we don't, but that
2295 // might cost extra cycles compared to doing it explicitly.
2296 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
2297 // have to wait for outstanding VMEM stores. In this case it can be useful
2298 // to send a message to explicitly release all VGPRs before the stores have
2299 // completed, but it is only safe to do this if there are no outstanding
2300 // scratch stores.
2301 EndPgmInsts[&MI] =
2302 !ScoreBrackets.empty(AMDGPU::STORE_CNT) &&
2303 !ScoreBrackets.hasPendingEvent(HWEvent::SCRATCH_WRITE_ACCESS);
2304 break;
2305 }
2306 case AMDGPU::S_SENDMSG:
2307 case AMDGPU::S_SENDMSGHALT: {
     // NOTE(review): the right-hand side of the message-ID comparison is
     // missing in this listing (per the comment below, presumably the
     // gs-done message ID) -- confirm against upstream.
2308 if (ST.hasLegacyGeometry() &&
2309 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
2311 // Resolve vm waits before gs-done.
2312 Wait.set(AMDGPU::LOAD_CNT, 0);
2313 break;
2314 }
     // Any other message is handled like an ordinary instruction.
2315 [[fallthrough]];
2316 }
2317 default: {
2318
2319 // Export & GDS instructions do not read the EXEC mask until after the
2320 // export is granted (which can occur well after the instruction is issued).
2321 // The shader program must flush all EXP operations on the export-count
2322 // before overwriting the EXEC mask.
2323 if (MI.modifiesRegister(AMDGPU::EXEC, &TRI)) {
2324 // Export and GDS are tracked individually, either may trigger a waitcnt
2325 // for EXEC.
2326 if (ScoreBrackets.hasPendingEvent(HWEvent::EXP_GPR_LOCK) ||
2327 ScoreBrackets.hasPendingEvent(HWEvent::EXP_PARAM_ACCESS) ||
2328 ScoreBrackets.hasPendingEvent(HWEvent::EXP_POS_ACCESS) ||
2329 ScoreBrackets.hasPendingEvent(HWEvent::GDS_GPR_LOCK)) {
2330 Wait.set(AMDGPU::EXP_CNT, 0);
2331 }
2332 }
2333
2334 // Wait for any pending GDS instruction to complete before any
2335 // "Always GDS" instruction.
2336 if (TII.isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
2337 Wait.add(AMDGPU::DS_CNT, ScoreBrackets.getPendingGDSWait());
2338
2339 if (MI.isCall()) {
2340 // The function is going to insert a wait on everything in its prolog.
2341 // This still needs to be careful if the call target is a load (e.g. a GOT
2342 // load). We also need to check WAW dependency with saved PC.
2343 CallInsts.insert(&MI);
     // Note that this replaces any EXP_CNT / DS_CNT requirement computed
     // just above with a default-constructed wait.
2344 Wait = AMDGPU::Waitcnt();
2345
2346 const MachineOperand &CallAddrOp = TII.getCalleeOperand(MI);
2347 if (CallAddrOp.isReg()) {
2348 ScoreBrackets.determineWaitForPhysReg(
2349 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait, MI);
2350
2351 if (const auto *RtnAddrOp =
2352 TII.getNamedOperand(MI, AMDGPU::OpName::dst)) {
2353 ScoreBrackets.determineWaitForPhysReg(
2354 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait, MI);
2355 }
2356 }
2357 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2358 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2359 } else {
2360 // FIXME: Should not be relying on memoperands.
2361 // Look at the source operands of every instruction to see if
2362 // any of them results from a previous memory operation that affects
2363 // its current usage. If so, an s_waitcnt instruction needs to be
2364 // emitted.
2365 // If the source operand was defined by a load, add the s_waitcnt
2366 // instruction.
2367 //
2368 // Two cases are handled for destination operands:
2369 // 1) If the destination operand was defined by a load, add the s_waitcnt
2370 // instruction to guarantee the right WAW order.
2371 // 2) If a destination operand that was used by a recent export/store ins,
2372 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2373
2374 for (const MachineMemOperand *Memop : MI.memoperands()) {
2375 const Value *Ptr = Memop->getValue();
2376 if (Memop->isStore()) {
     // A store to an address recorded in SLoadAddresses forces a full
     // wait on the SMEM counter; the entry is dropped only when PDT
     // reports dominance over the recorded block.
2377 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2378 Wait.add(SmemAccessCounter, 0);
2379 if (PDT.dominates(MI.getParent(), It->second))
2380 SLoadAddresses.erase(It);
2381 }
2382 }
2383 unsigned AS = Memop->getAddrSpace();
     // NOTE(review): the condition guarding the 'continue' below is missing
     // in this listing (presumably an address-space check on AS) -- confirm
     // against upstream.
2385 continue;
2386 // No need to wait before load from VMEM to LDS.
2387 if (TII.mayWriteLDSThroughDMA(MI))
2388 continue;
2389
2390 // LOAD_CNT is only relevant to vgpr or LDS.
2391 unsigned TID = LDSDMA_BEGIN;
2392 if (Ptr && Memop->getAAInfo()) {
2393 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2394 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2395 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2396 if ((I + 1) >= NUM_LDSDMA) {
2397 // We didn't have enough slot to track this LDS DMA store, it
2398 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2399 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID,
2400 Wait);
2401 break;
2402 }
2403
2404 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT,
2405 TID + I + 1, Wait);
2406 }
2407 }
2408 } else {
2409 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::LOAD_CNT, TID, Wait);
2410 }
2411 if (Memop->isStore()) {
2412 ScoreBrackets.determineWaitForLDSDMA(AMDGPU::EXP_CNT, TID, Wait);
2413 }
2414 }
2415
2416 // Loop over use and def operands.
2417 for (const MachineOperand &Op : MI.operands()) {
2418 if (!Op.isReg())
2419 continue;
2420
2421 // If the instruction does not read tied source, skip the operand.
2422 if (Op.isTied() && Op.isUse() && TII.doesNotReadTiedSource(MI))
2423 continue;
2424
2425 MCPhysReg Reg = Op.getReg().asMCReg();
2426
2427 const bool IsVGPR = TRI.isVectorRegister(MRI, Op.getReg());
2428 if (IsVGPR) {
2429 // Implicit VGPR defs and uses are never a part of the memory
2430 // instructions description and usually present to account for
2431 // super-register liveness.
2432 // TODO: Most of the other instructions also have implicit uses
2433 // for the liveness accounting only.
2434 if (Op.isImplicit() && MI.mayLoadOrStore())
2435 continue;
2436
2437 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VA_VDST, Reg, Wait, MI);
2438 if (Op.isDef())
2439 ScoreBrackets.determineWaitForPhysReg(AMDGPU::VM_VSRC, Reg, Wait,
2440 MI);
2441 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2442 // previous write and this write are the same type of VMEM
2443 // instruction, in which case they are (in some architectures)
2444 // guaranteed to write their results in order anyway.
2445 // Additionally check instructions where Point Sample Acceleration
2446 // might be applied.
2447 if (Op.isUse() || !updateVMCntOnly(MI) ||
2448 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2449 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2450 !ST.hasVmemWriteVgprInOrder()) {
2451 ScoreBrackets.determineWaitForPhysReg(AMDGPU::LOAD_CNT, Reg, Wait,
2452 MI);
2453 ScoreBrackets.determineWaitForPhysReg(AMDGPU::SAMPLE_CNT, Reg, Wait,
2454 MI);
2455 ScoreBrackets.determineWaitForPhysReg(AMDGPU::BVH_CNT, Reg, Wait,
2456 MI);
2457 ScoreBrackets.clearVgprVmemTypes(Reg);
2458 }
2459
2460 if (Op.isDef() ||
2461 ScoreBrackets.hasPendingEvent(HWEvent::EXP_LDS_ACCESS)) {
2462 ScoreBrackets.determineWaitForPhysReg(AMDGPU::EXP_CNT, Reg, Wait,
2463 MI);
2464 }
2465 ScoreBrackets.determineWaitForPhysReg(AMDGPU::DS_CNT, Reg, Wait, MI);
2466 } else if (Op.getReg() == AMDGPU::SCC) {
2467 ScoreBrackets.determineWaitForPhysReg(AMDGPU::KM_CNT, Reg, Wait, MI);
2468 } else {
2469 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait,
2470 MI);
2471 }
2472
2473 if (ST.hasWaitXcnt() && Op.isDef())
2474 ScoreBrackets.determineWaitForPhysReg(AMDGPU::X_CNT, Reg, Wait, MI);
2475 }
2476 }
2477 }
2478 }
2479
2480 // Ensure safety against exceptions from outstanding memory operations while
2481 // waiting for a barrier:
2482 //
2483 // * Some subtargets safely handle backing off the barrier in hardware
2484 // when an exception occurs.
2485 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2486 // there can be no outstanding memory operations during the wait.
2487 // * Subtargets with split barriers don't need to back off the barrier; it
2488 // is up to the trap handler to preserve the user barrier state correctly.
2489 //
2490 // In all other cases, ensure safety by ensuring that there are no outstanding
2491 // memory operations.
2492 if (Opc == AMDGPU::S_BARRIER && !ST.hasAutoWaitcntBeforeBarrier() &&
2493 !ST.hasBackOffBarrier()) {
2494 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2495 }
2496
2497 // TODO: Remove this work-around, enable the assert for Bug 457939
2498 // after fixing the scheduler. Also, the Shader Compiler code is
2499 // independent of target.
2500 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST.hasReadVCCZBug() &&
2501 ScoreBrackets.hasPendingEvent(HWEvent::SMEM_ACCESS)) {
2502 Wait.set(AMDGPU::DS_CNT, 0);
2503 }
2504
2505 // Verify that the wait is actually needed.
2506 ScoreBrackets.simplifyWaitcnt(Wait);
2507
2508 // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that
2509 // waits on VA_VDST if the instruction it would precede is not a VALU
2510 // instruction, since hardware handles VALU->VGPR->VALU hazards in
2511 // expert scheduling mode.
2512 if (TII.isVALU(MI, /*AllowLDSDMA=*/true) && !SIInstrInfo::isLDSDMA(MI))
2513 Wait.set(AMDGPU::VA_VDST, ~0u);
2514
2515 // Since the translation for VMEM addresses occur in-order, we can apply the
2516 // XCnt if the current instruction is of VMEM type and has a memory
2517 // dependency with another VMEM instruction in flight.
2518 if (Wait.get(AMDGPU::X_CNT) != ~0u && isVmemAccess(MI)) {
2519 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::X_CNT);
2520 Wait.set(AMDGPU::X_CNT, ~0u);
2521 }
2522
     // Everything from here on is applied after simplifyWaitcnt() above, so
     // these forced waits are not simplified in this function.
2523 // When forcing emit, we need to skip terminators because that would break the
2524 // terminators of the MBB if we emit a waitcnt between terminators.
2525 if (ForceEmitZeroFlag && !MI.isTerminator())
2526 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2527
2528 // If we force waitcnt then update Wait accordingly.
     // NOTE(review): the header of the loop over counter types (the one
     // that introduces T and is closed at listing line 2533) is missing in
     // this listing -- confirm against upstream.
2530 if (!ForceEmitWaitcnt[T])
2531 continue;
2532 Wait.set(T, 0);
2533 }
2534
2535 if (FlushFlags.FlushVmCnt) {
     // NOTE(review): the statement(s) that introduce T before the set()
     // below are missing in this listing (presumably a loop over the
     // VMEM-related counters) -- confirm against upstream.
2538 Wait.set(T, 0);
2539 }
2540
2541 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
2542 Wait.set(AMDGPU::DS_CNT, 0);
2543
     // Only tightens a LOAD_CNT wait that is already requested; it never
     // introduces one.
2544 if (ForceEmitZeroLoadFlag && Wait.get(AMDGPU::LOAD_CNT) != ~0u)
2545 Wait.set(AMDGPU::LOAD_CNT, 0);
2546
2547 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2548 OldWaitcntInstr);
2549}
2550
// Materialise \p Wait at position It in \p Block: first merge it into the
// pre-existing wait instructions starting at \p OldWaitcntInstr (if any),
// then try to fold an EXP_CNT wait into the instruction at It, and finally
// create new wait instructions for whatever is left. Returns true if the
// block was changed.
// NOTE(review): the parameter line declaring 'It' is missing in this listing
// (the call in insertForcedWaitAfter passes an instruction iterator as the
// second argument) -- confirm against upstream.
2551bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2553 MachineBasicBlock &Block,
2554 WaitcntBrackets &ScoreBrackets,
2555 MachineInstr *OldWaitcntInstr) {
2556 bool Modified = false;
2557
2558 if (OldWaitcntInstr)
2559 // Try to merge the required wait with preexisting waitcnt instructions.
2560 // Also erase redundant waitcnt.
2561 Modified =
2562 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2563
2564 // ExpCnt can be merged into VINTERP.
     // NOTE(review): the last operand of the condition below is missing in
     // this listing (presumably a check that *It is a VINTERP instruction,
     // since its 'waitexp' operand is dereferenced unconditionally) --
     // confirm against upstream.
2565 if (Wait.get(AMDGPU::EXP_CNT) != ~0u && It != Block.instr_end() &&
2567 MachineOperand *WaitExp = TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
     // Only ever tighten the instruction's existing waitexp value.
2568 if (Wait.get(AMDGPU::EXP_CNT) < WaitExp->getImm()) {
2569 WaitExp->setImm(Wait.get(AMDGPU::EXP_CNT));
2570 Modified = true;
2571 }
2572 // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts.
2573 ScoreBrackets.applyWaitcnt(Wait, AMDGPU::EXP_CNT);
2574 Wait.set(AMDGPU::EXP_CNT, ~0u);
2575
2576 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2577 << "Update Instr: " << *It);
2578 }
2579
2580 if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets))
2581 Modified = true;
2582
2583 // Any counts that could have been applied to any existing waitcnt
2584 // instructions will have been done so, now deal with any remaining.
2585 ScoreBrackets.applyWaitcnt(Wait);
2586
2587 return Modified;
2588}
2589
2590bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2591 return (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI)) ||
2592 (TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2593}
2594
2595// Return true if the next instruction is S_ENDPGM, following fallthrough
2596// blocks if necessary.
2597bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2598 MachineBasicBlock *Block) const {
2599 auto BlockEnd = Block->getParent()->end();
2600 auto BlockIter = Block->getIterator();
2601
2602 while (true) {
2603 if (It.isEnd()) {
2604 if (++BlockIter != BlockEnd) {
2605 It = BlockIter->instr_begin();
2606 continue;
2607 }
2608
2609 return false;
2610 }
2611
2612 if (!It->isMetaInstruction())
2613 break;
2614
2615 It++;
2616 }
2617
2618 assert(!It.isEnd());
2619
2620 return It->getOpcode() == AMDGPU::S_ENDPGM;
2621}
2622
2623// Add a wait after an instruction if architecture requirements mandate one.
2624bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2625 MachineBasicBlock &Block,
2626 WaitcntBrackets &ScoreBrackets) {
2627 AMDGPU::Waitcnt Wait;
2628 bool NeedsEndPGMCheck = false;
2629
2630 if (ST.isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2631 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2633
2634 if (TII.isAlwaysGDS(Inst.getOpcode())) {
2635 Wait.set(AMDGPU::DS_CNT, 0);
2636 NeedsEndPGMCheck = true;
2637 }
2638
2639 ScoreBrackets.simplifyWaitcnt(Wait);
2640
2641 auto SuccessorIt = std::next(Inst.getIterator());
2642 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2643 /*OldWaitcntInstr=*/nullptr);
2644
2645 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2646 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII.get(AMDGPU::S_NOP))
2647 .addImm(0);
2648 }
2649
2650 return Result;
2651}
2652
2653void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2654 WaitcntBrackets *ScoreBrackets) {
2655
2656 HWEventSet InstEvents = AMDGPU::getEventsFor(Inst, ST, IsExpertMode);
2657 for (HWEvent E : AMDGPU::hw_events()) {
2658 if (InstEvents.contains(E))
2659 ScoreBrackets->updateByEvent(E, Inst);
2660 }
2661
2662 if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
2663 if (TII.isAlwaysGDS(Inst.getOpcode()) ||
2664 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2665 ScoreBrackets->setPendingGDS();
2666 }
2667 } else if (TII.isFLAT(Inst)) {
2668 if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(Inst) &&
2669 TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst)) {
2670 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2671 // pointers. They do have two operands that each access global and LDS,
2672 // thus making it appear at this point that they are using a flat pointer.
2673 // Filter them out, and for the rest, generate a dependency on flat
2674 // pointers so that both VM and LGKM counters are flushed.
2675 ScoreBrackets->setPendingFlat();
2676 }
2677 if (SIInstrInfo::usesASYNC_CNT(Inst)) {
2678 ScoreBrackets->updateByEvent(HWEvent::ASYNC_ACCESS, Inst);
2679 }
2680 } else if (SIInstrInfo::usesTENSOR_CNT(Inst)) {
2681 ScoreBrackets->updateByEvent(HWEvent::TENSOR_ACCESS, Inst);
2682 } else if (Inst.isCall()) {
2683 // Act as a wait on everything, but AsyncCnt and TensorCnt are never
2684 // included in such blanket waits.
2685 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2686 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2687 } else if (TII.isVINTERP(Inst)) {
2688 int64_t Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2689 ScoreBrackets->applyWaitcnt(AMDGPU::EXP_CNT, Imm);
2690 }
2691
2692 // Set XCNT to zero in the bracket for instructions that implicitly drain
2693 // XCNT.
2694 if (ST.hasWaitXcnt() && SIInstrInfo::isXcntDrain(Inst))
2695 ScoreBrackets->applyWaitcnt(AMDGPU::X_CNT, 0);
2696}
2697
2698bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2699 unsigned OtherScore) {
2700 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2701 unsigned OtherShifted =
2702 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2703 Score = std::max(MyShifted, OtherShifted);
2704 return OtherShifted > MyShifted;
2705}
2706
2707bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
2708 ArrayRef<CounterValueArray> OtherMarks) {
2709 bool StrictDom = false;
2710
2711 LLVM_DEBUG(dbgs() << "Merging async marks ...");
2712 // Early exit: nothing to merge when both sides are empty.
2713 if (AsyncMarks.empty() && OtherMarks.empty()) {
2714 LLVM_DEBUG(dbgs() << " nothing to merge\n");
2715 return false;
2716 }
2717 LLVM_DEBUG(dbgs() << '\n');
2718
2719 // Determine maximum length needed after merging
2720 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
2721 MaxSize = std::min(MaxSize, MaxAsyncMarks);
2722
2723 // Keep only the most recent marks within our limit.
2724 if (AsyncMarks.size() > MaxSize)
2725 AsyncMarks.erase(AsyncMarks.begin(),
2726 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
2727
2728 // Pad with zero-filled marks if our list is shorter. Zero represents "no
2729 // pending async operations at this checkpoint" and acts as the identity
2730 // element for max() during merging. We pad at the beginning since the marks
2731 // need to be aligned in most-recent order.
2732 constexpr CounterValueArray ZeroMark{};
2733 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
2734
2735 LLVM_DEBUG({
2736 dbgs() << "Before merge:\n";
2737 for (const auto &Mark : AsyncMarks) {
2738 llvm::interleaveComma(Mark, dbgs());
2739 dbgs() << '\n';
2740 }
2741 dbgs() << "Other marks:\n";
2742 for (const auto &Mark : OtherMarks) {
2743 llvm::interleaveComma(Mark, dbgs());
2744 dbgs() << '\n';
2745 }
2746 });
2747
2748 // Merge element-wise using the existing mergeScore function and the
2749 // appropriate MergeInfo for each counter type. Iterate only while we have
2750 // elements in both vectors.
2751 unsigned OtherSize = OtherMarks.size();
2752 unsigned OurSize = AsyncMarks.size();
2753 unsigned MergeCount = std::min(OtherSize, OurSize);
2754 // OtherMarks is empty -> OtherSize == 0 -> MergeCount == 0.
2755 // Our existing marks are the conservative result; return early to avoid
2756 // passing MergeCount == 0 to seq_inclusive which asserts Begin <= End.
2757 if (MergeCount == 0)
2758 return StrictDom;
2759 for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
2760 for (auto T : inst_counter_types(Context->MaxCounter)) {
2761 StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
2762 OtherMarks[OtherSize - Idx][T]);
2763 }
2764 }
2765
2766 LLVM_DEBUG({
2767 dbgs() << "After merge:\n";
2768 for (const auto &Mark : AsyncMarks) {
2769 llvm::interleaveComma(Mark, dbgs());
2770 dbgs() << '\n';
2771 }
2772 });
2773
2774 return StrictDom;
2775}
2776
2777/// Merge the pending events and associater score brackets of \p Other into
2778/// this brackets status.
2779///
2780/// Returns whether the merge resulted in a change that requires tighter waits
2781/// (i.e. the merged brackets strictly dominate the original brackets).
2782bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2783 bool StrictDom = false;
2784
2785 // Check if "other" has keys we don't have, and create default entries for
2786 // those. If they remain empty after merging, we will clean it up after.
2787 for (auto K : Other.VMem.keys())
2788 VMem.try_emplace(K);
2789 for (auto K : Other.SGPRs.keys())
2790 SGPRs.try_emplace(K);
2791
2792 // Array to store MergeInfo for each counter type
2793 MergeInfo MergeInfos[AMDGPU::NUM_INST_CNTS];
2794
2795 for (auto T : inst_counter_types(Context->MaxCounter)) {
2796 // Merge event flags for this counter
2797 const HWEventSet &EventsForT = Context->getWaitEvents(T);
2798 const HWEventSet OldEvents = PendingEvents & EventsForT;
2799 const HWEventSet OtherEvents = Other.PendingEvents & EventsForT;
2800 if (!OldEvents.contains(OtherEvents))
2801 StrictDom = true;
2802 PendingEvents |= OtherEvents;
2803
2804 // Merge scores for this counter
2805 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2806 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2807 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2808 if (NewUB < ScoreLBs[T])
2809 report_fatal_error("waitcnt score overflow");
2810
2811 MergeInfo &M = MergeInfos[T];
2812 M.OldLB = ScoreLBs[T];
2813 M.OtherLB = Other.ScoreLBs[T];
2814 M.MyShift = NewUB - ScoreUBs[T];
2815 M.OtherShift = NewUB - Other.ScoreUBs[T];
2816
2817 ScoreUBs[T] = NewUB;
2818
2819 if (T == AMDGPU::LOAD_CNT)
2820 StrictDom |= mergeScore(M, LastFlatLoadCnt, Other.LastFlatLoadCnt);
2821
2822 if (T == AMDGPU::DS_CNT) {
2823 StrictDom |= mergeScore(M, LastFlatDsCnt, Other.LastFlatDsCnt);
2824 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2825 }
2826
2827 if (T == AMDGPU::KM_CNT) {
2828 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
2829 if (Other.hasPendingEvent(HWEvent::SCC_WRITE)) {
2830 if (!OldEvents.contains(HWEvent::SCC_WRITE)) {
2831 PendingSCCWrite = Other.PendingSCCWrite;
2832 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
2833 PendingSCCWrite = nullptr;
2834 }
2835 }
2836 }
2837
2838 for (auto &[RegID, Info] : VMem)
2839 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
2840
2841 if (isSmemCounter(T)) {
2842 for (auto &[RegID, Info] : SGPRs) {
2843 auto It = Other.SGPRs.find(RegID);
2844 unsigned OtherScore = (It != Other.SGPRs.end()) ? It->second.get(T) : 0;
2845 StrictDom |= mergeScore(M, Info.get(T), OtherScore);
2846 }
2847 }
2848 }
2849
2850 for (auto &[TID, Info] : VMem) {
2851 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
2852 unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
2853 StrictDom |= NewVmemTypes != Info.VMEMTypes;
2854 Info.VMEMTypes = NewVmemTypes;
2855 }
2856 }
2857
2858 StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks);
2859 for (auto T : inst_counter_types(Context->MaxCounter))
2860 StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]);
2861
2862 purgeEmptyTrackingData();
2863 return StrictDom;
2864}
2865
2866static bool isWaitInstr(MachineInstr &Inst) {
2867 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2868 return Opcode == AMDGPU::S_WAITCNT ||
2869 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2870 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2871 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2872 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2873 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2874 Opcode == AMDGPU::WAIT_ASYNCMARK ||
2875 AMDGPU::counterTypeForInstr(Opcode).has_value();
2876}
2877
2878void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
2880 bool ExpertMode) const {
2881 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
2883 BuildMI(MBB, I, DebugLoc(), TII.get(AMDGPU::S_SETREG_IMM32_B32))
2884 .addImm(ExpertMode ? 2 : 0)
2885 .addImm(EncodedReg);
2886}
2887
2888namespace {
2889// TODO: Remove this work-around after fixing the scheduler.
2890// There are two reasons why vccz might be incorrect; see ST.hasReadVCCZBug()
2891// and ST.partialVCCWritesUpdateVCCZ().
2892// i. VCCZBug: There is a hardware bug on CI/SI where SMRD instruction may
2893// corrupt vccz bit, so when we detect that an instruction may read from
2894// a corrupt vccz bit, we need to:
2895// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2896// operations to complete.
2897// 2. Recompute the correct value of vccz by writing the current value
2898// of vcc back to vcc.
2899// ii. Partial writes to vcc don't update vccz, so we need to recompute the
2900// correct value of vccz by reading vcc and writing it back to vcc.
2901// No waitcnt is needed in this case.
2902class VCCZWorkaround {
2903 const WaitcntBrackets &ScoreBrackets;
2904 const GCNSubtarget &ST;
2905 const SIInstrInfo &TII;
2906 const SIRegisterInfo &TRI;
2907 bool VCCZCorruptionBug = false;
2908 bool VCCZNotUpdatedByPartialWrites = false;
2909 /// vccz could be incorrect at a basic block boundary if a predecessor wrote
2910 /// to vcc and then issued an smem load, so initialize to true.
2911 bool MustRecomputeVCCZ = true;
2912
2913public:
2914 VCCZWorkaround(const WaitcntBrackets &ScoreBrackets, const GCNSubtarget &ST,
2915 const SIInstrInfo &TII, const SIRegisterInfo &TRI)
2916 : ScoreBrackets(ScoreBrackets), ST(ST), TII(TII), TRI(TRI) {
2917 VCCZCorruptionBug = ST.hasReadVCCZBug();
2918 VCCZNotUpdatedByPartialWrites = !ST.partialVCCWritesUpdateVCCZ();
2919 }
2920 /// If \p MI reads vccz and we must recompute it based on MustRecomputeVCCZ,
2921 /// then emit a vccz recompute instruction before \p MI. This needs to be
2922 /// called on every instruction in the basic block because it also tracks the
2923 /// state and updates MustRecomputeVCCZ accordingly. Returns true if it
2924 /// modified the IR.
2925 bool tryRecomputeVCCZ(MachineInstr &MI) {
2926 // No need to run this if neither bug is present.
2927 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
2928 return false;
2929
2930 // If MI is an SMEM and it can corrupt vccz on this target, then we need
2931 // both to emit a waitcnt and to recompute vccz.
2932 // But we don't actually emit a waitcnt here. This is done in
2933 // generateWaitcntInstBefore() because it tracks all the necessary waitcnt
2934 // state, and can either skip emitting a waitcnt if there is already one in
2935 // the IR, or emit an "optimized" combined waitcnt.
2936 // If this is an smem read, it could complete and clobber vccz at any time.
2937 MustRecomputeVCCZ |= VCCZCorruptionBug && TII.isSMRD(MI);
2938
2939 // If the target partial vcc writes don't update vccz, and MI is such an
2940 // instruction then we must recompute vccz.
2941 // Note: We are using PartiallyWritesToVCCOpt optional to avoid calling
2942 // `definesRegister()` more than needed, because it's not very cheap.
2943 std::optional<bool> PartiallyWritesToVCCOpt;
2944 auto PartiallyWritesToVCC = [](MachineInstr &MI) {
2945 return MI.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2946 MI.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr);
2947 };
2948 if (VCCZNotUpdatedByPartialWrites) {
2949 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
2950 // If this is a partial VCC write but won't update vccz, then we must
2951 // recompute vccz.
2952 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
2953 }
2954
2955 // If MI is a vcc write with no pending smem, or there is a pending smem
2956 // but the target does not suffer from the vccz corruption bug, then we
2957 // don't need to recompute vccz as this write will recompute it anyway.
2958 if (!ScoreBrackets.hasPendingEvent(HWEvent::SMEM_ACCESS) ||
2959 !VCCZCorruptionBug) {
2960 // Compute PartiallyWritesToVCCOpt if we haven't done so already.
2961 if (!PartiallyWritesToVCCOpt)
2962 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(MI);
2963 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
2964 MI.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr);
2965 // If we write to the full vcc or we write partially and the target
2966 // updates vccz on partial writes, then vccz will be updated correctly.
2967 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
2968 *PartiallyWritesToVCCOpt);
2969 if (UpdatesVCCZ)
2970 MustRecomputeVCCZ = false;
2971 }
2972
2973 // If MI is a branch that reads VCCZ then emit a waitcnt and a vccz
2974 // restore instruction if either is needed.
2975 if (SIInstrInfo::isCBranchVCCZRead(MI) && MustRecomputeVCCZ) {
2976 // Recompute the vccz bit. Any time a value is written to vcc, the vccz
2977 // bit is updated, so we can restore the bit by reading the value of vcc
2978 // and then writing it back to the register.
2979 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2980 TII.get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2981 TRI.getVCC())
2982 .addReg(TRI.getVCC());
2983 MustRecomputeVCCZ = false;
2984 return true;
2985 }
2986 return false;
2987 }
2988};
2989
2990} // namespace
2991
2992// Generate s_waitcnt instructions where needed.
2993bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2994 MachineBasicBlock &Block,
2995 WaitcntBrackets &ScoreBrackets) {
2996 bool Modified = false;
2997
2998 LLVM_DEBUG({
2999 dbgs() << "*** Begin Block: ";
3000 Block.printName(dbgs());
3001 ScoreBrackets.dump();
3002 });
3003 VCCZWorkaround VCCZW(ScoreBrackets, ST, TII, TRI);
3004
3005 // Walk over the instructions.
3006 MachineInstr *OldWaitcntInstr = nullptr;
3007
3008 // NOTE: We may append instrs after Inst while iterating.
3009 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
3010 E = Block.instr_end();
3011 Iter != E; ++Iter) {
3012 MachineInstr &Inst = *Iter;
3013 if (isNonWaitcntMetaInst(Inst))
3014 continue;
3015 // Track pre-existing waitcnts that were added in earlier iterations or by
3016 // the memory legalizer.
3017 if (isWaitInstr(Inst) ||
3018 (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3019 if (!OldWaitcntInstr)
3020 OldWaitcntInstr = &Inst;
3021 continue;
3022 }
3023
3024 PreheaderFlushFlags FlushFlags;
3025 if (Block.getFirstTerminator() == Inst)
3026 FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3027
3028 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
3029 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3030 FlushFlags);
3031 OldWaitcntInstr = nullptr;
3032
3033 if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
3034 // Asyncmarks record the current wait state and so should not allow
3035 // waitcnts that occur after them to be merged into waitcnts that occur
3036 // before.
3037 ScoreBrackets.recordAsyncMark(Inst);
3038 continue;
3039 }
3040
3041 if (TII.isSMRD(Inst)) {
3042 for (const MachineMemOperand *Memop : Inst.memoperands()) {
3043 // No need to handle invariant loads when avoiding WAR conflicts, as
3044 // there cannot be a vector store to the same memory location.
3045 if (!Memop->isInvariant()) {
3046 const Value *Ptr = Memop->getValue();
3047 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
3048 }
3049 }
3050 }
3051
3052 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3053
3054 // Note: insertForcedWaitAfter() may add instrs after Iter that need to be
3055 // visited by the loop.
3056 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
3057
3058 LLVM_DEBUG({
3059 Inst.print(dbgs());
3060 ScoreBrackets.dump();
3061 });
3062
3063 // If the target suffers from the vccz bugs, this may emit the necessary
3064 // vccz recompute instruction before \p Inst if needed.
3065 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3066 }
3067
3068 // Flush counters at the end of the block if needed (for preheaders with no
3069 // terminator).
3070 AMDGPU::Waitcnt Wait;
3071 if (Block.getFirstTerminator() == Block.end()) {
3072 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
3073 if (FlushFlags.FlushVmCnt) {
3074 if (ScoreBrackets.hasPendingEvent(AMDGPU::LOAD_CNT))
3075 Wait.set(AMDGPU::LOAD_CNT, 0);
3076 if (ScoreBrackets.hasPendingEvent(AMDGPU::SAMPLE_CNT))
3077 Wait.set(AMDGPU::SAMPLE_CNT, 0);
3078 if (ScoreBrackets.hasPendingEvent(AMDGPU::BVH_CNT))
3079 Wait.set(AMDGPU::BVH_CNT, 0);
3080 }
3081 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(AMDGPU::DS_CNT))
3082 Wait.set(AMDGPU::DS_CNT, 0);
3083 }
3084
3085 // Combine or remove any redundant waitcnts at the end of the block.
3086 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
3087 OldWaitcntInstr);
3088
3089 LLVM_DEBUG({
3090 dbgs() << "*** End Block: ";
3091 Block.printName(dbgs());
3092 ScoreBrackets.dump();
3093 });
3094
3095 return Modified;
3096}
3097
3098bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
3099 if (Block.size() <= 1)
3100 return false;
3101 // The Memory Legalizer conservatively inserts a soft xcnt before each
3102 // atomic RMW operation. However, for sequences of back-to-back atomic
3103 // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
3104 // the redundant soft xcnts.
3105 bool Modified = false;
3106 // Remember the last atomic with a soft xcnt right before it.
3107 MachineInstr *LastAtomicWithSoftXcnt = nullptr;
3108
3109 for (MachineInstr &MI : drop_begin(Block)) {
3110 // Ignore last atomic if non-LDS VMEM and SMEM.
3111 bool IsLDS =
3112 TII.isDS(MI) || (TII.isFLAT(MI) && TII.mayAccessLDSThroughFlat(MI));
3113 if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
3114 LastAtomicWithSoftXcnt = nullptr;
3115
3116 bool IsAtomicRMW = (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) &&
3117 MI.mayLoad() && MI.mayStore();
3118 MachineInstr &PrevMI = *MI.getPrevNode();
3119 // This is an atomic with a soft xcnt.
3120 if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3121 // If we have already found an atomic with a soft xcnt, remove this soft
3122 // xcnt as it's redundant.
3123 if (LastAtomicWithSoftXcnt) {
3124 PrevMI.eraseFromParent();
3125 Modified = true;
3126 }
3127 LastAtomicWithSoftXcnt = &MI;
3128 }
3129 }
3130 return Modified;
3131}
3132
3133// Return flags indicating which counters should be flushed in the preheader.
3134PreheaderFlushFlags
3135SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
3136 const WaitcntBrackets &ScoreBrackets) {
3137 auto [Iterator, IsInserted] =
3138 PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
3139 if (!IsInserted)
3140 return Iterator->second;
3141
3142 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
3143 if (!Succ)
3144 return PreheaderFlushFlags();
3145
3146 MachineLoop *Loop = MLI.getLoopFor(Succ);
3147 if (!Loop)
3148 return PreheaderFlushFlags();
3149
3150 if (Loop->getLoopPreheader() == &MBB) {
3151 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3152 return Iterator->second;
3153 }
3154
3155 return PreheaderFlushFlags();
3156}
3157
3158bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
3160 return TII.mayAccessVMEMThroughFlat(MI);
3161 return SIInstrInfo::isVMEM(MI);
3162}
3163
3164bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
3165 return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
3166}
3167
3168// Check if instruction is a store to LDS that is counted via DSCNT
3169// (where that counter exists).
3170bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
3171 return MI.mayStore() && SIInstrInfo::isDS(MI);
3172}
3173
3174// Return flags indicating which counters should be flushed in the preheader of
3175// the given loop. We currently decide to flush in the following situations:
3176// For VMEM (FlushVmCnt):
3177// 1. The loop contains vmem store(s), no vmem load and at least one use of a
3178// vgpr containing a value that is loaded outside of the loop. (Only on
3179// targets with no vscnt counter).
3180// 2. The loop contains vmem load(s), but the loaded values are not used in the
3181// loop, and at least one use of a vgpr containing a value that is loaded
3182// outside of the loop.
3183// For DS (FlushDsCnt, GFX12+ only):
3184// 3. The loop contains no DS reads, and at least one use of a vgpr containing
3185// a value that is DS read outside of the loop.
3186// 4. The loop contains DS read(s), loaded values are not used in the same
3187// iteration but in the next iteration (prefetch pattern), and at least one
3188// use of a vgpr containing a value that is DS read outside of the loop.
3189// Flushing in preheader reduces wait overhead if the wait requirement in
3190// iteration 1 would otherwise be more strict (but unfortunately preheader
3191// flush decision is taken before knowing that).
3192// 5. (Single-block loops only) The loop has DS prefetch reads with flush point
3193// tracking. Some DS reads may be used in the same iteration (creating
3194// "flush points"), but others remain unflushed at the backedge. When a DS
3195// read is consumed in the same iteration, it and all prior reads are
3196// "flushed" (FIFO order). No DS writes are allowed in the loop.
3197// TODO: Find a way to extend to multi-block loops.
3198PreheaderFlushFlags
3199SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
3200 const WaitcntBrackets &Brackets) {
3201 PreheaderFlushFlags Flags;
3202 bool HasVMemLoad = false;
3203 bool HasVMemStore = false;
3204 bool UsesVgprVMEMLoadedOutside = false;
3205 bool UsesVgprDSReadOutside = false;
3206 bool VMemInvalidated = false;
3207 // DS optimization only applies to GFX12+ where DS_CNT is separate.
3208 // Tracking status for "no DS read in loop" or "pure DS prefetch
3209 // (use only in next iteration)".
3210 bool TrackSimpleDSOpt = ST.hasExtendedWaitCounts();
3211 DenseSet<MCRegUnit> VgprUse;
3212 DenseSet<MCRegUnit> VgprDefVMEM;
3213 DenseSet<MCRegUnit> VgprDefDS;
3214
3215 // Track DS reads for prefetch pattern with flush points (single-block only).
3216 // Keeps track of the last DS read (position counted from the top of the loop)
3217 // to each VGPR. Read is considered consumed (and thus needs flushing) if
3218 // the dest register has a use or is overwritten (by any later opertions).
3219 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3220 unsigned DSReadPosition = 0;
3221 bool IsSingleBlock = ML->getNumBlocks() == 1;
3222 bool TrackDSFlushPoint = ST.hasExtendedWaitCounts() && IsSingleBlock;
3223 unsigned LastDSFlushPosition = 0;
3224
3225 for (MachineBasicBlock *MBB : ML->blocks()) {
3226 for (MachineInstr &MI : *MBB) {
3227 if (isVMEMOrFlatVMEM(MI)) {
3228 HasVMemLoad |= MI.mayLoad();
3229 HasVMemStore |= MI.mayStore();
3230 }
3231 // TODO: Can we relax DSStore check? There may be cases where
3232 // these DS stores are drained prior to the end of MBB (or loop).
3233 if (mayStoreIncrementingDSCNT(MI)) {
3234 // Early exit if none of the optimizations are feasible.
3235 // Otherwise, set tracking status appropriately and continue.
3236 if (VMemInvalidated)
3237 return Flags;
3238 TrackSimpleDSOpt = false;
3239 TrackDSFlushPoint = false;
3240 }
3241 bool IsDSRead = isDSRead(MI);
3242 if (IsDSRead)
3243 ++DSReadPosition;
3244
3245 // Helper: if RU has a pending DS read, update LastDSFlushPosition
3246 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3247 if (!TrackDSFlushPoint)
3248 return;
3249 if (auto It = LastDSReadPositionMap.find(RU);
3250 It != LastDSReadPositionMap.end()) {
3251 // RU defined by DSRead is used or overwritten. Need to complete
3252 // the read, if not already implied by a later DSRead (to any RU)
3253 // needing to complete in FIFO order.
3254 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
3255 }
3256 };
3257
3258 for (const MachineOperand &Op : MI.all_uses()) {
3259 if (Op.isDebug() || !TRI.isVectorRegister(MRI, Op.getReg()))
3260 continue;
3261 // Vgpr use
3262 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3263 // If we find a register that is loaded inside the loop, 1. and 2.
3264 // are invalidated.
3265 if (VgprDefVMEM.contains(RU))
3266 VMemInvalidated = true;
3267
3268 // Check for DS reads used inside the loop
3269 if (VgprDefDS.contains(RU))
3270 TrackSimpleDSOpt = false;
3271
3272 // Early exit if all optimizations are invalidated
3273 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3274 return Flags;
3275
3276 // Check for flush points (DS read used in same iteration)
3277 updateDSReadFlushTracking(RU);
3278
3279 VgprUse.insert(RU);
3280 // Check if this register has a pending VMEM load from outside the
3281 // loop (value loaded outside and used inside).
3282 VMEMID ID = toVMEMID(RU);
3283 if (Brackets.hasPendingVMEM(ID, AMDGPU::LOAD_CNT) ||
3284 Brackets.hasPendingVMEM(ID, AMDGPU::SAMPLE_CNT) ||
3285 Brackets.hasPendingVMEM(ID, AMDGPU::BVH_CNT))
3286 UsesVgprVMEMLoadedOutside = true;
3287 // Check if loaded outside the loop via DS (not VMEM/FLAT).
3288 // Only consider it a DS read if there's no pending VMEM load for
3289 // this register, since FLAT can set both counters.
3290 else if (Brackets.hasPendingVMEM(ID, AMDGPU::DS_CNT))
3291 UsesVgprDSReadOutside = true;
3292 }
3293 }
3294
3295 // VMem load vgpr def
3296 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
3297 for (const MachineOperand &Op : MI.all_defs()) {
3298 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3299 // If we find a register that is loaded inside the loop, 1. and 2.
3300 // are invalidated.
3301 if (VgprUse.contains(RU))
3302 VMemInvalidated = true;
3303 VgprDefVMEM.insert(RU);
3304 }
3305 }
3306 // Early exit if all optimizations are invalidated
3307 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3308 return Flags;
3309 }
3310
3311 // DS read vgpr def
3312 // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RegNo).
3313 // If USE comes before DEF, it's the prefetch pattern (use value from
3314 // previous iteration, read for next iteration). We should still flush
3315 // in preheader so iteration 1 doesn't need to wait inside the loop.
3316 // Only invalidate when DEF comes before USE (same-iteration consumption,
3317 // checked above when processing uses).
3318 if (IsDSRead || TrackDSFlushPoint) {
3319 for (const MachineOperand &Op : MI.all_defs()) {
3320 if (!TRI.isVectorRegister(MRI, Op.getReg()))
3321 continue;
3322 for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
3323 // Check for overwrite of pending DS read (flush point) by any
3324 // instruction
3325 updateDSReadFlushTracking(RU);
3326 if (IsDSRead) {
3327 VgprDefDS.insert(RU);
3328 if (TrackDSFlushPoint)
3329 LastDSReadPositionMap[RU] = DSReadPosition;
3330 }
3331 }
3332 }
3333 }
3334 }
3335 }
3336
3337 // VMEM flush decision
3338 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3339 ((!ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3340 (HasVMemLoad && ST.hasVmemWriteVgprInOrder())))
3341 Flags.FlushVmCnt = true;
3342
3343 // DS flush decision:
3344 // Simple DS Opt: flush if loop uses DS read values from outside
3345 // and either has no DS reads in the loop, or DS reads whose results
3346 // are not used in the loop.
3347 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3348 // Prefetch with flush points: some DS reads used in same iteration,
3349 // but unflushed reads remain at backedge
3350 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3351 bool DSFlushPointPrefetch =
3352 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3353
3354 if (SimpleDSOpt || DSFlushPointPrefetch)
3355 Flags.FlushDsCnt = true;
3356
3357 return Flags;
3358}
3359
3360bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3361 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3362 auto &PDT =
3363 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3364 AliasAnalysis *AA = nullptr;
3365 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3366 AA = &AAR->getAAResults();
3367
3368 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3369}
3370
3371PreservedAnalyses
3374 auto &MLI = MFAM.getResult<MachineLoopAnalysis>(MF);
3375 auto &PDT = MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
3377 .getManager()
3378 .getCachedResult<AAManager>(MF.getFunction());
3379
3380 if (!SIInsertWaitcnts(MLI, PDT, AA, MF).run())
3381 return PreservedAnalyses::all();
3382
3385 .preserve<AAManager>();
3386}
3387
/// Run wait-count insertion over the machine function MF.
///
/// Visible steps, in order:
///  1. Set up hardware limits and pick the wait-count generator for the
///     subtarget.
///  2. For non-entry, non-naked functions, emit zero waits at the top of the
///     entry block and seed the entry block's incoming state.
///  3. Process dirty blocks repeatedly, propagating the resulting state to
///     successors, until no backedge requires another sweep.
///  4. If scalar stores are present, insert S_DCACHE_WB before wave
///     termination points that have not already flushed.
///  5. In expert mode, toggle the scheduling mode at entry, around calls and
///     before returns.
///  6. Insert VGPR deallocation before the recorded S_ENDPGM instructions.
///
/// \returns true if any instruction was inserted or changed.
3388bool SIInsertWaitcnts::run() {
3390
3392
3393 // Initialize hardware limits first, as they're needed by the generators.
3394 Limits = AMDGPU::HardwareLimits(IV);
3395
// Subtargets with extended wait counts use the GFX12+ generator; all others
// use the pre-GFX12 generator with the normal counter set.
3396 if (ST.hasExtendedWaitCounts()) {
3397 IsExpertMode = ST.hasExpertSchedulingMode() &&
3398 (ExpertSchedulingModeFlag.getNumOccurrences()
3400 : MF.getFunction()
3401 .getFnAttribute("amdgpu-expert-scheduling-mode")
3402 .getValueAsBool());
3403 MaxCounter = IsExpertMode ? AMDGPU::NUM_EXPERT_INST_CNTS
3405 // Initialize WCG per MF. It contains state that depends on MF attributes.
3406 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
3407 IsExpertMode);
3408 } else {
3409 MaxCounter = AMDGPU::NUM_NORMAL_INST_CNTS;
3410 // Initialize WCG per MF. It contains state that depends on MF attributes.
3411 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
3412 MF, AMDGPU::NUM_NORMAL_INST_CNTS, Limits);
3413 }
3414
// Look up once which counter SMEM accesses are tracked on.
3415 SmemAccessCounter = getCounterFromEvent(HWEvent::SMEM_ACCESS);
3416
3417 bool Modified = false;
3418
3419 MachineBasicBlock &EntryBB = MF.front();
3420
3421 if (!MFI->isEntryFunction() &&
3422 !MF.getFunction().hasFnAttribute(Attribute::Naked)) {
3423 // Wait for any outstanding memory operations that the input registers may
3424 // depend on. We can't track them and it's better to do the wait after the
3425 // costly call sequence.
3426
3427 // TODO: Could insert earlier and schedule more liberally with operations
3428 // that only use caller preserved registers.
// Skip leading meta instructions so the waits land before the first real
// instruction of the entry block.
3430 while (I != EntryBB.end() && I->isMetaInstruction())
3431 ++I;
3432
3433 if (ST.hasExtendedWaitCounts()) {
// One combined load/DS wait, then a separate zero wait for each remaining
// counter that is not skipped below.
// NOTE(review): the loop header over counter types (CT) is not visible in
// this listing -- confirm its range against the full source.
3434 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
3435 .addImm(0);
3437 if (CT == AMDGPU::LOAD_CNT || CT == AMDGPU::DS_CNT ||
3438 CT == AMDGPU::STORE_CNT || CT == AMDGPU::X_CNT ||
3440 continue;
3441
// Without image instructions, no wait is emitted for the export, sample and
// BVH counters.
3442 if (!ST.hasImageInsts() &&
3443 (CT == AMDGPU::EXP_CNT || CT == AMDGPU::SAMPLE_CNT ||
3444 CT == AMDGPU::BVH_CNT))
3445 continue;
3446
3447 BuildMI(EntryBB, I, DebugLoc(),
3448 TII.get(instrsForExtendedCounterTypes[CT]))
3449 .addImm(0);
3450 }
3451 if (IsExpertMode) {
3452 unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, ST);
3454 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3455 .addImm(Enc);
3456 }
3457 } else {
3458 BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT)).addImm(0);
3459 }
3460
// Seed the entry block's incoming state with the function-entry state.
3461 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
3462 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3463 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3464
3465 Modified = true;
3466 }
3467
3468 // Keep iterating over the blocks in reverse post order, inserting and
3469 // updating s_waitcnt where needed, until a fix point is reached.
// NOTE(review): this relies on BlockInfos iterating in insertion order --
// confirm against its declaration.
3470 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3471 BlockInfos.try_emplace(MBB);
3472
3473 std::unique_ptr<WaitcntBrackets> Brackets;
3474 bool Repeat;
3475 do {
3476 Repeat = false;
3477
3478 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3479 ++BII) {
3480 MachineBasicBlock *MBB = BII->first;
3481 BlockInfo &BI = BII->second;
// Only blocks marked dirty need to be (re)processed.
3482 if (!BI.Dirty)
3483 continue;
3484
// Start from the block's recorded incoming state if there is one, otherwise
// from a freshly constructed state; the Brackets allocation is reused.
3485 if (BI.Incoming) {
3486 if (!Brackets)
3487 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3488 else
3489 *Brackets = *BI.Incoming;
3490 } else {
3491 if (!Brackets) {
3492 Brackets = std::make_unique<WaitcntBrackets>(this);
3493 } else {
3494 // Reinitialize in-place. N.B. do not do this by assigning from a
3495 // temporary because the WaitcntBrackets class is large and it could
3496 // cause this function to use an unreasonable amount of stack space.
3497 Brackets->~WaitcntBrackets();
3498 new (Brackets.get()) WaitcntBrackets(this);
3499 }
3500 }
3501
3502 if (ST.hasWaitXcnt())
3503 Modified |= removeRedundantSoftXcnts(*MBB);
3504 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
3505 BI.Dirty = false;
3506
// Propagate the outgoing state to successors only when events are pending.
3507 if (Brackets->hasPendingEvent()) {
// The working state is moved into the first successor that has no incoming
// state yet; further such successors receive copies.
3508 BlockInfo *MoveBracketsToSucc = nullptr;
3509 for (MachineBasicBlock *Succ : MBB->successors()) {
3510 auto *SuccBII = BlockInfos.find(Succ);
3511 BlockInfo &SuccBI = SuccBII->second;
3512 if (!SuccBI.Incoming) {
3513 SuccBI.Dirty = true;
// A successor at or before the current position has already been passed in
// this sweep, so another sweep is required.
3514 if (SuccBII <= BII) {
3515 LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
3516 Repeat = true;
3517 }
3518 if (!MoveBracketsToSucc) {
3519 MoveBracketsToSucc = &SuccBI;
3520 } else {
3521 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3522 }
3523 } else {
3524 LLVM_DEBUG({
3525 dbgs() << "Try to merge ";
3526 MBB->printName(dbgs());
3527 dbgs() << " into ";
3528 Succ->printName(dbgs());
3529 dbgs() << '\n';
3530 });
// The successor is re-marked dirty only when merge() returns true.
3531 if (SuccBI.Incoming->merge(*Brackets)) {
3532 SuccBI.Dirty = true;
3533 if (SuccBII <= BII) {
3534 LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
3535 Repeat = true;
3536 }
3537 }
3538 }
3539 }
// Done after the loop so the copies above are taken before Brackets is
// moved from.
3540 if (MoveBracketsToSucc)
3541 MoveBracketsToSucc->Incoming = std::move(Brackets);
3542 }
3543 }
3544 } while (Repeat);
3545
3546 if (ST.hasScalarStores()) {
3547 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3548 bool HaveScalarStores = false;
3549
// Single scan: note whether any scalar store exists and collect the blocks
// that contain a wave termination instruction.
3550 for (MachineBasicBlock &MBB : MF) {
3551 for (MachineInstr &MI : MBB) {
3552 if (!HaveScalarStores && TII.isScalarStore(MI))
3553 HaveScalarStores = true;
3554
3555 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
3556 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3557 EndPgmBlocks.push_back(&MBB);
3558 }
3559 }
3560
3561 if (HaveScalarStores) {
3562 // If scalar writes are used, the cache must be flushed or else the next
3563 // wave to reuse the same scratch memory can be clobbered.
3564 //
3565 // Insert s_dcache_wb at wave termination points if there were any scalar
3566 // stores, and only if the cache hasn't already been flushed. This could
3567 // be improved by looking across blocks for flushes in postdominating
3568 // blocks from the stores but an explicitly requested flush is probably
3569 // very rare.
3570 for (MachineBasicBlock *MBB : EndPgmBlocks) {
3571 bool SeenDCacheWB = false;
3572
3573 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
3574 I != E; ++I) {
// A scalar store after a flush invalidates that flush.
3575 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
3576 SeenDCacheWB = true;
3577 else if (TII.isScalarStore(*I))
3578 SeenDCacheWB = false;
3579
3580 // FIXME: It would be better to insert this before a waitcnt if any.
3581 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
3582 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3583 !SeenDCacheWB) {
3584 Modified = true;
3585 BuildMI(*MBB, I, I->getDebugLoc(), TII.get(AMDGPU::S_DCACHE_WB));
3586 }
3587 }
3588 }
3589 }
3590 }
3591
3592 if (IsExpertMode) {
3593 // Enable expert scheduling on function entry. To satisfy ABI requirements
3594 // and to allow calls between functions with different expert scheduling
3595 // settings, disable it around calls and before returns.
3596
3598 while (I != EntryBB.end() && I->isMetaInstruction())
3599 ++I;
3600 setSchedulingMode(EntryBB, I, true);
3601
// Disable before each call and re-enable immediately after it.
3602 for (MachineInstr *MI : CallInsts) {
3603 MachineBasicBlock &MBB = *MI->getParent();
3604 setSchedulingMode(MBB, MI, false);
3605 setSchedulingMode(MBB, std::next(MI->getIterator()), true);
3606 }
3607
3608 for (MachineInstr *MI : ReturnInsts)
3609 setSchedulingMode(*MI->getParent(), MI, false);
3610
3611 Modified = true;
3612 }
3613
3614 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
3615 // This is done in different ways depending on how the VGPRs were allocated
3616 // (i.e. whether we're in dynamic VGPR mode or not).
3617 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
3618 // waveslot limited kernel runs slower with the deallocation.
3619 if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
// Dynamic VGPR mode: emit S_ALLOC_VGPR 0 before every recorded S_ENDPGM.
3620 for (auto [MI, _] : EndPgmInsts) {
3621 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3622 TII.get(AMDGPU::S_ALLOC_VGPR))
3623 .addImm(0);
3624 Modified = true;
3625 }
3626 } else if (!WCG->isOptNone() &&
3627 ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
3628 (MF.getFrameInfo().hasCalls() ||
3629 ST.getOccupancyWithNumVGPRs(
3630 TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
3631 /*IsDynamicVGPR=*/false) <
// Otherwise: emit an S_SENDMSG (preceded by S_NOP 0 where the subtarget
// requires it) before each recorded S_ENDPGM whose flag is set.
// NOTE(review): the message operand and the right-hand side of the occupancy
// comparison are not visible in this listing.
3633 for (auto [MI, Flag] : EndPgmInsts) {
3634 if (Flag) {
3635 if (ST.requiresNopBeforeDeallocVGPRs()) {
3636 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3637 TII.get(AMDGPU::S_NOP))
3638 .addImm(0);
3639 }
3640 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3641 TII.get(AMDGPU::S_SENDMSG))
3643 Modified = true;
3644 }
3645 }
3646 }
3647
3648 return Modified;
3649}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
#define _
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
AMDGPU::HWEventSet HWEventSet
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
AMDGPU::HWEvent HWEvent
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:484
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
void remove(const HWEvent &Event)
bool contains(const HWEvent &Event) const
void insert(const HWEvent &Event)
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:275
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
bool erase(const KeyT &Val)
Definition DenseMap.h:379
iterator end()
Definition DenseMap.h:143
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:759
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:724
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
mop_range operands()
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator begin()
Definition MapVector.h:67
iterator find(const KeyT &Key)
Definition MapVector.h:156
iterator end()
Definition MapVector.h:69
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition Pass.cpp:140
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool usesTENSOR_CNT(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
Target - Wrapper for Target specific information.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:185
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getHasMatrixScale(unsigned Opc)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST, bool IsExpertMode)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
static constexpr StringLiteral toString(HWEvent Event)
std::optional< AMDGPU::InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
HWEvent
TODO: This should be a bitmask from the start instead of having this enum.
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
iota_range< HWEvent > hw_events(HWEvent MaxEvent=HWEvent::NUM_WAIT_EVENTS)
Return an iterator over all events between FIRST_WAIT_EVENT and MaxEvent (exclusive,...
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
bool empty() const
Definition BasicBlock.h:101
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
Definition Sequence.h:325
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Wait
Definition Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2312
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
Definition CodeGen.h:157
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
#define N
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.