1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
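// Illustrative example of the TODO above (hand-written; not output of this
// pass): suppose an SMEM load and two LDS reads are outstanding and only the
// first LDS result is needed. A per-event-type scheme could emit
// s_waitcnt lgkmcnt(1), but with one shared lgkm timeline the pass emits the
// conservative lgkmcnt(0):
//
//   s_load_dword s0, s[4:5], 0x0      ; SMEM, increments lgkmcnt
//   ds_read_b32  v1, v2               ; LDS, increments lgkmcnt (needed below)
//   ds_read_b32  v3, v4               ; LDS, increments lgkmcnt
//   s_waitcnt    lgkmcnt(0)           ; lgkmcnt(1) would already suffice
//   v_add_u32    v5, v1, v1           ; only uses the first LDS result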
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39using namespace llvm;
40
41#define DEBUG_TYPE "si-insert-waitcnts"
42
43DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
44 "Force emit s_waitcnt expcnt(0) instrs");
45DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
46 "Force emit s_waitcnt lgkmcnt(0) instrs");
47DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
48 "Force emit s_waitcnt vmcnt(0) instrs");
49
51 "amdgpu-waitcnt-forcezero",
52 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
53 cl::init(false), cl::Hidden);
54
55namespace {
56// Class of object that encapsulates the latest instruction counter score
57// associated with an operand. Used for determining whether an
58// s_waitcnt instruction needs to be emitted.
59
60#define CNT_MASK(t) (1u << (t))
61
62enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
63} // namespace
64
65namespace llvm {
66template <> struct enum_iteration_traits<InstCounterType> {
67 static constexpr bool is_iterable = true;
68};
69} // namespace llvm
70
71namespace {
72auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); }
73
74using RegInterval = std::pair<int, int>;
75
76struct HardwareLimits {
77 unsigned VmcntMax;
78 unsigned ExpcntMax;
79 unsigned LgkmcntMax;
80 unsigned VscntMax;
81};
82
83struct RegisterEncoding {
84 unsigned VGPR0;
85 unsigned VGPRL;
86 unsigned SGPR0;
87 unsigned SGPRL;
88};
89
90enum WaitEventType {
91 VMEM_ACCESS, // vector-memory read & write
92 VMEM_READ_ACCESS, // vector-memory read
93 VMEM_WRITE_ACCESS, // vector-memory write
94 LDS_ACCESS, // lds read & write
95 GDS_ACCESS, // gds read & write
96 SQ_MESSAGE, // send message
97 SMEM_ACCESS, // scalar-memory read & write
98 EXP_GPR_LOCK, // export holding on its data src
99 GDS_GPR_LOCK, // GDS holding on its data and addr src
100 EXP_POS_ACCESS, // write to export position
101 EXP_PARAM_ACCESS, // write to export parameter
102 VMW_GPR_LOCK, // vector-memory write holding on its data src
103 EXP_LDS_ACCESS, // read by ldsdir counting as export
104 NUM_WAIT_EVENTS,
105};
106
107static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
108 (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
109 (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
110 (1 << SQ_MESSAGE),
111 (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
112 (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
113 (1 << VMEM_WRITE_ACCESS)};
114
115// The mapping is:
116// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
117// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
118// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
119// We reserve a fixed number of VGPR slots in the scoring tables for
120// special tokens like SCMEM_LDS (needed for buffer load to LDS).
121enum RegisterMapping {
122 SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
123 AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
124 SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
125 NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
126 EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
127 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
128};
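// Worked example of the mapping above (illustrative): VGPR3 maps to slot 3,
// AGPR2 maps to slot AGPR_OFFSET + 2 = 258, the artificial LDS register maps
// to slot SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS = 512, and SGPR5 maps to slot
// NUM_ALL_VGPRS + 5 = 518 in the combined scoring tables.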
129
130// Enumerate different types of result-returning VMEM operations. Although
131// s_waitcnt orders them all with a single vmcnt counter, in the absence of
132// s_waitcnt only instructions of the same VmemType are guaranteed to write
133// their results in order -- so there is no need to insert an s_waitcnt between
134// two instructions of the same type that write the same vgpr.
135enum VmemType {
136 // BUF instructions and MIMG instructions without a sampler.
137 VMEM_NOSAMPLER,
138 // MIMG instructions with a sampler.
139 VMEM_SAMPLER,
140 // BVH instructions
141 VMEM_BVH
142};
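// For example (illustrative): two buffer loads writing the same vgpr are both
// VMEM_NOSAMPLER and need no s_waitcnt between them, whereas a buffer load
// followed by an image sample writing the same vgpr mixes VMEM_NOSAMPLER with
// VMEM_SAMPLER, so a vmcnt wait is required before the second write (see
// hasOtherPendingVmemTypes below).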
143
144static bool updateVMCntOnly(const MachineInstr &Inst) {
145 return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
146 SIInstrInfo::isFLATScratch(Inst);
147}
148
149VmemType getVmemType(const MachineInstr &Inst) {
150 assert(updateVMCntOnly(Inst));
151 if (!SIInstrInfo::isMIMG(Inst))
152 return VMEM_NOSAMPLER;
153 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
154 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
155 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
156 return BaseInfo->BVH ? VMEM_BVH
157 : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
158}
159
160void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
161 switch (T) {
162 case VM_CNT:
163 Wait.VmCnt = std::min(Wait.VmCnt, Count);
164 break;
165 case EXP_CNT:
166 Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
167 break;
168 case LGKM_CNT:
169 Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
170 break;
171 case VS_CNT:
172 Wait.VsCnt = std::min(Wait.VsCnt, Count);
173 break;
174 default:
175 llvm_unreachable("bad InstCounterType");
176 }
177}
178
179// This object maintains the current score brackets of each wait counter, and
180// a per-register scoreboard for each wait counter.
181//
182// We also maintain the latest score for every event type that can change the
183// waitcnt, in order to know whether there are multiple types of events within
184// the brackets. When multiple types of event happen in the bracket, the
185// wait count may be decremented out of order, so we need to put in an
186// "s_waitcnt 0" before such a use.
187class WaitcntBrackets {
188public:
189 WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
190 RegisterEncoding Encoding)
191 : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}
192
193 unsigned getWaitCountMax(InstCounterType T) const {
194 switch (T) {
195 case VM_CNT:
196 return Limits.VmcntMax;
197 case LGKM_CNT:
198 return Limits.LgkmcntMax;
199 case EXP_CNT:
200 return Limits.ExpcntMax;
201 case VS_CNT:
202 return Limits.VscntMax;
203 default:
204 break;
205 }
206 return 0;
207 }
208
209 unsigned getScoreLB(InstCounterType T) const {
210 assert(T < NUM_INST_CNTS);
211 return ScoreLBs[T];
212 }
213
214 unsigned getScoreUB(InstCounterType T) const {
215 assert(T < NUM_INST_CNTS);
216 return ScoreUBs[T];
217 }
218
219 unsigned getScoreRange(InstCounterType T) const {
220 return getScoreUB(T) - getScoreLB(T);
221 }
222
223 // Mapping from event to counter.
224 InstCounterType eventCounter(WaitEventType E) const {
225 for (auto T : inst_counter_types()) {
226 if (WaitEventMaskForInst[T] & (1 << E))
227 return T;
228 }
229 llvm_unreachable("event type has no associated counter");
230 }
231
232 unsigned getRegScore(int GprNo, InstCounterType T) const {
233 if (GprNo < NUM_ALL_VGPRS) {
234 return VgprScores[T][GprNo];
235 }
236 assert(T == LGKM_CNT);
237 return SgprScores[GprNo - NUM_ALL_VGPRS];
238 }
239
240 bool merge(const WaitcntBrackets &Other);
241
242 RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
243 const MachineRegisterInfo *MRI,
244 const SIRegisterInfo *TRI, unsigned OpNo) const;
245
246 bool counterOutOfOrder(InstCounterType T) const;
247 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
248 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
249 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
250 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
251 void applyWaitcnt(InstCounterType T, unsigned Count);
252 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
253 const MachineRegisterInfo *MRI, WaitEventType E,
254 MachineInstr &Inst);
255
256 unsigned hasPendingEvent() const { return PendingEvents; }
257 unsigned hasPendingEvent(WaitEventType E) const {
258 return PendingEvents & (1 << E);
259 }
260 unsigned hasPendingEvent(InstCounterType T) const {
261 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
262 assert((HasPending != 0) == (getScoreRange(T) != 0));
263 return HasPending;
264 }
265
266 bool hasMixedPendingEvents(InstCounterType T) const {
267 unsigned Events = hasPendingEvent(T);
268 // Return true if more than one bit is set in Events.
269 return Events & (Events - 1);
270 }
271
272 bool hasPendingFlat() const {
273 return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
274 LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
275 (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
276 LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
277 }
278
279 void setPendingFlat() {
280 LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
281 LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
282 }
283
284 // Return true if there might be pending writes to the specified vgpr by VMEM
285 // instructions with types different from V.
286 bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
287 assert(GprNo < NUM_ALL_VGPRS);
288 return VgprVmemTypes[GprNo] & ~(1 << V);
289 }
290
291 void clearVgprVmemTypes(int GprNo) {
292 assert(GprNo < NUM_ALL_VGPRS);
293 VgprVmemTypes[GprNo] = 0;
294 }
295
296 void print(raw_ostream &);
297 void dump() { print(dbgs()); }
298
299private:
300 struct MergeInfo {
301 unsigned OldLB;
302 unsigned OtherLB;
303 unsigned MyShift;
304 unsigned OtherShift;
305 };
306 static bool mergeScore(const MergeInfo &M, unsigned &Score,
307 unsigned OtherScore);
308
309 void setScoreLB(InstCounterType T, unsigned Val) {
310 assert(T < NUM_INST_CNTS);
311 ScoreLBs[T] = Val;
312 }
313
314 void setScoreUB(InstCounterType T, unsigned Val) {
315 assert(T < NUM_INST_CNTS);
316 ScoreUBs[T] = Val;
317
318 if (T != EXP_CNT)
319 return;
320
321 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
322 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
323 }
324
325 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
326 if (GprNo < NUM_ALL_VGPRS) {
327 VgprUB = std::max(VgprUB, GprNo);
328 VgprScores[T][GprNo] = Val;
329 } else {
330 assert(T == LGKM_CNT);
331 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
332 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
333 }
334 }
335
336 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
337 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
338 unsigned OpNo, unsigned Val);
339
340 const GCNSubtarget *ST = nullptr;
341 HardwareLimits Limits = {};
342 RegisterEncoding Encoding = {};
343 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
344 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
345 unsigned PendingEvents = 0;
346 // Remember the last flat memory operation.
347 unsigned LastFlat[NUM_INST_CNTS] = {0};
348 // wait_cnt scores for every vgpr.
349 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
350 int VgprUB = -1;
351 int SgprUB = -1;
352 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
353 // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
354 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
355 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
356 // write to each vgpr.
357 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
358};
359
360class SIInsertWaitcnts : public MachineFunctionPass {
361private:
362 const GCNSubtarget *ST = nullptr;
363 const SIInstrInfo *TII = nullptr;
364 const SIRegisterInfo *TRI = nullptr;
365 const MachineRegisterInfo *MRI = nullptr;
366 AMDGPU::IsaVersion IV;
367
368 DenseSet<MachineInstr *> TrackedWaitcntSet;
369 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
370 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
371 MachineLoopInfo *MLI;
372 MachinePostDominatorTree *PDT;
373
374 struct BlockInfo {
375 MachineBasicBlock *MBB;
376 std::unique_ptr<WaitcntBrackets> Incoming;
377 bool Dirty = true;
378
379 explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
380 };
381
382 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
383
384 // ForceEmitZeroWaitcnts: force all waitcnt instrs to be s_waitcnt 0
385 // because of amdgpu-waitcnt-forcezero flag
386 bool ForceEmitZeroWaitcnts;
387 bool ForceEmitWaitcnt[NUM_INST_CNTS];
388
389public:
390 static char ID;
391
392 SIInsertWaitcnts() : MachineFunctionPass(ID) {
393 (void)ForceExpCounter;
394 (void)ForceLgkmCounter;
395 (void)ForceVMCounter;
396 }
397
398 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
399 bool isPreheaderToFlush(MachineBasicBlock &MBB,
400 WaitcntBrackets &ScoreBrackets);
401 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
402 bool runOnMachineFunction(MachineFunction &MF) override;
403
404 StringRef getPassName() const override {
405 return "SI insert wait instructions";
406 }
407
408 void getAnalysisUsage(AnalysisUsage &AU) const override {
409 AU.setPreservesCFG();
410 AU.addRequired<MachineLoopInfo>();
411 AU.addRequired<MachinePostDominatorTree>();
412 MachineFunctionPass::getAnalysisUsage(AU);
413 }
414
415 bool isForceEmitWaitcnt() const {
416 for (auto T : inst_counter_types())
417 if (ForceEmitWaitcnt[T])
418 return true;
419 return false;
420 }
421
422 AMDGPU::Waitcnt allZeroWaitcnt() const {
423 return AMDGPU::Waitcnt::allZero(ST->hasVscnt());
424 }
425
426 void setForceEmitWaitcnt() {
427// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
428// For debug builds, get the debug counter info and adjust if need be
429#ifndef NDEBUG
430 if (DebugCounter::isCounterSet(ForceExpCounter) &&
431 DebugCounter::shouldExecute(ForceExpCounter)) {
432 ForceEmitWaitcnt[EXP_CNT] = true;
433 } else {
434 ForceEmitWaitcnt[EXP_CNT] = false;
435 }
436
437 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
438 DebugCounter::shouldExecute(ForceLgkmCounter)) {
439 ForceEmitWaitcnt[LGKM_CNT] = true;
440 } else {
441 ForceEmitWaitcnt[LGKM_CNT] = false;
442 }
443
444 if (DebugCounter::isCounterSet(ForceVMCounter) &&
445 DebugCounter::shouldExecute(ForceVMCounter)) {
446 ForceEmitWaitcnt[VM_CNT] = true;
447 } else {
448 ForceEmitWaitcnt[VM_CNT] = false;
449 }
450#endif // NDEBUG
451 }
452
453 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
454 // FLAT instruction.
455 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
456 assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
457 if (!ST->hasVscnt())
458 return VMEM_ACCESS;
459 if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst))
460 return VMEM_WRITE_ACCESS;
461 return VMEM_READ_ACCESS;
462 }
463
464 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
465 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
466 bool generateWaitcntInstBefore(MachineInstr &MI,
467 WaitcntBrackets &ScoreBrackets,
468 MachineInstr *OldWaitcntInstr,
469 bool FlushVmCnt);
470 bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
471 WaitcntBrackets &ScoreBrackets,
472 MachineInstr *OldWaitcntInstr);
473 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
474 MachineBasicBlock::instr_iterator It,
475 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
476 MachineInstr *OldWaitcntInstr);
477 void updateEventWaitcntAfter(MachineInstr &Inst,
478 WaitcntBrackets *ScoreBrackets);
479 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
480 WaitcntBrackets &ScoreBrackets);
481 bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
482 MachineInstr &OldWaitcntInstr,
483 AMDGPU::Waitcnt &Wait,
484 MachineBasicBlock::instr_iterator It) const;
485};
486
487} // end anonymous namespace
488
489RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
490 const SIInstrInfo *TII,
491 const MachineRegisterInfo *MRI,
492 const SIRegisterInfo *TRI,
493 unsigned OpNo) const {
494 const MachineOperand &Op = MI->getOperand(OpNo);
495 if (!TRI->isInAllocatableClass(Op.getReg()))
496 return {-1, -1};
497
498 // A use via a PW operand does not need a waitcnt.
499 // A partial write is not a WAW.
500 assert(!Op.getSubReg() || !Op.isUndef());
501
502 RegInterval Result;
503
504 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
505
506 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
507 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
508 Result.first = Reg - Encoding.VGPR0;
509 if (TRI->isAGPR(*MRI, Op.getReg()))
510 Result.first += AGPR_OFFSET;
511 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
512 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
513 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
514 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
515 assert(Result.first >= NUM_ALL_VGPRS &&
516 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
517 }
518 // TODO: Handle TTMP
519 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
520 else
521 return {-1, -1};
522
523 const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
524 unsigned Size = TRI->getRegSizeInBits(*RC);
525 Result.second = Result.first + ((Size + 16) / 32);
526
527 return Result;
528}
529
530void WaitcntBrackets::setExpScore(const MachineInstr *MI,
531 const SIInstrInfo *TII,
532 const SIRegisterInfo *TRI,
533 const MachineRegisterInfo *MRI, unsigned OpNo,
534 unsigned Val) {
535 RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
536 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
537 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
538 setRegScore(RegNo, EXP_CNT, Val);
539 }
540}
541
542// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS they
543// write can be accessed. A load from LDS to VMEM does not need a wait.
544static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
545 return SIInstrInfo::isVALU(MI) &&
546 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) &&
547 MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
548}
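// Illustrative sequence for the rule above (hand-written):
//
//   buffer_load_dword ... lds      ; DMA write into LDS, tracked via vmcnt
//   s_waitcnt vmcnt(0)             ; required before the LDS data is read
//   ds_read_b32 v0, v1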
549
550void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
551 const SIRegisterInfo *TRI,
552 const MachineRegisterInfo *MRI,
553 WaitEventType E, MachineInstr &Inst) {
554 InstCounterType T = eventCounter(E);
555 unsigned CurrScore = getScoreUB(T) + 1;
556 if (CurrScore == 0)
557 report_fatal_error("InsertWaitcnt score wraparound");
558 // PendingEvents and ScoreUB need to be updated regardless of whether this
559 // event changes the score of a register or not.
560 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
561 PendingEvents |= 1 << E;
562 setScoreUB(T, CurrScore);
563
564 if (T == EXP_CNT) {
565 // Put score on the source vgprs. If this is a store, just use those
566 // specific register(s).
567 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
568 int AddrOpIdx =
569 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
570 // All GDS operations must protect their address register (same as
571 // export.)
572 if (AddrOpIdx != -1) {
573 setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
574 }
575
576 if (Inst.mayStore()) {
577 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
578 setExpScore(
579 &Inst, TII, TRI, MRI,
580 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
581 CurrScore);
582 }
583 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
584 setExpScore(&Inst, TII, TRI, MRI,
585 AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
586 AMDGPU::OpName::data1),
587 CurrScore);
588 }
589 } else if (SIInstrInfo::isAtomicRet(Inst) &&
590 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
591 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
592 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
593 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
594 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
595 Inst.getOpcode() != AMDGPU::DS_APPEND &&
596 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
597 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
598 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
599 const MachineOperand &Op = Inst.getOperand(I);
600 if (Op.isReg() && !Op.isDef() &&
601 TRI->isVectorRegister(*MRI, Op.getReg())) {
602 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
603 }
604 }
605 }
606 } else if (TII->isFLAT(Inst)) {
607 if (Inst.mayStore()) {
608 setExpScore(
609 &Inst, TII, TRI, MRI,
610 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
611 CurrScore);
612 } else if (SIInstrInfo::isAtomicRet(Inst)) {
613 setExpScore(
614 &Inst, TII, TRI, MRI,
615 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
616 CurrScore);
617 }
618 } else if (TII->isMIMG(Inst)) {
619 if (Inst.mayStore()) {
620 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
621 } else if (SIInstrInfo::isAtomicRet(Inst)) {
622 setExpScore(
623 &Inst, TII, TRI, MRI,
624 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
625 CurrScore);
626 }
627 } else if (TII->isMTBUF(Inst)) {
628 if (Inst.mayStore()) {
629 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
630 }
631 } else if (TII->isMUBUF(Inst)) {
632 if (Inst.mayStore()) {
633 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
634 } else if (SIInstrInfo::isAtomicRet(Inst)) {
635 setExpScore(
636 &Inst, TII, TRI, MRI,
637 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
638 CurrScore);
639 }
640 } else if (TII->isLDSDIR(Inst)) {
641 // LDSDIR instructions attach the score to the destination.
642 setExpScore(
643 &Inst, TII, TRI, MRI,
644 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
645 CurrScore);
646 } else {
647 if (TII->isEXP(Inst)) {
648 // For export the destination registers are really temps that
649 // can be used as the actual source after export patching, so
650 // we need to treat them like sources and set the EXP_CNT
651 // score.
652 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
653 MachineOperand &DefMO = Inst.getOperand(I);
654 if (DefMO.isReg() && DefMO.isDef() &&
655 TRI->isVGPR(*MRI, DefMO.getReg())) {
656 setRegScore(
657 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
658 EXP_CNT, CurrScore);
659 }
660 }
661 }
662 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
663 MachineOperand &MO = Inst.getOperand(I);
664 if (MO.isReg() && !MO.isDef() &&
665 TRI->isVectorRegister(*MRI, MO.getReg())) {
666 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
667 }
668 }
669 }
670#if 0 // TODO: check if this is handled by MUBUF code above.
671 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
672 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
673 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
674 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
675 unsigned OpNo;//TODO: find the OpNo for this operand;
676 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
677 for (int RegNo = Interval.first; RegNo < Interval.second;
678 ++RegNo) {
679 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
680 }
681#endif
682 } else {
683 // Match the score to the destination registers.
684 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
685 auto &Op = Inst.getOperand(I);
686 if (!Op.isReg() || !Op.isDef())
687 continue;
688 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
689 if (T == VM_CNT) {
690 if (Interval.first >= NUM_ALL_VGPRS)
691 continue;
692 if (updateVMCntOnly(Inst)) {
693 VmemType V = getVmemType(Inst);
694 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
695 VgprVmemTypes[RegNo] |= 1 << V;
696 }
697 }
698 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
699 setRegScore(RegNo, T, CurrScore);
700 }
701 }
702 if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
703 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
704 }
705 }
706}
707
708void WaitcntBrackets::print(raw_ostream &OS) {
709 OS << '\n';
710 for (auto T : inst_counter_types()) {
711 unsigned SR = getScoreRange(T);
712
713 switch (T) {
714 case VM_CNT:
715 OS << " VM_CNT(" << SR << "): ";
716 break;
717 case LGKM_CNT:
718 OS << " LGKM_CNT(" << SR << "): ";
719 break;
720 case EXP_CNT:
721 OS << " EXP_CNT(" << SR << "): ";
722 break;
723 case VS_CNT:
724 OS << " VS_CNT(" << SR << "): ";
725 break;
726 default:
727 OS << " UNKNOWN(" << SR << "): ";
728 break;
729 }
730
731 if (SR != 0) {
732 // Print vgpr scores.
733 unsigned LB = getScoreLB(T);
734
735 for (int J = 0; J <= VgprUB; J++) {
736 unsigned RegScore = getRegScore(J, T);
737 if (RegScore <= LB)
738 continue;
739 unsigned RelScore = RegScore - LB - 1;
740 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
741 OS << RelScore << ":v" << J << " ";
742 } else {
743 OS << RelScore << ":ds ";
744 }
745 }
746 // Also need to print sgpr scores for lgkm_cnt.
747 if (T == LGKM_CNT) {
748 for (int J = 0; J <= SgprUB; J++) {
749 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
750 if (RegScore <= LB)
751 continue;
752 unsigned RelScore = RegScore - LB - 1;
753 OS << RelScore << ":s" << J << " ";
754 }
755 }
756 }
757 OS << '\n';
758 }
759 OS << '\n';
760}
761
762/// Simplify the waitcnt, in the sense of removing redundant counts: any count
763/// that is already covered by the score brackets is reset to ~0u (no wait).
764void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
765 simplifyWaitcnt(VM_CNT, Wait.VmCnt);
766 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
767 simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
768 simplifyWaitcnt(VS_CNT, Wait.VsCnt);
769}
770
771void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
772 unsigned &Count) const {
773 // The number of outstanding events for this type, T, can be calculated
774 // as (UB - LB). If the current Count is greater than or equal to the number
775 // of outstanding events, then the wait for this counter is redundant.
776 if (Count >= getScoreRange(T))
777 Count = ~0u;
778}
779
780void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
781 AMDGPU::Waitcnt &Wait) const {
782 unsigned ScoreToWait = getRegScore(RegNo, T);
783
784 // If the score of src_operand falls within the bracket, we need an
785 // s_waitcnt instruction.
786 const unsigned LB = getScoreLB(T);
787 const unsigned UB = getScoreUB(T);
788 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
789 if ((T == VM_CNT || T == LGKM_CNT) &&
790 hasPendingFlat() &&
791 !ST->hasFlatLgkmVMemCountInOrder()) {
792 // If there is a pending FLAT operation, and this is a VMem or LGKM
793 // waitcnt and the target can report early completion, then we need
794 // to force a waitcnt 0.
795 addWait(Wait, T, 0);
796 } else if (counterOutOfOrder(T)) {
797 // The counter can get decremented out-of-order when there are
798 // multiple types of event in the bracket, so emit an s_waitcnt
799 // with a conservative value of 0 for this counter.
800 addWait(Wait, T, 0);
801 } else {
802 // If a counter has been maxed out avoid overflow by waiting for
803 // MAX(CounterType) - 1 instead.
804 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
805 addWait(Wait, T, NeededWait);
806 }
807 }
808}
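// Example for determineWait() above (illustrative numbers): with a vmcnt
// limit of 63, a register whose score is UB - 100 would ask for vmcnt(100),
// which is not encodable, so the wait is clamped to
// getWaitCountMax(VM_CNT) - 1, i.e. vmcnt(62).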
809
810void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
811 applyWaitcnt(VM_CNT, Wait.VmCnt);
812 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
813 applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
814 applyWaitcnt(VS_CNT, Wait.VsCnt);
815}
816
817void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
818 const unsigned UB = getScoreUB(T);
819 if (Count >= UB)
820 return;
821 if (Count != 0) {
822 if (counterOutOfOrder(T))
823 return;
824 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
825 } else {
826 setScoreLB(T, UB);
827 PendingEvents &= ~WaitEventMaskForInst[T];
828 }
829}
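// Example for applyWaitcnt() above (illustrative numbers): applying vmcnt(2)
// with UB = 5 means at most two VMEM events may still be outstanding, so
// (when the counter cannot complete out of order) the lower bound is raised
// to UB - 2 = 3; applying vmcnt(0) instead sets LB = UB and clears all
// pending VM_CNT events.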
830
831// Where there are multiple types of event in the bracket of a counter,
832// the decrement may go out of order.
833bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
834 // Scalar memory reads can always complete out of order.
835 if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
836 return true;
837 return hasMixedPendingEvents(T);
838}
839
840INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
841 false)
842INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
843INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
844INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
845 false)
846
847char SIInsertWaitcnts::ID = 0;
848
849char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
850
851FunctionPass *llvm::createSIInsertWaitcntsPass() {
852 return new SIInsertWaitcnts();
853}
854
855static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
856 unsigned NewEnc) {
857 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
858 assert(OpIdx >= 0);
859
860 MachineOperand &MO = MI.getOperand(OpIdx);
861
862 if (NewEnc == MO.getImm())
863 return false;
864
865 MO.setImm(NewEnc);
866 return true;
867}
868
869/// Combine consecutive waitcnt instructions that precede \p It and follow
870/// \p OldWaitcntInstr and apply any extra waits from waitcnts that were added
871/// by previous passes. Currently this pass conservatively assumes that these
872/// preexisting waitcnts are required for correctness.
873bool SIInsertWaitcnts::applyPreexistingWaitcnt(
874 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
875 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
876 bool Modified = false;
877 MachineInstr *WaitcntInstr = nullptr;
878 MachineInstr *WaitcntVsCntInstr = nullptr;
879
880 for (auto &II :
881 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
882 if (II.isMetaInstruction())
883 continue;
884
885 if (II.getOpcode() == AMDGPU::S_WAITCNT) {
886 // Conservatively update required wait if this waitcnt was added in an
887 // earlier pass. In this case it will not exist in the tracked waitcnt
888 // set.
889 if (!TrackedWaitcntSet.count(&II)) {
890 unsigned IEnc = II.getOperand(0).getImm();
891 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
892 Wait = Wait.combined(OldWait);
893 }
894
895 // Merge consecutive waitcnt of the same type by erasing multiples.
896 if (!WaitcntInstr) {
897 WaitcntInstr = &II;
898 } else {
899 II.eraseFromParent();
900 Modified = true;
901 }
902
903 } else {
904 assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
905 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
906 if (!TrackedWaitcntSet.count(&II)) {
907 unsigned OldVSCnt =
908 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
909 Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
910 }
911
912 if (!WaitcntVsCntInstr) {
913 WaitcntVsCntInstr = &II;
914 } else {
915 II.eraseFromParent();
916 Modified = true;
917 }
918 }
919 }
920
921 // Update the encoding of the merged waitcnt with the required wait.
922 if (WaitcntInstr) {
923 if (Wait.hasWaitExceptVsCnt()) {
924 Modified |=
925 updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
926 AMDGPU::encodeWaitcnt(IV, Wait));
927 ScoreBrackets.applyWaitcnt(Wait);
928 Wait.VmCnt = ~0u;
929 Wait.LgkmCnt = ~0u;
930 Wait.ExpCnt = ~0u;
931
932 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
933 ? dbgs() << "applyPreexistingWaitcnt\n"
934 << "New Instr at block end: " << *WaitcntInstr
935 << '\n'
936 : dbgs() << "applyPreexistingWaitcnt\n"
937 << "Old Instr: " << *It
938 << "New Instr: " << *WaitcntInstr << '\n');
939
940 } else {
941 WaitcntInstr->eraseFromParent();
942 Modified = true;
943 }
944 }
945
946 if (WaitcntVsCntInstr) {
947 if (Wait.hasWaitVsCnt()) {
948 assert(ST->hasVscnt());
949 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
950 AMDGPU::OpName::simm16, Wait.VsCnt);
951 ScoreBrackets.applyWaitcnt(Wait);
952 Wait.VsCnt = ~0u;
953
954 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
955 ? dbgs() << "applyPreexistingWaitcnt\n"
956 << "New Instr at block end: "
957 << *WaitcntVsCntInstr << '\n'
958 : dbgs() << "applyPreexistingWaitcnt\n"
959 << "Old Instr: " << *It
960 << "New Instr: " << *WaitcntVsCntInstr << '\n');
961 } else {
962 WaitcntVsCntInstr->eraseFromParent();
963 Modified = true;
964 }
965 }
966
967 return Modified;
968}
969
970static bool readsVCCZ(const MachineInstr &MI) {
971 unsigned Opc = MI.getOpcode();
972 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
973 !MI.getOperand(1).isUndef();
974}
975
976/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
977static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
978 // Currently all conventions wait, but this may not always be the case.
979 //
980 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
981 // sense to omit the wait and do it in the caller.
982 return true;
983}
984
985/// \returns true if the callee is expected to wait for any outstanding waits
986/// before returning.
987static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
988 return true;
989}
990
991/// Generate an s_waitcnt instruction to be placed before \p MI.
992/// Instructions of a given type are returned in order,
993/// but instructions of different types can complete out of order.
994/// We rely on this in-order completion
995/// and simply assign a score to the memory access instructions.
996/// We keep track of the active "score bracket" to determine
997/// whether an access of a memory read requires an s_waitcnt
998/// and, if so, what the value of each counter is.
999/// The "score bracket" is bound by the lower bound and upper bound
1000/// scores (*_score_LB and *_score_ub respectively).
1001/// If FlushVmCnt is true, we want to generate an s_waitcnt to
1002/// flush the vmcnt counter here.
1003bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1004 WaitcntBrackets &ScoreBrackets,
1005 MachineInstr *OldWaitcntInstr,
1006 bool FlushVmCnt) {
1007 setForceEmitWaitcnt();
1008
1009 if (MI.isMetaInstruction())
1010 return false;
1011
1012 AMDGPU::Waitcnt Wait;
1013
1014 // FIXME: This should have already been handled by the memory legalizer.
1015 // Removing this currently doesn't affect any lit tests, but we need to
1016 // verify that nothing was relying on this. The number of buffer invalidates
1017 // being handled here should not be expanded.
1018 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1019 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1020 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1021 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1022 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1023 Wait.VmCnt = 0;
1024 }
1025
1026 // All waits must be resolved at call return.
1027 // NOTE: this could be improved with knowledge of all call sites or
1028 // with knowledge of the called routines.
1029 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1030 MI.getOpcode() == AMDGPU::SI_RETURN ||
1031 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1032 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1033 Wait = Wait.combined(allZeroWaitcnt());
1034 }
1035 // Resolve vm waits before gs-done.
1036 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1037 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1038 ST->hasLegacyGeometry() &&
1039 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1040 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1041 Wait.VmCnt = 0;
1042 }
1043#if 0 // TODO: the following blocks of logic when we have fence.
1044 else if (MI.getOpcode() == SC_FENCE) {
1045 const unsigned int group_size =
1046 context->shader_info->GetMaxThreadGroupSize();
1047 // group_size == 0 means thread group size is unknown at compile time
1048 const bool group_is_multi_wave =
1049 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
1050 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
1051
1052 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
1053 SCRegType src_type = Inst->GetSrcType(i);
1054 switch (src_type) {
1055 case SCMEM_LDS:
1056 if (group_is_multi_wave ||
1057 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
1058 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1059 ScoreBrackets->getScoreUB(LGKM_CNT));
1060 // LDS may have to wait for VM_CNT after buffer load to LDS
1061 if (target_info->HasBufferLoadToLDS()) {
1062 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1063 ScoreBrackets->getScoreUB(VM_CNT));
1064 }
1065 }
1066 break;
1067
1068 case SCMEM_GDS:
1069 if (group_is_multi_wave || fence_is_global) {
1070 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1071 ScoreBrackets->getScoreUB(EXP_CNT));
1072 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1073 ScoreBrackets->getScoreUB(LGKM_CNT));
1074 }
1075 break;
1076
1077 case SCMEM_UAV:
1078 case SCMEM_TFBUF:
1079 case SCMEM_RING:
1080 case SCMEM_SCATTER:
1081 if (group_is_multi_wave || fence_is_global) {
1082 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1083 ScoreBrackets->getScoreUB(EXP_CNT));
1084 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1085 ScoreBrackets->getScoreUB(VM_CNT));
1086 }
1087 break;
1088
1089 case SCMEM_SCRATCH:
1090 default:
1091 break;
1092 }
1093 }
1094 }
1095#endif
1096
1097 // Export & GDS instructions do not read the EXEC mask until after the export
1098 // is granted (which can occur well after the instruction is issued).
1099 // The shader program must flush all EXP operations on the export-count
1100 // before overwriting the EXEC mask.
1101 else {
1102 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1103 // Export and GDS are tracked individually, either may trigger a waitcnt
1104 // for EXEC.
1105 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1106 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1107 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1108 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1109 Wait.ExpCnt = 0;
1110 }
1111 }
1112
1113 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1114 // The function is going to insert a wait on everything in its prolog.
1115 // This still needs to be careful if the call target is a load (e.g. a GOT
1116 // load). We also need to check WAW dependency with saved PC.
1117 Wait = AMDGPU::Waitcnt();
1118
1119 int CallAddrOpIdx =
1120 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1121
1122 if (MI.getOperand(CallAddrOpIdx).isReg()) {
1123 RegInterval CallAddrOpInterval =
1124 ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
1125
1126 for (int RegNo = CallAddrOpInterval.first;
1127 RegNo < CallAddrOpInterval.second; ++RegNo)
1128 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
1129
1130 int RtnAddrOpIdx =
1131 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1132 if (RtnAddrOpIdx != -1) {
1133 RegInterval RtnAddrOpInterval =
1134 ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
1135
1136 for (int RegNo = RtnAddrOpInterval.first;
1137 RegNo < RtnAddrOpInterval.second; ++RegNo)
1138 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
1139 }
1140 }
1141 } else {
1142 // FIXME: Should not be relying on memoperands.
1143 // Look at the source operands of every instruction to see if
1144 // any of them results from a previous memory operation that affects
1145 // its current usage. If so, an s_waitcnt instruction needs to be
1146 // emitted.
1147 // If the source operand was defined by a load, add the s_waitcnt
1148 // instruction.
1149 //
1150 // Two cases are handled for destination operands:
1151 // 1) If the destination operand was defined by a load, add the s_waitcnt
1152 // instruction to guarantee the right WAW order.
1153 // 2) If a destination operand was used by a recent export/store instruction,
1154 // add an s_waitcnt on exp_cnt to guarantee the WAR order.
1155 for (const MachineMemOperand *Memop : MI.memoperands()) {
1156 const Value *Ptr = Memop->getValue();
1157 if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1158 addWait(Wait, LGKM_CNT, 0);
1159 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1160 SLoadAddresses.erase(Ptr);
1161 }
1162 unsigned AS = Memop->getAddrSpace();
1163 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1164 continue;
1165 // No need to wait before load from VMEM to LDS.
1166 if (mayWriteLDSThroughDMA(MI))
1167 continue;
1168 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1169 // VM_CNT is only relevant to vgpr or LDS.
1170 ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
1171 if (Memop->isStore()) {
1172 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1173 }
1174 }
1175
1176 // Loop over use and def operands.
1177 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1178 MachineOperand &Op = MI.getOperand(I);
1179 if (!Op.isReg())
1180 continue;
1181
1182 // If the instruction does not read tied source, skip the operand.
1183 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1184 continue;
1185
1186 RegInterval Interval =
1187 ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
1188
1189 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1190 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1191 if (IsVGPR) {
1192 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1193 // previous write and this write are the same type of VMEM
1194 // instruction, in which case they're guaranteed to write their
1195 // results in order anyway.
1196 if (Op.isUse() || !updateVMCntOnly(MI) ||
1197 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1198 getVmemType(MI))) {
1199 ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
1200 ScoreBrackets.clearVgprVmemTypes(RegNo);
1201 }
1202 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1203 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1204 }
1205 }
1206 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
1207 }
1208 }
1209 }
1210 }
1211
1212 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1213 // not, we need to ensure the subtarget is capable of backing off barrier
1214 // instructions in case there are any outstanding memory operations that may
1215 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1216 if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1217 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1218 Wait = Wait.combined(allZeroWaitcnt());
1219 }
1220
1221 // TODO: Remove this work-around, enable the assert for Bug 457939
1222 // after fixing the scheduler. Also, the Shader Compiler code is
1223 // independent of target.
1224 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1225 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1226 Wait.LgkmCnt = 0;
1227 }
1228 }
1229
1230 // Verify that the wait is actually needed.
1231 ScoreBrackets.simplifyWaitcnt(Wait);
1232
1233 if (ForceEmitZeroWaitcnts)
1234 Wait = allZeroWaitcnt();
1235
1236 if (ForceEmitWaitcnt[VM_CNT])
1237 Wait.VmCnt = 0;
1238 if (ForceEmitWaitcnt[EXP_CNT])
1239 Wait.ExpCnt = 0;
1240 if (ForceEmitWaitcnt[LGKM_CNT])
1241 Wait.LgkmCnt = 0;
1242 if (ForceEmitWaitcnt[VS_CNT])
1243 Wait.VsCnt = 0;
1244
1245 if (FlushVmCnt) {
1246 if (ScoreBrackets.hasPendingEvent(VM_CNT))
1247 Wait.VmCnt = 0;
1248 }
1249
1250 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1251 OldWaitcntInstr);
1252}
1253
1254// Add a waitcnt to flush the vmcnt counter at the end of the given block if
1255// needed.
1256bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
1257 WaitcntBrackets &ScoreBrackets,
1258 MachineInstr *OldWaitcntInstr) {
1259 AMDGPU::Waitcnt Wait;
1260
1261 if (!ScoreBrackets.hasPendingEvent(VM_CNT))
1262 return false;
1263
1264 Wait.VmCnt = 0;
1265
1266 return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
1267 OldWaitcntInstr);
1268}
1269
1270bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1271 MachineBasicBlock::instr_iterator It,
1272 MachineBasicBlock &Block,
1273 WaitcntBrackets &ScoreBrackets,
1274 MachineInstr *OldWaitcntInstr) {
1275 bool Modified = false;
1276 const DebugLoc &DL = Block.findDebugLoc(It);
1277
1278 if (OldWaitcntInstr)
1279 // Try to merge the required wait with preexisting waitcnt instructions.
1280 // Also erase redundant waitcnt.
1281 Modified =
1282 applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1283 else
1284 ScoreBrackets.applyWaitcnt(Wait);
1285
1286 // ExpCnt can be merged into VINTERP.
1287 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1288 SIInstrInfo::isVINTERP(*It)) {
1289 MachineOperand *WaitExp =
1290 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1291 if (Wait.ExpCnt < WaitExp->getImm()) {
1292 WaitExp->setImm(Wait.ExpCnt);
1293 Modified = true;
1294 }
1295 Wait.ExpCnt = ~0u;
1296
1297 LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1298 << "Update Instr: " << *It);
1299 }
1300
1301 // Build new waitcnt instructions unless no wait is needed or the old waitcnt
1302 // instruction was modified to handle the required wait.
1303 if (Wait.hasWaitExceptVsCnt()) {
1304 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1305 auto SWaitInst =
1306 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1307 TrackedWaitcntSet.insert(SWaitInst);
1308 Modified = true;
1309
1310 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1311 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1312 dbgs() << "New Instr: " << *SWaitInst << '\n');
1313 }
1314
1315 if (Wait.hasWaitVsCnt()) {
1316 assert(ST->hasVscnt());
1317
1318 auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1319 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1320 .addImm(Wait.VsCnt);
1321 TrackedWaitcntSet.insert(SWaitInst);
1322 Modified = true;
1323
1324 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1325 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1326 dbgs() << "New Instr: " << *SWaitInst << '\n');
1327 }
1328 return Modified;
1329}
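// For example (illustrative): on a subtarget with a separate vscnt counter, a
// full flush requested here is materialized as two instructions:
//
//   s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0x0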
1330
1331// This is a flat memory operation. Check to see if it has memory tokens other
1332// than LDS. Other address spaces supported by flat memory operations involve
1333// global memory.
1334bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1335 assert(TII->isFLAT(MI));
1336
1337 // All flat instructions use the VMEM counter.
1338 assert(TII->usesVM_CNT(MI));
1339
1340 // If there are no memory operands then conservatively assume the flat
1341 // operation may access VMEM.
1342 if (MI.memoperands_empty())
1343 return true;
1344
1345 // See if any memory operand specifies an address space that involves VMEM.
1346 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
1347 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1348 // (GDS) address space is not supported by flat operations. Therefore, simply
1349 // return true unless only the LDS address space is found.
1350 for (const MachineMemOperand *Memop : MI.memoperands()) {
1351 unsigned AS = Memop->getAddrSpace();
1353 if (AS != AMDGPUAS::LOCAL_ADDRESS)
1354 return true;
1355 }
1356
1357 return false;
1358}
1359
1360// This is a flat memory operation. Check to see if it has memory tokens for
1361// either LDS or FLAT.
1362bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1363 assert(TII->isFLAT(MI));
1364
1365 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
1366 if (!TII->usesLGKM_CNT(MI))
1367 return false;
1368
1369 // If in tgsplit mode then there can be no use of LDS.
1370 if (ST->isTgSplitEnabled())
1371 return false;
1372
1373 // If there are no memory operands then conservatively assume the flat
1374 // operation may access LDS.
1375 if (MI.memoperands_empty())
1376 return true;
1377
1378 // See if any memory operand specifies an address space that involves LDS.
1379 for (const MachineMemOperand *Memop : MI.memoperands()) {
1380 unsigned AS = Memop->getAddrSpace();
1381 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1382 return true;
1383 }
1384
1385 return false;
1386}
1387
1388void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1389 WaitcntBrackets *ScoreBrackets) {
1390 // Now look at the instruction opcode. If it is a memory access
1391 // instruction, update the upper-bound of the appropriate counter's
1392 // bracket and the destination operand scores.
1393 // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1394 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1395 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1396 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1397 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1398 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1399 } else {
1400 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1401 }
1402 } else if (TII->isFLAT(Inst)) {
1403 assert(Inst.mayLoadOrStore());
1404
1405 int FlatASCount = 0;
1406
1407 if (mayAccessVMEMThroughFlat(Inst)) {
1408 ++FlatASCount;
1409 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
1410 Inst);
1411 }
1412
1413 if (mayAccessLDSThroughFlat(Inst)) {
1414 ++FlatASCount;
1415 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1416 }
1417
1418 // A Flat memory operation must access at least one address space.
1419 assert(FlatASCount);
1420
1421 // This is a flat memory operation that accesses both VMEM and LDS, so note
1422 // it: it will require that both VM and LGKM be flushed to zero if it is
1423 // pending when a VM or LGKM dependency occurs.
1424 if (FlatASCount > 1)
1425 ScoreBrackets->setPendingFlat();
1426 } else if (SIInstrInfo::isVMEM(Inst) &&
1427 !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
1428 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
1429 Inst);
1430
1431 if (ST->vmemWriteNeedsExpWaitcnt() &&
1432 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
1433 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1434 }
1435 } else if (TII->isSMRD(Inst)) {
1436 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1437 } else if (Inst.isCall()) {
1438 if (callWaitsOnFunctionReturn(Inst)) {
1439 // Act as a wait on everything
1440 ScoreBrackets->applyWaitcnt(allZeroWaitcnt());
1441 } else {
1442 // May need to wait for anything.
1443 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1444 }
1445 } else if (SIInstrInfo::isLDSDIR(Inst)) {
1446 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
1447 } else if (TII->isVINTERP(Inst)) {
1448 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
1449 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
1450 } else if (SIInstrInfo::isEXP(Inst)) {
1451 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1452 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
1453 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1454 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
1455 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1456 else
1457 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1458 } else {
1459 switch (Inst.getOpcode()) {
1460 case AMDGPU::S_SENDMSG:
1461 case AMDGPU::S_SENDMSG_RTN_B32:
1462 case AMDGPU::S_SENDMSG_RTN_B64:
1463 case AMDGPU::S_SENDMSGHALT:
1464 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1465 break;
1466 case AMDGPU::S_MEMTIME:
1467 case AMDGPU::S_MEMREALTIME:
1468 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1469 break;
1470 }
1471 }
1472}
1473
1474bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
1475 unsigned OtherScore) {
1476 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1477 unsigned OtherShifted =
1478 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1479 Score = std::max(MyShifted, OtherShifted);
1480 return OtherShifted > MyShifted;
1481}
1482
1483/// Merge the pending events and associated score brackets of \p Other into
1484/// this bracket's status.
1485///
1486/// Returns whether the merge resulted in a change that requires tighter waits
1487/// (i.e. the merged brackets strictly dominate the original brackets).
1488bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1489 bool StrictDom = false;
1490
1491 VgprUB = std::max(VgprUB, Other.VgprUB);
1492 SgprUB = std::max(SgprUB, Other.SgprUB);
1493
1494 for (auto T : inst_counter_types()) {
1495 // Merge event flags for this counter
1496 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
1497 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1498 if (OtherEvents & ~OldEvents)
1499 StrictDom = true;
1500 PendingEvents |= OtherEvents;
1501
1502 // Merge scores for this counter
1503 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
1504 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1505 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
1506 if (NewUB < ScoreLBs[T])
1507 report_fatal_error("waitcnt score overflow");
1508
1509 MergeInfo M;
1510 M.OldLB = ScoreLBs[T];
1511 M.OtherLB = Other.ScoreLBs[T];
1512 M.MyShift = NewUB - ScoreUBs[T];
1513 M.OtherShift = NewUB - Other.ScoreUBs[T];
1514
1515 ScoreUBs[T] = NewUB;
1516
1517 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1518
1519 for (int J = 0; J <= VgprUB; J++)
1520 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1521
1522 if (T == LGKM_CNT) {
1523 for (int J = 0; J <= SgprUB; J++)
1524 StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1525 }
1526 }
1527
1528 for (int J = 0; J <= VgprUB; J++) {
1529 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
1530 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
1531 VgprVmemTypes[J] = NewVmemTypes;
1532 }
1533
1534 return StrictDom;
1535}
1536
1537static bool isWaitInstr(MachineInstr &Inst) {
1538 return Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1539 (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1540 Inst.getOperand(0).isReg() &&
1541 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1542}
1543
1544// Generate s_waitcnt instructions where needed.
1545bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1546 MachineBasicBlock &Block,
1547 WaitcntBrackets &ScoreBrackets) {
1548 bool Modified = false;
1549
1550 LLVM_DEBUG({
1551 dbgs() << "*** Block" << Block.getNumber() << " ***";
1552 ScoreBrackets.dump();
1553 });
1554
1555 // Track the correctness of vccz through this basic block. There are two
1556 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
1557 // ST->partialVCCWritesUpdateVCCZ().
1558 bool VCCZCorrect = true;
1559 if (ST->hasReadVCCZBug()) {
1560 // vccz could be incorrect at a basic block boundary if a predecessor wrote
1561 // to vcc and then issued an smem load.
1562 VCCZCorrect = false;
1563 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
1564 // vccz could be incorrect at a basic block boundary if a predecessor wrote
1565 // to vcc_lo or vcc_hi.
1566 VCCZCorrect = false;
1567 }
1568
1569 // Walk over the instructions.
1570 MachineInstr *OldWaitcntInstr = nullptr;
1571
1572 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
1573 E = Block.instr_end();
1574 Iter != E;) {
1575 MachineInstr &Inst = *Iter;
1576
1577 // Track pre-existing waitcnts that were added in earlier iterations or by
1578 // the memory legalizer.
1579 if (isWaitInstr(Inst)) {
1580 if (!OldWaitcntInstr)
1581 OldWaitcntInstr = &Inst;
1582 ++Iter;
1583 continue;
1584 }
1585
1586 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
1587 isPreheaderToFlush(Block, ScoreBrackets);
1588
1589 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
1590 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
1591 FlushVmCnt);
1592 OldWaitcntInstr = nullptr;
1593
1594 // Restore vccz if it's not known to be correct already.
1595 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
1596
1597 // Don't examine operands unless we need to track vccz correctness.
1598 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
1599 if (Inst.definesRegister(AMDGPU::VCC_LO) ||
1600 Inst.definesRegister(AMDGPU::VCC_HI)) {
1601 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
1602 if (!ST->partialVCCWritesUpdateVCCZ())
1603 VCCZCorrect = false;
1604 } else if (Inst.definesRegister(AMDGPU::VCC)) {
1605 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
1606 // vccz bit, so when we detect that an instruction may read from a
1607 // corrupt vccz bit, we need to:
1608 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
1609 // operations to complete.
1610 // 2. Restore the correct value of vccz by writing the current value
1611 // of vcc back to vcc.
1612 if (ST->hasReadVCCZBug() &&
1613 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1614 // Writes to vcc while there's an outstanding smem read may get
1615 // clobbered as soon as any read completes.
1616 VCCZCorrect = false;
1617 } else {
1618 // Writes to vcc will fix any incorrect value in vccz.
1619 VCCZCorrect = true;
1620 }
1621 }
1622 }
1623
1624 if (TII->isSMRD(Inst)) {
1625 for (const MachineMemOperand *Memop : Inst.memoperands()) {
1626 // No need to handle invariant loads when avoiding WAR conflicts, as
1627 // there cannot be a vector store to the same memory location.
1628 if (!Memop->isInvariant()) {
1629 const Value *Ptr = Memop->getValue();
1630 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
1631 }
1632 }
1633 if (ST->hasReadVCCZBug()) {
1634 // This smem read could complete and clobber vccz at any time.
1635 VCCZCorrect = false;
1636 }
1637 }
1638
1639 updateEventWaitcntAfter(Inst, &ScoreBrackets);
1640
1641#if 0 // TODO: implement resource type check controlled by options with ub = LB.
1642 // If this instruction generates a S_SETVSKIP because it is an
1643 // indexed resource, and we are on Tahiti, then it will also force
1644 // an S_WAITCNT vmcnt(0)
1645 if (RequireCheckResourceType(Inst, context)) {
1646 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1647 ScoreBrackets->setScoreLB(VM_CNT,
1648 ScoreBrackets->getScoreUB(VM_CNT));
1649 }
1650#endif
1651
1652 LLVM_DEBUG({
1653 Inst.print(dbgs());
1654 ScoreBrackets.dump();
1655 });
1656
1657 // TODO: Remove this workaround after fixing the scheduler, and re-enable the
1658 // assert above.
1659 if (RestoreVCCZ) {
1660 // Restore the vccz bit. Any time a value is written to vcc, the vcc
1661 // bit is updated, so we can restore the bit by reading the value of
1662 // vcc and then writing it back to the register.
1663 BuildMI(Block, Inst, Inst.getDebugLoc(),
1664 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1665 TRI->getVCC())
1666 .addReg(TRI->getVCC());
1667 VCCZCorrect = true;
1668 Modified = true;
1669 }
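// For example, on a wave64 target the restore above emits
//   s_mov_b64 vcc, vcc
// (s_mov_b32 vcc_lo, vcc_lo on wave32), which recomputes vccz from the
// current value of vcc.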
1670
1671 ++Iter;
1672 }
1673
1674 if (Block.getFirstTerminator() == Block.end() &&
1675 isPreheaderToFlush(Block, ScoreBrackets))
1676 Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
1677
1678 return Modified;
1679}
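// A rough illustration of the effect of the walk above on a block with one
// pending VMEM load (the exact count depends on the score brackets at the
// use):
//
//   global_load_dword v1, v[2:3], off
//   ...                                ; independent work
//   s_waitcnt vmcnt(0)                 ; inserted before the first use of v1
//   v_add_u32_e32 v4, v1, v5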
1680
1681// Return true if the given machine basic block is a preheader of a loop in
1682// which we want to flush the vmcnt counter, and false otherwise.
1683bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
1684 WaitcntBrackets &ScoreBrackets) {
1685 if (PreheadersToFlush.count(&MBB))
1686 return PreheadersToFlush[&MBB];
1687
1688 auto UpdateCache = [&](bool val) {
1689 PreheadersToFlush[&MBB] = val;
1690 return val;
1691 };
1692
1693 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
1694 if (!Succ)
1695 return UpdateCache(false);
1696
1697 MachineLoop *Loop = MLI->getLoopFor(Succ);
1698 if (!Loop)
1699 return UpdateCache(false);
1700
1701 if (Loop->getLoopPreheader() == &MBB && shouldFlushVmCnt(Loop, ScoreBrackets))
1702 return UpdateCache(true);
1703
1704 return UpdateCache(false);
1705}
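// Both positive and negative answers are cached, so each candidate preheader
// is analyzed at most once.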
1706
1707bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
1708 return SIInstrInfo::isVMEM(MI) ||
1709 (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
1710}
1711
1712// Return true if it is better to flush the vmcnt counter in the preheader of
1713// the given loop. We currently decide to flush in two situations:
1714 // 1. The loop contains vmem store(s) but no vmem loads, and at least one use
1715 // of a vgpr containing a value that is loaded outside of the loop (only on
1716 // targets with no vscnt counter).
1717// 2. The loop contains vmem load(s), but the loaded values are not used in the
1718// loop, and at least one use of a vgpr containing a value that is loaded
1719// outside of the loop.
1720bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
1721 WaitcntBrackets &Brackets) {
1722 bool HasVMemLoad = false;
1723 bool HasVMemStore = false;
1724 bool UsesVgprLoadedOutside = false;
1725 DenseSet<Register> VgprUse;
1726 DenseSet<Register> VgprDef;
1727
1728 for (MachineBasicBlock *MBB : ML->blocks()) {
1729 for (MachineInstr &MI : *MBB) {
1730 if (isVMEMOrFlatVMEM(MI)) {
1731 if (MI.mayLoad())
1732 HasVMemLoad = true;
1733 if (MI.mayStore())
1734 HasVMemStore = true;
1735 }
1736 for (unsigned I = 0; I < MI.getNumOperands(); I++) {
1737 MachineOperand &Op = MI.getOperand(I);
1738 if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
1739 continue;
1740 RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
1741 // Vgpr use
1742 if (Op.isUse()) {
1743 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1744 // If we find a register that is loaded inside the loop, 1. and 2.
1745 // are invalidated and we can exit.
1746 if (VgprDef.contains(RegNo))
1747 return false;
1748 VgprUse.insert(RegNo);
1749 // If at least one of Op's registers is in the score brackets, the
1750 // value is likely loaded outside of the loop.
1751 if (Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)) {
1752 UsesVgprLoadedOutside = true;
1753 break;
1754 }
1755 }
1756 }
1757 // VMem load vgpr def
1758 else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
1759 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1760 // If we find a register that is loaded inside the loop, 1. and 2.
1761 // are invalidated and we can exit.
1762 if (VgprUse.contains(RegNo))
1763 return false;
1764 VgprDef.insert(RegNo);
1765 }
1766 }
1767 }
1768 }
1769 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
1770 return true;
1771 return HasVMemLoad && UsesVgprLoadedOutside;
1772}
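// Rationale (roughly): in both cases the in-loop vmem operations would keep
// the vmcnt counter from being proven small at the in-loop use of the value
// loaded outside of the loop, forcing a conservative wait on every iteration;
// flushing vmcnt once in the preheader avoids that.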
1773
1774bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1775 ST = &MF.getSubtarget<GCNSubtarget>();
1776 TII = ST->getInstrInfo();
1777 TRI = &TII->getRegisterInfo();
1778 MRI = &MF.getRegInfo();
1779 IV = AMDGPU::getIsaVersion(ST->getCPU());
1780 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1781 MLI = &getAnalysis<MachineLoopInfo>();
1782 PDT = &getAnalysis<MachinePostDominatorTree>();
1783
1784 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1785 for (auto T : inst_counter_types())
1786 ForceEmitWaitcnt[T] = false;
1787
1788 HardwareLimits Limits = {};
1789 Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1790 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1791 Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1792 Limits.VscntMax = ST->hasVscnt() ? 63 : 0;
1793
1794 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
1795 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
1796 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1797 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1798
1799 RegisterEncoding Encoding = {};
1800 Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1801 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
1802 Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1803 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
1804
1805 TrackedWaitcntSet.clear();
1806 BlockInfos.clear();
1807 bool Modified = false;
1808
1809 if (!MFI->isEntryFunction()) {
1810 // Wait for any outstanding memory operations that the input registers may
1811 // depend on. We can't track them and it's better to do the wait after the
1812 // costly call sequence.
1813
1814 // TODO: Could insert earlier and schedule more liberally with operations
1815 // that only use caller preserved registers.
1816 MachineBasicBlock &EntryBB = MF.front();
1817 MachineBasicBlock::iterator I = EntryBB.begin();
1818 for (MachineBasicBlock::iterator E = EntryBB.end();
1819 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
1820 ;
1821 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
1822 if (ST->hasVscnt())
1823 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
1824 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1825 .addImm(0);
1826
1827 Modified = true;
1828 }
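// For a non-entry function this places, after any leading PHIs and meta
// instructions, roughly:
//   s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0x0          ; only on targets with a VS_CNT counter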
1829
1830 // Keep iterating over the blocks in reverse post order, inserting and
1831 // updating s_waitcnt where needed, until a fixed point is reached.
1832 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
1833 BlockInfos.insert({MBB, BlockInfo(MBB)});
1834
1835 std::unique_ptr<WaitcntBrackets> Brackets;
1836 bool Repeat;
1837 do {
1838 Repeat = false;
1839
1840 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
1841 ++BII) {
1842 BlockInfo &BI = BII->second;
1843 if (!BI.Dirty)
1844 continue;
1845
1846 if (BI.Incoming) {
1847 if (!Brackets)
1848 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
1849 else
1850 *Brackets = *BI.Incoming;
1851 } else {
1852 if (!Brackets)
1853 Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
1854 else
1855 *Brackets = WaitcntBrackets(ST, Limits, Encoding);
1856 }
1857
1858 Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1859 BI.Dirty = false;
1860
1861 if (Brackets->hasPendingEvent()) {
1862 BlockInfo *MoveBracketsToSucc = nullptr;
1863 for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1864 auto SuccBII = BlockInfos.find(Succ);
1865 BlockInfo &SuccBI = SuccBII->second;
1866 if (!SuccBI.Incoming) {
1867 SuccBI.Dirty = true;
1868 if (SuccBII <= BII)
1869 Repeat = true;
1870 if (!MoveBracketsToSucc) {
1871 MoveBracketsToSucc = &SuccBI;
1872 } else {
1873 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
1874 }
1875 } else if (SuccBI.Incoming->merge(*Brackets)) {
1876 SuccBI.Dirty = true;
1877 if (SuccBII <= BII)
1878 Repeat = true;
1879 }
1880 }
1881 if (MoveBracketsToSucc)
1882 MoveBracketsToSucc->Incoming = std::move(Brackets);
1883 }
1884 }
1885 } while (Repeat);
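// A successor that appears at or before the current block in the traversal
// order (i.e. one reached through a back edge) is marked dirty and forces
// another sweep, so the do/while loop runs until the incoming brackets of
// every block stop changing.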
1886
1887 if (ST->hasScalarStores()) {
1888 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1889 bool HaveScalarStores = false;
1890
1891 for (MachineBasicBlock &MBB : MF) {
1892 for (MachineInstr &MI : MBB) {
1893 if (!HaveScalarStores && TII->isScalarStore(MI))
1894 HaveScalarStores = true;
1895
1896 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1897 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1898 EndPgmBlocks.push_back(&MBB);
1899 }
1900 }
1901
1902 if (HaveScalarStores) {
1903 // If scalar writes are used, the cache must be flushed or else data written
1904 // by the next wave to reuse the same scratch memory can be clobbered.
1905 //
1906 // Insert s_dcache_wb at wave termination points if there were any scalar
1907 // stores, and only if the cache hasn't already been flushed. This could
1908 // be improved by looking across blocks for flushes in postdominating
1909 // blocks from the stores but an explicitly requested flush is probably
1910 // very rare.
1911 for (MachineBasicBlock *MBB : EndPgmBlocks) {
1912 bool SeenDCacheWB = false;
1913
1914 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
1915 I != E; ++I) {
1916 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1917 SeenDCacheWB = true;
1918 else if (TII->isScalarStore(*I))
1919 SeenDCacheWB = false;
1920
1921 // FIXME: It would be better to insert this before an existing waitcnt, if any.
1922 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1923 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1924 !SeenDCacheWB) {
1925 Modified = true;
1926 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1927 }
1928 }
1929 }
1930 }
1931 }
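// With scalar stores present, program termination therefore ends up looking
// roughly like:
//   s_dcache_wb
//   s_endpgm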
1932
1933 return Modified;
1934}
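// The pass is constructed via createSIInsertWaitcntsPass() (declared in
// AMDGPU.h) and is registered under SIInsertWaitcntsID; it runs late in the
// GCN codegen pipeline, after register allocation, so the waits it inserts
// match the final machine code.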