SIInsertWaitcnts.cpp
1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
15 ///
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
23 //
24 //===----------------------------------------------------------------------===//
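// Illustrative sketch (not part of the original file; opcodes and registers
// are schematic): for a sequence like
//   GLOBAL_LOAD_DWORD v1, v[2:3], off    ; result arrives asynchronously
//   V_ADD_U32 v4, v1, v5                 ; reads the load result
// the pass inserts "S_WAITCNT vmcnt(0)" before the V_ADD_U32 so that v1 holds
// the loaded value before it is consumed.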
25 
26 #include "AMDGPU.h"
27 #include "GCNSubtarget.h"
28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/MapVector.h"
32 #include "llvm/ADT/PostOrderIterator.h"
33 #include "llvm/ADT/Sequence.h"
34 #include "llvm/CodeGen/MachineLoopInfo.h"
35 #include "llvm/CodeGen/MachinePostDominators.h"
36 #include "llvm/InitializePasses.h"
37 #include "llvm/Support/DebugCounter.h"
38 #include "llvm/Support/TargetParser.h"
39 using namespace llvm;
40 
41 #define DEBUG_TYPE "si-insert-waitcnts"
42 
43 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
44  "Force emit s_waitcnt expcnt(0) instrs");
45 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
46  "Force emit s_waitcnt lgkmcnt(0) instrs");
47 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
48  "Force emit s_waitcnt vmcnt(0) instrs");
49 
50 static cl::opt<bool> ForceEmitZeroFlag(
51  "amdgpu-waitcnt-forcezero",
52  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
53  cl::init(false), cl::Hidden);
54 
55 namespace {
56 // Class of object that encapsulates latest instruction counter score
57 // associated with the operand. Used for determining whether
58 // s_waitcnt instruction needs to be emitted.
59 
60 #define CNT_MASK(t) (1u << (t))
61 
62 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
63 } // namespace
64 
65 namespace llvm {
66 template <> struct enum_iteration_traits<InstCounterType> {
67  static constexpr bool is_iterable = true;
68 };
69 } // namespace llvm
70 
71 namespace {
72 auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); }
73 
74 using RegInterval = std::pair<int, int>;
75 
76 struct HardwareLimits {
77  unsigned VmcntMax;
78  unsigned ExpcntMax;
79  unsigned LgkmcntMax;
80  unsigned VscntMax;
81 };
82 
83 struct RegisterEncoding {
84  unsigned VGPR0;
85  unsigned VGPRL;
86  unsigned SGPR0;
87  unsigned SGPRL;
88 };
89 
90 enum WaitEventType {
91  VMEM_ACCESS, // vector-memory read & write
92  VMEM_READ_ACCESS, // vector-memory read
93  VMEM_WRITE_ACCESS, // vector-memory write
94  LDS_ACCESS, // lds read & write
95  GDS_ACCESS, // gds read & write
96  SQ_MESSAGE, // send message
97  SMEM_ACCESS, // scalar-memory read & write
98  EXP_GPR_LOCK, // export holding on its data src
99  GDS_GPR_LOCK, // GDS holding on its data and addr src
100  EXP_POS_ACCESS, // write to export position
101  EXP_PARAM_ACCESS, // write to export parameter
102  VMW_GPR_LOCK, // vector-memory write holding on its data src
103  EXP_LDS_ACCESS, // read by ldsdir counting as export
104  NUM_WAIT_EVENTS,
105 };
106 
107 static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
108  (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
109  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
110  (1 << SQ_MESSAGE),
111  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
112  (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
113  (1 << VMEM_WRITE_ACCESS)};
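// For example, SMEM_ACCESS is set only in the LGKM_CNT mask above, so
// eventCounter(SMEM_ACCESS) (defined in WaitcntBrackets below) resolves to
// LGKM_CNT, while VMEM_WRITE_ACCESS resolves to VS_CNT.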
114 
115 // The mapping is:
116 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
117 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
118 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
119 // We reserve a fixed number of VGPR slots in the scoring tables for
120 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
121 enum RegisterMapping {
122  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
123  AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
124  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
125  NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
126  EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
127  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
128 };
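// Illustrative example of the mapping (register names are schematic):
// ArchVGPR v7 scores at slot 7, AGPR a7 at 7 + AGPR_OFFSET = 263, the LDS
// tracking token at SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS = 512, and SGPR s5 at
// NUM_ALL_VGPRS + 5 = 518 (see getRegInterval below for how register
// encodings are translated to slots).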
129 
130 // Enumerate different types of result-returning VMEM operations. Although
131 // s_waitcnt orders them all with a single vmcnt counter, in the absence of
132 // s_waitcnt only instructions of the same VmemType are guaranteed to write
133 // their results in order -- so there is no need to insert an s_waitcnt between
134 // two instructions of the same type that write the same vgpr.
135 enum VmemType {
136  // BUF instructions and MIMG instructions without a sampler.
137  VMEM_NOSAMPLER,
138  // MIMG instructions with a sampler.
139  VMEM_SAMPLER,
140  // BVH instructions
141  VMEM_BVH
142 };
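// Illustrative example: two buffer loads (both VMEM_NOSAMPLER) that write the
// same vgpr back-to-back need no intervening s_waitcnt, but an image-sample
// (VMEM_SAMPLER) followed by a buffer load to the same vgpr does, because the
// two types may return their results out of order with respect to each other.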
143 
144 static bool updateVMCntOnly(const MachineInstr &Inst) {
145  return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
146  SIInstrInfo::isFLATScratch(Inst);
147 }
148 
149 VmemType getVmemType(const MachineInstr &Inst) {
150  assert(updateVMCntOnly(Inst));
151  if (!SIInstrInfo::isMIMG(Inst))
152  return VMEM_NOSAMPLER;
153  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
154  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
155  AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
156  return BaseInfo->BVH ? VMEM_BVH
157  : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
158 }
159 
160 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
161  switch (T) {
162  case VM_CNT:
163  Wait.VmCnt = std::min(Wait.VmCnt, Count);
164  break;
165  case EXP_CNT:
166  Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
167  break;
168  case LGKM_CNT:
169  Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
170  break;
171  case VS_CNT:
172  Wait.VsCnt = std::min(Wait.VsCnt, Count);
173  break;
174  default:
175  llvm_unreachable("bad InstCounterType");
176  }
177 }
178 
179 // This object maintains the current score brackets of each wait counter, and
180 // a per-register scoreboard for each wait counter.
181 //
182 // We also maintain the latest score for every event type that can change the
183 // waitcnt in order to know if there are multiple types of events within
184 // the brackets. When multiple types of event happen in the bracket,
185 // wait count may get decreased out of order, therefore we need to put in
186 // "s_waitcnt 0" before use.
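// Illustrative example: if the VM_CNT bracket is (LB=3, UB=7], four VMEM
// operations are outstanding; a register whose VM_CNT score is 6 was written
// by the second-most-recent of them, so waiting for vmcnt(1) (UB - score) is
// enough to make that register safe to use.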
187 class WaitcntBrackets {
188 public:
189  WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
190  RegisterEncoding Encoding)
191  : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}
192 
193  unsigned getWaitCountMax(InstCounterType T) const {
194  switch (T) {
195  case VM_CNT:
196  return Limits.VmcntMax;
197  case LGKM_CNT:
198  return Limits.LgkmcntMax;
199  case EXP_CNT:
200  return Limits.ExpcntMax;
201  case VS_CNT:
202  return Limits.VscntMax;
203  default:
204  break;
205  }
206  return 0;
207  }
208 
209  unsigned getScoreLB(InstCounterType T) const {
210  assert(T < NUM_INST_CNTS);
211  return ScoreLBs[T];
212  }
213 
214  unsigned getScoreUB(InstCounterType T) const {
215  assert(T < NUM_INST_CNTS);
216  return ScoreUBs[T];
217  }
218 
219  unsigned getScoreRange(InstCounterType T) const {
220  return getScoreUB(T) - getScoreLB(T);
221  }
222 
223  // Mapping from event to counter.
224  InstCounterType eventCounter(WaitEventType E) const {
225  for (auto T : inst_counter_types()) {
226  if (WaitEventMaskForInst[T] & (1 << E))
227  return T;
228  }
229  llvm_unreachable("event type has no associated counter");
230  }
231 
232  unsigned getRegScore(int GprNo, InstCounterType T) const {
233  if (GprNo < NUM_ALL_VGPRS) {
234  return VgprScores[T][GprNo];
235  }
236  assert(T == LGKM_CNT);
237  return SgprScores[GprNo - NUM_ALL_VGPRS];
238  }
239 
240  bool merge(const WaitcntBrackets &Other);
241 
242  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
243  const MachineRegisterInfo *MRI,
244  const SIRegisterInfo *TRI, unsigned OpNo) const;
245 
246  bool counterOutOfOrder(InstCounterType T) const;
247  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
248  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
249  void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
250  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
251  void applyWaitcnt(InstCounterType T, unsigned Count);
252  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
253  const MachineRegisterInfo *MRI, WaitEventType E,
254  MachineInstr &MI);
255 
256  unsigned hasPendingEvent() const { return PendingEvents; }
257  unsigned hasPendingEvent(WaitEventType E) const {
258  return PendingEvents & (1 << E);
259  }
260  unsigned hasPendingEvent(InstCounterType T) const {
261  unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
262  assert((HasPending != 0) == (getScoreRange(T) != 0));
263  return HasPending;
264  }
265 
266  bool hasMixedPendingEvents(InstCounterType T) const {
267  unsigned Events = hasPendingEvent(T);
268  // Return true if more than one bit is set in Events.
269  return Events & (Events - 1);
270  }
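  // Illustrative example: Events == 0b0110 (two event types pending) gives
  // Events & (Events - 1) == 0b0100 != 0, whereas a single pending type such
  // as 0b0100 gives 0.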
271 
272  bool hasPendingFlat() const {
273  return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
274  LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
275  (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
276  LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
277  }
278 
279  void setPendingFlat() {
280  LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
281  LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
282  }
283 
284  // Return true if there might be pending writes to the specified vgpr by VMEM
285  // instructions with types different from V.
286  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
287  assert(GprNo < NUM_ALL_VGPRS);
288  return VgprVmemTypes[GprNo] & ~(1 << V);
289  }
290 
291  void clearVgprVmemTypes(int GprNo) {
292  assert(GprNo < NUM_ALL_VGPRS);
293  VgprVmemTypes[GprNo] = 0;
294  }
295 
296  void print(raw_ostream &);
297  void dump() { print(dbgs()); }
298 
299 private:
300  struct MergeInfo {
301  unsigned OldLB;
302  unsigned OtherLB;
303  unsigned MyShift;
304  unsigned OtherShift;
305  };
306  static bool mergeScore(const MergeInfo &M, unsigned &Score,
307  unsigned OtherScore);
308 
309  void setScoreLB(InstCounterType T, unsigned Val) {
310  assert(T < NUM_INST_CNTS);
311  ScoreLBs[T] = Val;
312  }
313 
314  void setScoreUB(InstCounterType T, unsigned Val) {
315  assert(T < NUM_INST_CNTS);
316  ScoreUBs[T] = Val;
317 
318  if (T != EXP_CNT)
319  return;
320 
321  if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
322  ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
323  }
324 
325  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
326  if (GprNo < NUM_ALL_VGPRS) {
327  VgprUB = std::max(VgprUB, GprNo);
328  VgprScores[T][GprNo] = Val;
329  } else {
330  assert(T == LGKM_CNT);
331  SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
332  SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
333  }
334  }
335 
336  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
337  const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
338  unsigned OpNo, unsigned Val);
339 
340  const GCNSubtarget *ST = nullptr;
341  HardwareLimits Limits = {};
342  RegisterEncoding Encoding = {};
343  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
344  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
345  unsigned PendingEvents = 0;
346  // Remember the last flat memory operation.
347  unsigned LastFlat[NUM_INST_CNTS] = {0};
348  // wait_cnt scores for every vgpr.
349  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
350  int VgprUB = -1;
351  int SgprUB = -1;
352  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
353  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
354  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
355  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
356  // write to each vgpr.
357  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
358 };
359 
360 class SIInsertWaitcnts : public MachineFunctionPass {
361 private:
362  const GCNSubtarget *ST = nullptr;
363  const SIInstrInfo *TII = nullptr;
364  const SIRegisterInfo *TRI = nullptr;
365  const MachineRegisterInfo *MRI = nullptr;
366  AMDGPU::IsaVersion IV;
367 
368  DenseSet<MachineInstr *> TrackedWaitcntSet;
369  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
370  DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
371  MachineLoopInfo *MLI;
372  MachinePostDominatorTree *PDT;
373 
374  struct BlockInfo {
375  MachineBasicBlock *MBB;
376  std::unique_ptr<WaitcntBrackets> Incoming;
377  bool Dirty = true;
378 
379  explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
380  };
381 
382  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
383 
384  // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
385  // because of amdgpu-waitcnt-forcezero flag
386  bool ForceEmitZeroWaitcnts;
387  bool ForceEmitWaitcnt[NUM_INST_CNTS];
388 
389 public:
390  static char ID;
391 
392  SIInsertWaitcnts() : MachineFunctionPass(ID) {
393  (void)ForceExpCounter;
394  (void)ForceLgkmCounter;
395  (void)ForceVMCounter;
396  }
397 
398  bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
399  bool isPreheaderToFlush(MachineBasicBlock &MBB,
400  WaitcntBrackets &ScoreBrackets);
401  bool runOnMachineFunction(MachineFunction &MF) override;
402 
403  StringRef getPassName() const override {
404  return "SI insert wait instructions";
405  }
406 
407  void getAnalysisUsage(AnalysisUsage &AU) const override {
408  AU.setPreservesCFG();
409  AU.addRequired<MachineLoopInfo>();
410  AU.addRequired<MachinePostDominatorTree>();
411  MachineFunctionPass::getAnalysisUsage(AU);
412  }
413 
414  bool isForceEmitWaitcnt() const {
415  for (auto T : inst_counter_types())
416  if (ForceEmitWaitcnt[T])
417  return true;
418  return false;
419  }
420 
421  AMDGPU::Waitcnt allZeroWaitcnt() const {
422  return AMDGPU::Waitcnt::allZero(ST->hasVscnt());
423  }
424 
425  void setForceEmitWaitcnt() {
426 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
427 // For debug builds, get the debug counter info and adjust if need be
428 #ifndef NDEBUG
429  if (DebugCounter::isCounterSet(ForceExpCounter) &&
430  DebugCounter::shouldExecute(ForceExpCounter)) {
431  ForceEmitWaitcnt[EXP_CNT] = true;
432  } else {
433  ForceEmitWaitcnt[EXP_CNT] = false;
434  }
435 
436  if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
437  DebugCounter::shouldExecute(ForceLgkmCounter)) {
438  ForceEmitWaitcnt[LGKM_CNT] = true;
439  } else {
440  ForceEmitWaitcnt[LGKM_CNT] = false;
441  }
442 
443  if (DebugCounter::isCounterSet(ForceVMCounter) &&
444  DebugCounter::shouldExecute(ForceVMCounter)) {
445  ForceEmitWaitcnt[VM_CNT] = true;
446  } else {
447  ForceEmitWaitcnt[VM_CNT] = false;
448  }
449 #endif // NDEBUG
450  }
451 
452  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
453  // FLAT instruction.
454  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
455  assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
456  if (!ST->hasVscnt())
457  return VMEM_ACCESS;
458  if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst))
459  return VMEM_WRITE_ACCESS;
460  return VMEM_READ_ACCESS;
461  }
462 
463  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
464  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
465  bool generateWaitcntInstBefore(MachineInstr &MI,
466  WaitcntBrackets &ScoreBrackets,
467  MachineInstr *OldWaitcntInstr,
468  bool FlushVmCnt);
469  bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
470  WaitcntBrackets &ScoreBrackets,
471  MachineInstr *OldWaitcntInstr);
472  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
473  MachineBasicBlock::instr_iterator It,
474  MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
475  MachineInstr *OldWaitcntInstr);
476  void updateEventWaitcntAfter(MachineInstr &Inst,
477  WaitcntBrackets *ScoreBrackets);
478  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
479  WaitcntBrackets &ScoreBrackets);
480  bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
481  MachineInstr &OldWaitcntInstr,
482  AMDGPU::Waitcnt &Wait,
483  MachineBasicBlock::instr_iterator It) const;
484 };
485 
486 } // end anonymous namespace
487 
488 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
489  const SIInstrInfo *TII,
490  const MachineRegisterInfo *MRI,
491  const SIRegisterInfo *TRI,
492  unsigned OpNo) const {
493  const MachineOperand &Op = MI->getOperand(OpNo);
494  if (!TRI->isInAllocatableClass(Op.getReg()))
495  return {-1, -1};
496 
497  // A use via a PW operand does not need a waitcnt.
498  // A partial write is not a WAW.
499  assert(!Op.getSubReg() || !Op.isUndef());
500 
501  RegInterval Result;
502 
503  unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
504 
505  if (TRI->isVectorRegister(*MRI, Op.getReg())) {
506  assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
507  Result.first = Reg - Encoding.VGPR0;
508  if (TRI->isAGPR(*MRI, Op.getReg()))
509  Result.first += AGPR_OFFSET;
510  assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
511  } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
512  assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
513  Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
514  assert(Result.first >= NUM_ALL_VGPRS &&
515  Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
516  }
517  // TODO: Handle TTMP
518  // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
519  else
520  return {-1, -1};
521 
522  const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
523  unsigned Size = TRI->getRegSizeInBits(*RC);
524  Result.second = Result.first + ((Size + 16) / 32);
525 
526  return Result;
527 }
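// Illustrative example: for a 64-bit operand in VGPR pair v[4:5], Size is 64,
// so the half-open interval is [4, 4 + (64 + 16) / 32) = [4, 6); a 16-bit
// operand still occupies one slot because (16 + 16) / 32 == 1.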
528 
529 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
530  const SIInstrInfo *TII,
531  const SIRegisterInfo *TRI,
532  const MachineRegisterInfo *MRI, unsigned OpNo,
533  unsigned Val) {
534  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
535  assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
536  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
537  setRegScore(RegNo, EXP_CNT, Val);
538  }
539 }
540 
541 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS written
542 // can be accessed. A load from LDS to VMEM does not need a wait.
543 static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
544  return SIInstrInfo::isVALU(MI) &&
545  (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) &&
546  MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
547 }
548 
549 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
550  const SIRegisterInfo *TRI,
551  const MachineRegisterInfo *MRI,
552  WaitEventType E, MachineInstr &Inst) {
553  InstCounterType T = eventCounter(E);
554  unsigned CurrScore = getScoreUB(T) + 1;
555  if (CurrScore == 0)
556  report_fatal_error("InsertWaitcnt score wraparound");
557  // PendingEvents and ScoreUB need to be updated regardless of whether this event
558  // changes the score of a register or not.
559  // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
560  PendingEvents |= 1 << E;
561  setScoreUB(T, CurrScore);
562 
563  if (T == EXP_CNT) {
564  // Put score on the source vgprs. If this is a store, just use those
565  // specific register(s).
566  if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
567  int AddrOpIdx =
568  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
569  // All GDS operations must protect their address register (same as
570  // export.)
571  if (AddrOpIdx != -1) {
572  setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
573  }
574 
575  if (Inst.mayStore()) {
576  if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
577  setExpScore(
578  &Inst, TII, TRI, MRI,
579  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
580  CurrScore);
581  }
582  if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
583  setExpScore(&Inst, TII, TRI, MRI,
584  AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
585  AMDGPU::OpName::data1),
586  CurrScore);
587  }
588  } else if (SIInstrInfo::isAtomicRet(Inst) &&
589  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
590  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
591  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
592  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
593  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
594  Inst.getOpcode() != AMDGPU::DS_APPEND &&
595  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
596  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
597  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
598  const MachineOperand &Op = Inst.getOperand(I);
599  if (Op.isReg() && !Op.isDef() &&
600  TRI->isVectorRegister(*MRI, Op.getReg())) {
601  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
602  }
603  }
604  }
605  } else if (TII->isFLAT(Inst)) {
606  if (Inst.mayStore()) {
607  setExpScore(
608  &Inst, TII, TRI, MRI,
609  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
610  CurrScore);
611  } else if (SIInstrInfo::isAtomicRet(Inst)) {
612  setExpScore(
613  &Inst, TII, TRI, MRI,
614  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
615  CurrScore);
616  }
617  } else if (TII->isMIMG(Inst)) {
618  if (Inst.mayStore()) {
619  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
620  } else if (SIInstrInfo::isAtomicRet(Inst)) {
621  setExpScore(
622  &Inst, TII, TRI, MRI,
623  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
624  CurrScore);
625  }
626  } else if (TII->isMTBUF(Inst)) {
627  if (Inst.mayStore()) {
628  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
629  }
630  } else if (TII->isMUBUF(Inst)) {
631  if (Inst.mayStore()) {
632  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
633  } else if (SIInstrInfo::isAtomicRet(Inst)) {
634  setExpScore(
635  &Inst, TII, TRI, MRI,
636  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
637  CurrScore);
638  }
639  } else if (TII->isLDSDIR(Inst)) {
640  // LDSDIR instructions attach the score to the destination.
641  setExpScore(
642  &Inst, TII, TRI, MRI,
643  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
644  CurrScore);
645  } else {
646  if (TII->isEXP(Inst)) {
647  // For export the destination registers are really temps that
648  // can be used as the actual source after export patching, so
649  // we need to treat them like sources and set the EXP_CNT
650  // score.
651  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
652  MachineOperand &DefMO = Inst.getOperand(I);
653  if (DefMO.isReg() && DefMO.isDef() &&
654  TRI->isVGPR(*MRI, DefMO.getReg())) {
655  setRegScore(
656  TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
657  EXP_CNT, CurrScore);
658  }
659  }
660  }
661  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
662  MachineOperand &MO = Inst.getOperand(I);
663  if (MO.isReg() && !MO.isDef() &&
664  TRI->isVectorRegister(*MRI, MO.getReg())) {
665  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
666  }
667  }
668  }
669 #if 0 // TODO: check if this is handled by MUBUF code above.
670  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
671  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
672  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
673  MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
674  unsigned OpNo;//TODO: find the OpNo for this operand;
675  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
676  for (int RegNo = Interval.first; RegNo < Interval.second;
677  ++RegNo) {
678  setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
679  }
680 #endif
681  } else {
682  // Match the score to the destination registers.
683  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
684  auto &Op = Inst.getOperand(I);
685  if (!Op.isReg() || !Op.isDef())
686  continue;
687  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
688  if (T == VM_CNT) {
689  if (Interval.first >= NUM_ALL_VGPRS)
690  continue;
691  if (updateVMCntOnly(Inst)) {
692  VmemType V = getVmemType(Inst);
693  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
694  VgprVmemTypes[RegNo] |= 1 << V;
695  }
696  }
697  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
698  setRegScore(RegNo, T, CurrScore);
699  }
700  }
701  if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
702  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
703  }
704  }
705 }
706 
707 void WaitcntBrackets::print(raw_ostream &OS) {
708  OS << '\n';
709  for (auto T : inst_counter_types()) {
710  unsigned SR = getScoreRange(T);
711 
712  switch (T) {
713  case VM_CNT:
714  OS << " VM_CNT(" << SR << "): ";
715  break;
716  case LGKM_CNT:
717  OS << " LGKM_CNT(" << SR << "): ";
718  break;
719  case EXP_CNT:
720  OS << " EXP_CNT(" << SR << "): ";
721  break;
722  case VS_CNT:
723  OS << " VS_CNT(" << SR << "): ";
724  break;
725  default:
726  OS << " UNKNOWN(" << SR << "): ";
727  break;
728  }
729 
730  if (SR != 0) {
731  // Print vgpr scores.
732  unsigned LB = getScoreLB(T);
733 
734  for (int J = 0; J <= VgprUB; J++) {
735  unsigned RegScore = getRegScore(J, T);
736  if (RegScore <= LB)
737  continue;
738  unsigned RelScore = RegScore - LB - 1;
739  if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
740  OS << RelScore << ":v" << J << " ";
741  } else {
742  OS << RelScore << ":ds ";
743  }
744  }
745  // Also need to print sgpr scores for lgkm_cnt.
746  if (T == LGKM_CNT) {
747  for (int J = 0; J <= SgprUB; J++) {
748  unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
749  if (RegScore <= LB)
750  continue;
751  unsigned RelScore = RegScore - LB - 1;
752  OS << RelScore << ":s" << J << " ";
753  }
754  }
755  }
756  OS << '\n';
757  }
758  OS << '\n';
759 }
760 
761 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
762 /// whether a waitcnt instruction is needed at all.
763 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
764  simplifyWaitcnt(VM_CNT, Wait.VmCnt);
765  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
766  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
767  simplifyWaitcnt(VS_CNT, Wait.VsCnt);
768 }
769 
770 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
771  unsigned &Count) const {
772  // The number of outstanding events for this type, T, can be calculated
773  // as (UB - LB). If the current Count is greater than or equal to the number
774  // of outstanding events, then the wait for this counter is redundant.
775  if (Count >= getScoreRange(T))
776  Count = ~0u;
777 }
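// Illustrative example: if getScoreRange(VM_CNT) is 3 (three VMEM operations
// outstanding), a requested vmcnt(5) is redundant and is dropped by setting
// the count to ~0u, while vmcnt(2) is kept because it would actually make the
// hardware wait.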
778 
779 void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
780  AMDGPU::Waitcnt &Wait) const {
781  unsigned ScoreToWait = getRegScore(RegNo, T);
782 
783  // If the score of src_operand falls within the bracket, we need an
784  // s_waitcnt instruction.
785  const unsigned LB = getScoreLB(T);
786  const unsigned UB = getScoreUB(T);
787  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
788  if ((T == VM_CNT || T == LGKM_CNT) &&
789  hasPendingFlat() &&
790  !ST->hasFlatLgkmVMemCountInOrder()) {
791  // If there is a pending FLAT operation, and this is a VMem or LGKM
792  // waitcnt and the target can report early completion, then we need
793  // to force a waitcnt 0.
794  addWait(Wait, T, 0);
795  } else if (counterOutOfOrder(T)) {
796  // Counter can get decremented out-of-order when there
797  // are multiple event types in the bracket. Also emit an s_waitcnt
798  // with a conservative value of 0 for the counter.
799  addWait(Wait, T, 0);
800  } else {
801  // If a counter has been maxed out avoid overflow by waiting for
802  // MAX(CounterType) - 1 instead.
803  unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
804  addWait(Wait, T, NeededWait);
805  }
806  }
807 }
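// Illustrative example: with LB = 10, UB = 14 and a register score of 12, and
// no pending FLAT or out-of-order events, the needed wait is
// min(14 - 12, getWaitCountMax(T) - 1) = 2, i.e. at most two operations of
// this type may still be outstanding when the register is used.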
808 
809 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
810  applyWaitcnt(VM_CNT, Wait.VmCnt);
811  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
812  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
813  applyWaitcnt(VS_CNT, Wait.VsCnt);
814 }
815 
816 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
817  const unsigned UB = getScoreUB(T);
818  if (Count >= UB)
819  return;
820  if (Count != 0) {
821  if (counterOutOfOrder(T))
822  return;
823  setScoreLB(T, std::max(getScoreLB(T), UB - Count));
824  } else {
825  setScoreLB(T, UB);
826  PendingEvents &= ~WaitEventMaskForInst[T];
827  }
828 }
829 
830 // Where there are multiple types of event in the bracket of a counter,
831 // the decrement may go out of order.
832 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
833  // Scalar memory read always can go out of order.
834  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
835  return true;
836  return hasMixedPendingEvents(T);
837 }
838 
839 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
840  false)
841 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
842 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
843 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
844  false)
845 
846 char SIInsertWaitcnts::ID = 0;
847 
848 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
849 
850 FunctionPass *llvm::createSIInsertWaitcntsPass() {
851  return new SIInsertWaitcnts();
852 }
853 
854 static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
855  unsigned NewEnc) {
856  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
857  assert(OpIdx >= 0);
858 
859  MachineOperand &MO = MI.getOperand(OpIdx);
860 
861  if (NewEnc == MO.getImm())
862  return false;
863 
864  MO.setImm(NewEnc);
865  return true;
866 }
867 
868 /// Combine consecutive waitcnt instructions that precede \p It and follow
869 /// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
870 /// by previous passes. Currently this pass conservatively assumes that these
871 /// preexisting waitcnt are required for correctness.
872 bool SIInsertWaitcnts::applyPreexistingWaitcnt(
873  WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
874  AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
875  bool Modified = false;
876  MachineInstr *WaitcntInstr = nullptr;
877  MachineInstr *WaitcntVsCntInstr = nullptr;
878 
879  for (auto &II :
880  make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
881  if (II.isMetaInstruction())
882  continue;
883 
884  if (II.getOpcode() == AMDGPU::S_WAITCNT) {
885  // Conservatively update required wait if this waitcnt was added in an
886  // earlier pass. In this case it will not exist in the tracked waitcnt
887  // set.
888  if (!TrackedWaitcntSet.count(&II)) {
889  unsigned IEnc = II.getOperand(0).getImm();
890  AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
891  Wait = Wait.combined(OldWait);
892  }
893 
894  // Merge consecutive waitcnt of the same type by erasing multiples.
895  if (!WaitcntInstr) {
896  WaitcntInstr = &II;
897  } else {
898  II.eraseFromParent();
899  Modified = true;
900  }
901 
902  } else {
903  assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
904  assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
905  if (!TrackedWaitcntSet.count(&II)) {
906  unsigned OldVSCnt =
907  TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
908  Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
909  }
910 
911  if (!WaitcntVsCntInstr) {
912  WaitcntVsCntInstr = &II;
913  } else {
914  II.eraseFromParent();
915  Modified = true;
916  }
917  }
918  }
919 
920  // Update the encoding of the merged waitcnt with the required wait.
921  if (WaitcntInstr) {
922  if (Wait.hasWaitExceptVsCnt()) {
923  Modified |=
924  updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
925  AMDGPU::encodeWaitcnt(IV, Wait));
926  ScoreBrackets.applyWaitcnt(Wait);
927  Wait.VmCnt = ~0u;
928  Wait.LgkmCnt = ~0u;
929  Wait.ExpCnt = ~0u;
930 
931  LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
932  ? dbgs() << "applyPreexistingWaitcnt\n"
933  << "New Instr at block end: " << *WaitcntInstr
934  << '\n'
935  : dbgs() << "applyPreexistingWaitcnt\n"
936  << "Old Instr: " << *It
937  << "New Instr: " << *WaitcntInstr << '\n');
938 
939  } else {
940  WaitcntInstr->eraseFromParent();
941  Modified = true;
942  }
943  }
944 
945  if (WaitcntVsCntInstr) {
946  if (Wait.hasWaitVsCnt()) {
947  assert(ST->hasVscnt());
948  Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
949  AMDGPU::OpName::simm16, Wait.VsCnt);
950  ScoreBrackets.applyWaitcnt(Wait);
951  Wait.VsCnt = ~0u;
952 
953  LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
954  ? dbgs() << "applyPreexistingWaitcnt\n"
955  << "New Instr at block end: "
956  << *WaitcntVsCntInstr << '\n'
957  : dbgs() << "applyPreexistingWaitcnt\n"
958  << "Old Instr: " << *It
959  << "New Instr: " << *WaitcntVsCntInstr << '\n');
960  } else {
961  WaitcntVsCntInstr->eraseFromParent();
962  Modified = true;
963  }
964  }
965 
966  return Modified;
967 }
968 
969 static bool readsVCCZ(const MachineInstr &MI) {
970  unsigned Opc = MI.getOpcode();
971  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
972  !MI.getOperand(1).isUndef();
973 }
974 
975 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
976 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
977  // Currently all conventions wait, but this may not always be the case.
978  //
979  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
980  // sense to omit the wait and do it in the caller.
981  return true;
982 }
983 
984 /// \returns true if the callee is expected to wait for any outstanding waits
985  /// before returning.
986 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
987  return true;
988 }
989 
990 /// Generate s_waitcnt instruction to be placed before cur_Inst.
991 /// Instructions of a given type are returned in order,
992 /// but instructions of different types can complete out of order.
993 /// We rely on this in-order completion
994 /// and simply assign a score to the memory access instructions.
995 /// We keep track of the active "score bracket" to determine
996 /// if an access of a memory read requires an s_waitcnt
997 /// and if so what the value of each counter is.
998 /// The "score bracket" is bound by the lower bound and upper bound
999 /// scores (*_score_LB and *_score_ub respectively).
1000 /// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
1001 /// flush the vmcnt counter here.
1002 bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1003  WaitcntBrackets &ScoreBrackets,
1004  MachineInstr *OldWaitcntInstr,
1005  bool FlushVmCnt) {
1006  setForceEmitWaitcnt();
1007 
1008  if (MI.isMetaInstruction())
1009  return false;
1010 
1011  AMDGPU::Waitcnt Wait;
1012 
1013  // FIXME: This should have already been handled by the memory legalizer.
1014  // Removing this currently doesn't affect any lit tests, but we need to
1015  // verify that nothing was relying on this. The number of buffer invalidates
1016  // being handled here should not be expanded.
1017  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1018  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1019  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1020  MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1021  MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1022  Wait.VmCnt = 0;
1023  }
1024 
1025  // All waits must be resolved at call return.
1026  // NOTE: this could be improved with knowledge of all call sites or
1027  // with knowledge of the called routines.
1028  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1029  MI.getOpcode() == AMDGPU::SI_RETURN ||
1030  MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1031  (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1032  Wait = Wait.combined(allZeroWaitcnt());
1033  }
1034  // Resolve vm waits before gs-done.
1035  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1036  MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1037  ST->hasLegacyGeometry() &&
1038  ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1039  AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1040  Wait.VmCnt = 0;
1041  }
1042 #if 0 // TODO: the following blocks of logic when we have fence.
1043  else if (MI.getOpcode() == SC_FENCE) {
1044  const unsigned int group_size =
1045  context->shader_info->GetMaxThreadGroupSize();
1046  // group_size == 0 means thread group size is unknown at compile time
1047  const bool group_is_multi_wave =
1048  (group_size == 0 || group_size > target_info->GetWaveFrontSize());
1049  const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
1050 
1051  for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
1052  SCRegType src_type = Inst->GetSrcType(i);
1053  switch (src_type) {
1054  case SCMEM_LDS:
1055  if (group_is_multi_wave ||
1056  context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
1057  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1058  ScoreBrackets->getScoreUB(LGKM_CNT));
1059  // LDS may have to wait for VM_CNT after buffer load to LDS
1060  if (target_info->HasBufferLoadToLDS()) {
1061  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1062  ScoreBrackets->getScoreUB(VM_CNT));
1063  }
1064  }
1065  break;
1066 
1067  case SCMEM_GDS:
1068  if (group_is_multi_wave || fence_is_global) {
1069  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1070  ScoreBrackets->getScoreUB(EXP_CNT));
1071  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1072  ScoreBrackets->getScoreUB(LGKM_CNT));
1073  }
1074  break;
1075 
1076  case SCMEM_UAV:
1077  case SCMEM_TFBUF:
1078  case SCMEM_RING:
1079  case SCMEM_SCATTER:
1080  if (group_is_multi_wave || fence_is_global) {
1081  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1082  ScoreBrackets->getScoreUB(EXP_CNT));
1083  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1084  ScoreBrackets->getScoreUB(VM_CNT));
1085  }
1086  break;
1087 
1088  case SCMEM_SCRATCH:
1089  default:
1090  break;
1091  }
1092  }
1093  }
1094 #endif
1095 
1096  // Export & GDS instructions do not read the EXEC mask until after the export
1097  // is granted (which can occur well after the instruction is issued).
1098  // The shader program must flush all EXP operations on the export-count
1099  // before overwriting the EXEC mask.
1100  else {
1101  if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1102  // Export and GDS are tracked individually, either may trigger a waitcnt
1103  // for EXEC.
1104  if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1105  ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1106  ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1107  ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1108  Wait.ExpCnt = 0;
1109  }
1110  }
1111 
1112  if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1113  // The function is going to insert a wait on everything in its prolog.
1114  // This still needs to be careful if the call target is a load (e.g. a GOT
1115  // load). We also need to check WAW dependency with saved PC.
1116  Wait = AMDGPU::Waitcnt();
1117 
1118  int CallAddrOpIdx =
1119  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1120 
1121  if (MI.getOperand(CallAddrOpIdx).isReg()) {
1122  RegInterval CallAddrOpInterval =
1123  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
1124 
1125  for (int RegNo = CallAddrOpInterval.first;
1126  RegNo < CallAddrOpInterval.second; ++RegNo)
1127  ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
1128 
1129  int RtnAddrOpIdx =
1130  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1131  if (RtnAddrOpIdx != -1) {
1132  RegInterval RtnAddrOpInterval =
1133  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
1134 
1135  for (int RegNo = RtnAddrOpInterval.first;
1136  RegNo < RtnAddrOpInterval.second; ++RegNo)
1137  ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
1138  }
1139  }
1140  } else {
1141  // FIXME: Should not be relying on memoperands.
1142  // Look at the source operands of every instruction to see if
1143  // any of them results from a previous memory operation that affects
1144  // its current usage. If so, an s_waitcnt instruction needs to be
1145  // emitted.
1146  // If the source operand was defined by a load, add the s_waitcnt
1147  // instruction.
1148  //
1149  // Two cases are handled for destination operands:
1150  // 1) If the destination operand was defined by a load, add the s_waitcnt
1151  // instruction to guarantee the right WAW order.
1152  // 2) If a destination operand was used by a recent export/store instruction,
1153  // add s_waitcnt on exp_cnt to guarantee the WAR order.
1154  for (const MachineMemOperand *Memop : MI.memoperands()) {
1155  const Value *Ptr = Memop->getValue();
1156  if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1157  addWait(Wait, LGKM_CNT, 0);
1158  if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1159  SLoadAddresses.erase(Ptr);
1160  }
1161  unsigned AS = Memop->getAddrSpace();
1162  if (AS != AMDGPUAS::LOCAL_ADDRESS)
1163  continue;
1164  // No need to wait before load from VMEM to LDS.
1165  if (mayWriteLDSThroughDMA(MI))
1166  continue;
1167  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1168  // VM_CNT is only relevant to vgpr or LDS.
1169  ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
1170  if (Memop->isStore()) {
1171  ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1172  }
1173  }
1174 
1175  // Loop over use and def operands.
1176  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1177  MachineOperand &Op = MI.getOperand(I);
1178  if (!Op.isReg())
1179  continue;
1180  RegInterval Interval =
1181  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
1182 
1183  const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1184  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1185  if (IsVGPR) {
1186  // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1187  // previous write and this write are the same type of VMEM
1188  // instruction, in which case they're guaranteed to write their
1189  // results in order anyway.
1190  if (Op.isUse() || !updateVMCntOnly(MI) ||
1191  ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1192  getVmemType(MI))) {
1193  ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
1194  ScoreBrackets.clearVgprVmemTypes(RegNo);
1195  }
1196  if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1197  ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1198  }
1199  }
1200  ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
1201  }
1202  }
1203  }
1204  }
1205 
1206  // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1207  // not, we need to ensure the subtarget is capable of backing off barrier
1208  // instructions in case there are any outstanding memory operations that may
1209  // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1210  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1211  !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1212  Wait = Wait.combined(allZeroWaitcnt());
1213  }
1214 
1215  // TODO: Remove this work-around, enable the assert for Bug 457939
1216  // after fixing the scheduler. Also, the Shader Compiler code is
1217  // independent of target.
1218  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1219  if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1220  Wait.LgkmCnt = 0;
1221  }
1222  }
1223 
1224  // Verify that the wait is actually needed.
1225  ScoreBrackets.simplifyWaitcnt(Wait);
1226 
1227  if (ForceEmitZeroWaitcnts)
1228  Wait = allZeroWaitcnt();
1229 
1230  if (ForceEmitWaitcnt[VM_CNT])
1231  Wait.VmCnt = 0;
1232  if (ForceEmitWaitcnt[EXP_CNT])
1233  Wait.ExpCnt = 0;
1234  if (ForceEmitWaitcnt[LGKM_CNT])
1235  Wait.LgkmCnt = 0;
1236  if (ForceEmitWaitcnt[VS_CNT])
1237  Wait.VsCnt = 0;
1238 
1239  if (FlushVmCnt) {
1240  if (ScoreBrackets.hasPendingEvent(VM_CNT))
1241  Wait.VmCnt = 0;
1242  }
1243 
1244  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1245  OldWaitcntInstr);
1246 }
1247 
1248 // Add a waitcnt to flush the vmcnt counter at the end of the given block if
1249 // needed.
1250 bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
1251  WaitcntBrackets &ScoreBrackets,
1252  MachineInstr *OldWaitcntInstr) {
1253  AMDGPU::Waitcnt Wait;
1254 
1255  if (!ScoreBrackets.hasPendingEvent(VM_CNT))
1256  return false;
1257 
1258  Wait.VmCnt = 0;
1259 
1260  return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
1261  OldWaitcntInstr);
1262 }
1263 
1264 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1265  MachineBasicBlock::instr_iterator It,
1266  MachineBasicBlock &Block,
1267  WaitcntBrackets &ScoreBrackets,
1268  MachineInstr *OldWaitcntInstr) {
1269  bool Modified = false;
1270  const DebugLoc &DL = Block.findDebugLoc(It);
1271 
1272  if (OldWaitcntInstr)
1273  // Try to merge the required wait with preexisting waitcnt instructions.
1274  // Also erase redundant waitcnt.
1275  Modified =
1276  applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1277  else
1278  ScoreBrackets.applyWaitcnt(Wait);
1279 
1280  // ExpCnt can be merged into VINTERP.
1281  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1282  SIInstrInfo::isVINTERP(*It)) {
1283  MachineOperand *WaitExp =
1284  TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1285  if (Wait.ExpCnt < WaitExp->getImm()) {
1286  WaitExp->setImm(Wait.ExpCnt);
1287  Modified = true;
1288  }
1289  Wait.ExpCnt = ~0u;
1290 
1291  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1292  << "Update Instr: " << *It);
1293  }
1294 
1295  // Build new waitcnt instructions unless no wait is needed or the old waitcnt
1296  // instruction was modified to handle the required wait.
1297  if (Wait.hasWaitExceptVsCnt()) {
1298  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1299  auto SWaitInst =
1300  BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1301  TrackedWaitcntSet.insert(SWaitInst);
1302  Modified = true;
1303 
1304  LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1305  if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1306  dbgs() << "New Instr: " << *SWaitInst << '\n');
1307  }
1308 
1309  if (Wait.hasWaitVsCnt()) {
1310  assert(ST->hasVscnt());
1311 
1312  auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1313  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1314  .addImm(Wait.VsCnt);
1315  TrackedWaitcntSet.insert(SWaitInst);
1316  Modified = true;
1317 
1318  LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1319  if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1320  dbgs() << "New Instr: " << *SWaitInst << '\n');
1321  }
1322  return Modified;
1323 }
1324 
1325 // This is a flat memory operation. Check to see if it has memory tokens other
1326 // than LDS. Other address spaces supported by flat memory operations involve
1327 // global memory.
1328 bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1329  assert(TII->isFLAT(MI));
1330 
1331  // All flat instructions use the VMEM counter.
1332  assert(TII->usesVM_CNT(MI));
1333 
1334  // If there are no memory operands then conservatively assume the flat
1335  // operation may access VMEM.
1336  if (MI.memoperands_empty())
1337  return true;
1338 
1339  // See if any memory operand specifies an address space that involves VMEM.
1340  // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
1341  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1342  // (GDS) address space is not supported by flat operations. Therefore, simply
1343  // return true unless only the LDS address space is found.
1344  for (const MachineMemOperand *Memop : MI.memoperands()) {
1345  unsigned AS = Memop->getAddrSpace();
1347  if (AS != AMDGPUAS::LOCAL_ADDRESS)
1348  return true;
1349  }
1350 
1351  return false;
1352 }
1353 
1354 // This is a flat memory operation. Check to see if it has memory tokens for
1355 // either LDS or FLAT.
1356 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1357  assert(TII->isFLAT(MI));
1358 
1359  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
1360  if (!TII->usesLGKM_CNT(MI))
1361  return false;
1362 
1363  // If in tgsplit mode then there can be no use of LDS.
1364  if (ST->isTgSplitEnabled())
1365  return false;
1366 
1367  // If there are no memory operands then conservatively assume the flat
1368  // operation may access LDS.
1369  if (MI.memoperands_empty())
1370  return true;
1371 
1372  // See if any memory operand specifies an address space that involves LDS.
1373  for (const MachineMemOperand *Memop : MI.memoperands()) {
1374  unsigned AS = Memop->getAddrSpace();
1375  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1376  return true;
1377  }
1378 
1379  return false;
1380 }
1381 
1382 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1383  WaitcntBrackets *ScoreBrackets) {
1384  // Now look at the instruction opcode. If it is a memory access
1385  // instruction, update the upper-bound of the appropriate counter's
1386  // bracket and the destination operand scores.
1387  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1388  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1389  if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1390  TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1391  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1392  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1393  } else {
1394  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1395  }
1396  } else if (TII->isFLAT(Inst)) {
1397  assert(Inst.mayLoadOrStore());
1398 
1399  int FlatASCount = 0;
1400 
1401  if (mayAccessVMEMThroughFlat(Inst)) {
1402  ++FlatASCount;
1403  ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
1404  Inst);
1405  }
1406 
1407  if (mayAccessLDSThroughFlat(Inst)) {
1408  ++FlatASCount;
1409  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1410  }
1411 
1412  // A Flat memory operation must access at least one address space.
1413  assert(FlatASCount);
1414 
1415  // This is a flat memory operation that accesses both VMEM and LDS, so note it
1416  // - it will require that both the VM and LGKM be flushed to zero if it is
1417  // pending when a VM or LGKM dependency occurs.
1418  if (FlatASCount > 1)
1419  ScoreBrackets->setPendingFlat();
1420  } else if (SIInstrInfo::isVMEM(Inst) &&
1421  !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
1422  ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
1423  Inst);
1424 
1425  if (ST->vmemWriteNeedsExpWaitcnt() &&
1426  (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
1427  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1428  }
1429  } else if (TII->isSMRD(Inst)) {
1430  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1431  } else if (Inst.isCall()) {
1432  if (callWaitsOnFunctionReturn(Inst)) {
1433  // Act as a wait on everything
1434  ScoreBrackets->applyWaitcnt(allZeroWaitcnt());
1435  } else {
1436  // May need to wait for anything.
1437  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1438  }
1439  } else if (SIInstrInfo::isLDSDIR(Inst)) {
1440  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
1441  } else if (TII->isVINTERP(Inst)) {
1442  int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
1443  ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
1444  } else if (SIInstrInfo::isEXP(Inst)) {
1445  unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1446  if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
1447  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1448  else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
1449  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1450  else
1451  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1452  } else {
1453  switch (Inst.getOpcode()) {
1454  case AMDGPU::S_SENDMSG:
1455  case AMDGPU::S_SENDMSG_RTN_B32:
1456  case AMDGPU::S_SENDMSG_RTN_B64:
1457  case AMDGPU::S_SENDMSGHALT:
1458  ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1459  break;
1460  case AMDGPU::S_MEMTIME:
1461  case AMDGPU::S_MEMREALTIME:
1462  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1463  break;
1464  }
1465  }
1466 }
1467 
1468 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
1469  unsigned OtherScore) {
1470  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1471  unsigned OtherShifted =
1472  OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1473  Score = std::max(MyShifted, OtherShifted);
1474  return OtherShifted > MyShifted;
1475 }
1476 
1477 /// Merge the pending events and associated score brackets of \p Other into
1478 /// this bracket's status.
1479 ///
1480 /// Returns whether the merge resulted in a change that requires tighter waits
1481 /// (i.e. the merged brackets strictly dominate the original brackets).
1482 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1483  bool StrictDom = false;
1484 
1485  VgprUB = std::max(VgprUB, Other.VgprUB);
1486  SgprUB = std::max(SgprUB, Other.SgprUB);
1487 
1488  for (auto T : inst_counter_types()) {
1489  // Merge event flags for this counter
1490  const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
1491  const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1492  if (OtherEvents & ~OldEvents)
1493  StrictDom = true;
1494  PendingEvents |= OtherEvents;
1495 
1496  // Merge scores for this counter
1497  const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
1498  const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1499  const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
1500  if (NewUB < ScoreLBs[T])
1501  report_fatal_error("waitcnt score overflow");
1502 
1503  MergeInfo M;
1504  M.OldLB = ScoreLBs[T];
1505  M.OtherLB = Other.ScoreLBs[T];
1506  M.MyShift = NewUB - ScoreUBs[T];
1507  M.OtherShift = NewUB - Other.ScoreUBs[T];
1508 
1509  ScoreUBs[T] = NewUB;
1510 
1511  StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1512 
1513  for (int J = 0; J <= VgprUB; J++)
1514  StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1515 
1516  if (T == LGKM_CNT) {
1517  for (int J = 0; J <= SgprUB; J++)
1518  StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1519  }
1520  }
1521 
1522  for (int J = 0; J <= VgprUB; J++) {
1523  unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
1524  StrictDom |= NewVmemTypes != VgprVmemTypes[J];
1525  VgprVmemTypes[J] = NewVmemTypes;
1526  }
1527 
1528  return StrictDom;
1529 }
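// Illustrative example of the merge above: if this block has LB = 5 / UB = 8
// (three pending ops) and Other has LB = 0 / UB = 2 (two pending), NewUB is
// 5 + max(3, 2) = 8, MyShift is 0 and OtherShift is 6; a register score of 1
// in Other becomes 1 + 6 = 7 here, and StrictDom is set only when the shifted
// Other score exceeds the local one.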
1530 
1531 static bool isWaitInstr(MachineInstr &Inst) {
1532  return Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1533  (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1534  Inst.getOperand(0).isReg() &&
1535  Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1536 }
1537 
1538 // Generate s_waitcnt instructions where needed.
1539 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1540  MachineBasicBlock &Block,
1541  WaitcntBrackets &ScoreBrackets) {
1542  bool Modified = false;
1543 
1544  LLVM_DEBUG({
1545  dbgs() << "*** Block" << Block.getNumber() << " ***";
1546  ScoreBrackets.dump();
1547  });
1548 
1549  // Track the correctness of vccz through this basic block. There are two
1550  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
1551  // ST->partialVCCWritesUpdateVCCZ().
1552  bool VCCZCorrect = true;
1553  if (ST->hasReadVCCZBug()) {
1554  // vccz could be incorrect at a basic block boundary if a predecessor wrote
1555  // to vcc and then issued an smem load.
1556  VCCZCorrect = false;
1557  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
1558  // vccz could be incorrect at a basic block boundary if a predecessor wrote
1559  // to vcc_lo or vcc_hi.
1560  VCCZCorrect = false;
1561  }
1562 
1563  // Walk over the instructions.
1564  MachineInstr *OldWaitcntInstr = nullptr;
1565 
1566  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
1567  E = Block.instr_end();
1568  Iter != E;) {
1569  MachineInstr &Inst = *Iter;
1570 
1571  // Track pre-existing waitcnts that were added in earlier iterations or by
1572  // the memory legalizer.
1573  if (isWaitInstr(Inst)) {
1574  if (!OldWaitcntInstr)
1575  OldWaitcntInstr = &Inst;
1576  ++Iter;
1577  continue;
1578  }
1579 
1580  bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
1581  isPreheaderToFlush(Block, ScoreBrackets);
1582 
1583  // Generate an s_waitcnt instruction to be placed before Inst, if needed.
1584  Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
1585  FlushVmCnt);
1586  OldWaitcntInstr = nullptr;
1587 
1588  // Restore vccz if it's not known to be correct already.
1589  bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
1590 
1591  // Don't examine operands unless we need to track vccz correctness.
1592  if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
1593  if (Inst.definesRegister(AMDGPU::VCC_LO) ||
1594  Inst.definesRegister(AMDGPU::VCC_HI)) {
1595  // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
1596  if (!ST->partialVCCWritesUpdateVCCZ())
1597  VCCZCorrect = false;
1598  } else if (Inst.definesRegister(AMDGPU::VCC)) {
1599  // There is a hardware bug on CI/SI where SMRD instruction may corrupt
1600  // vccz bit, so when we detect that an instruction may read from a
1601  // corrupt vccz bit, we need to:
1602  // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
1603  // operations to complete.
1604  // 2. Restore the correct value of vccz by writing the current value
1605  // of vcc back to vcc.
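  // (Step 1 falls out of the waitcnt generation above when an SMEM_ACCESS
  //  event is still pending; step 2 is the vcc-to-vcc copy emitted below
  //  when RestoreVCCZ is set.)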
1606  if (ST->hasReadVCCZBug() &&
1607  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1608  // Writes to vcc while there's an outstanding smem read may get
1609  // clobbered as soon as any read completes.
1610  VCCZCorrect = false;
1611  } else {
1612  // Writes to vcc will fix any incorrect value in vccz.
1613  VCCZCorrect = true;
1614  }
1615  }
1616  }
1617 
1618  if (TII->isSMRD(Inst)) {
1619  for (const MachineMemOperand *Memop : Inst.memoperands()) {
1620  // No need to handle invariant loads when avoiding WAR conflicts, as
1621  // there cannot be a vector store to the same memory location.
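  // The recorded addresses are consulted later when a vector memory store
  // may alias one of these scalar loads, so the pass can close the
  // write-after-read hazard with an lgkmcnt wait.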
1622  if (!Memop->isInvariant()) {
1623  const Value *Ptr = Memop->getValue();
1624  SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
1625  }
1626  }
1627  if (ST->hasReadVCCZBug()) {
1628  // This smem read could complete and clobber vccz at any time.
1629  VCCZCorrect = false;
1630  }
1631  }
1632 
1633  updateEventWaitcntAfter(Inst, &ScoreBrackets);
1634 
1635 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1636  // If this instruction generates an S_SETVSKIP because it is an
1637  // indexed resource, and we are on Tahiti, then it will also force
1638  // an S_WAITCNT vmcnt(0)
1639  if (RequireCheckResourceType(Inst, context)) {
1640  // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1641  ScoreBrackets->setScoreLB(VM_CNT,
1642  ScoreBrackets->getScoreUB(VM_CNT));
1643  }
1644 #endif
1645 
1646  LLVM_DEBUG({
1647  Inst.print(dbgs());
1648  ScoreBrackets.dump();
1649  });
1650 
1651  // TODO: Remove this work-around after fixing the scheduler and enable the
1652  // assert above.
1653  if (RestoreVCCZ) {
1654  // Restore the vccz bit. Any time a value is written to vcc, the vcc
1655  // bit is updated, so we can restore the bit by reading the value of
1656  // vcc and then writing it back to the register.
1657  BuildMI(Block, Inst, Inst.getDebugLoc(),
1658  TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1659  TRI->getVCC())
1660  .addReg(TRI->getVCC());
1661  VCCZCorrect = true;
1662  Modified = true;
1663  }
1664 
1665  ++Iter;
1666  }
1667 
1668  if (Block.getFirstTerminator() == Block.end() &&
1669  isPreheaderToFlush(Block, ScoreBrackets))
1670  Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
1671 
1672  return Modified;
1673 }
1674 
1675 // Return true if the given machine basic block is a preheader of a loop in
1676 // which we want to flush the vmcnt counter, and false otherwise.
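// The result is cached in PreheadersToFlush, so each candidate block is
// analyzed at most once per function.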
1677 bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
1678  WaitcntBrackets &ScoreBrackets) {
1679  if (PreheadersToFlush.count(&MBB))
1680  return PreheadersToFlush[&MBB];
1681 
1682  auto UpdateCache = [&](bool val) {
1683  PreheadersToFlush[&MBB] = val;
1684  return val;
1685  };
1686 
1687  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
1688  if (!Succ)
1689  return UpdateCache(false);
1690 
1691  MachineLoop *Loop = MLI->getLoopFor(Succ);
1692  if (!Loop)
1693  return UpdateCache(false);
1694 
1695  if (Loop->getLoopPreheader() == &MBB && shouldFlushVmCnt(Loop, ScoreBrackets))
1696  return UpdateCache(true);
1697 
1698  return UpdateCache(false);
1699 }
1700 
1701 // Return true if it is better to flush the vmcnt counter in the preheader of
1702 // the given loop. We currently decide to flush in two situations:
1703 // 1. The loop contains vmem store(s), no vmem load and at least one use of a
1704 // vgpr containing a value that is loaded outside of the loop. (Only on
1705 // targets with no vscnt counter).
1706 // 2. The loop contains vmem load(s), but the loaded values are not used in the
1707 // loop, and at least one use of a vgpr containing a value that is loaded
1708 // outside of the loop.
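// Informally, flushing vmcnt once in the preheader lets the out-of-loop loads
// complete before the loop is entered, so the waits generated inside the loop
// do not have to account for them on every iteration.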
1709 bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
1710  WaitcntBrackets &Brackets) {
1711  bool HasVMemLoad = false;
1712  bool HasVMemStore = false;
1713  bool UsesVgprLoadedOutside = false;
1714  DenseSet<Register> VgprUse;
1715  DenseSet<Register> VgprDef;
1716 
1717  for (MachineBasicBlock *MBB : ML->blocks()) {
1718  for (MachineInstr &MI : *MBB) {
1719  if (SIInstrInfo::isVMEM(MI)) {
1720  if (MI.mayLoad())
1721  HasVMemLoad = true;
1722  if (MI.mayStore())
1723  HasVMemStore = true;
1724  }
1725  for (unsigned I = 0; I < MI.getNumOperands(); I++) {
1726  MachineOperand &Op = MI.getOperand(I);
1727  if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
1728  continue;
1729  RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
1730  // Vgpr use
1731  if (Op.isUse()) {
1732  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1733  // If we find a register that is loaded inside the loop, 1. and 2.
1734  // are invalidated and we can exit.
1735  if (VgprDef.contains(RegNo))
1736  return false;
1737  VgprUse.insert(RegNo);
1738  // If at least one of Op's registers is in the score brackets, the
1739  // value is likely loaded outside of the loop.
1740  if (Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)) {
1741  UsesVgprLoadedOutside = true;
1742  break;
1743  }
1744  }
1745  }
1746  // VMem load vgpr def
1747  else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef())
1748  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1749  // If we find a register that is loaded inside the loop, 1. and 2.
1750  // are invalidated and we can exit.
1751  if (VgprUse.contains(RegNo))
1752  return false;
1753  VgprDef.insert(RegNo);
1754  }
1755  }
1756  }
1757  }
1758  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
1759  return true;
1760  return HasVMemLoad && UsesVgprLoadedOutside;
1761 }
1762 
1763 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1764  ST = &MF.getSubtarget<GCNSubtarget>();
1765  TII = ST->getInstrInfo();
1766  TRI = &TII->getRegisterInfo();
1767  MRI = &MF.getRegInfo();
1768  IV = AMDGPU::getIsaVersion(ST->getCPU());
1769  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1770  MLI = &getAnalysis<MachineLoopInfo>();
1771  PDT = &getAnalysis<MachinePostDominatorTree>();
1772 
1773  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1774  for (auto T : inst_counter_types())
1775  ForceEmitWaitcnt[T] = false;
1776 
1777  HardwareLimits Limits = {};
1778  Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1779  Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1780  Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1781  Limits.VscntMax = ST->hasVscnt() ? 63 : 0;
1782 
1783  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
1784  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
1785  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1786  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1787 
1788  RegisterEncoding Encoding = {};
1789  Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1790  Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
1791  Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1792  Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
1793 
1794  TrackedWaitcntSet.clear();
1795  BlockInfos.clear();
1796  bool Modified = false;
1797 
1798  if (!MFI->isEntryFunction()) {
1799  // Wait for any outstanding memory operations that the input registers may
1800  // depend on. We can't track them and it's better to do the wait after the
1801  // costly call sequence.
1802 
1803  // TODO: Could insert earlier and schedule more liberally with operations
1804  // that only use caller preserved registers.
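  // The zero S_WAITCNT below drains vmcnt, expcnt and lgkmcnt; on subtargets
  // with a separate vscnt counter, a null-register S_WAITCNT_VSCNT is added
  // so that counter is drained as well.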
1805  MachineBasicBlock &EntryBB = MF.front();
1806  MachineBasicBlock::iterator I = EntryBB.begin();
1807  for (MachineBasicBlock::iterator E = EntryBB.end();
1808  I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
1809  ;
1810  BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
1811  if (ST->hasVscnt())
1812  BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
1813  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1814  .addImm(0);
1815 
1816  Modified = true;
1817  }
1818 
1819  // Keep iterating over the blocks in reverse post order, inserting and
1820  // updating s_waitcnt where needed, until a fix point is reached.
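  // Reverse post order means predecessors are normally processed before
  // their successors; Repeat is only set when a back edge dirties a block at
  // or before the current position, which is what forces another pass.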
1821  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
1822  BlockInfos.insert({MBB, BlockInfo(MBB)});
1823 
1824  std::unique_ptr<WaitcntBrackets> Brackets;
1825  bool Repeat;
1826  do {
1827  Repeat = false;
1828 
1829  for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
1830  ++BII) {
1831  BlockInfo &BI = BII->second;
1832  if (!BI.Dirty)
1833  continue;
1834 
1835  if (BI.Incoming) {
1836  if (!Brackets)
1837  Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
1838  else
1839  *Brackets = *BI.Incoming;
1840  } else {
1841  if (!Brackets)
1842  Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
1843  else
1844  *Brackets = WaitcntBrackets(ST, Limits, Encoding);
1845  }
1846 
1847  Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1848  BI.Dirty = false;
1849 
1850  if (Brackets->hasPendingEvent()) {
1851  BlockInfo *MoveBracketsToSucc = nullptr;
1852  for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1853  auto SuccBII = BlockInfos.find(Succ);
1854  BlockInfo &SuccBI = SuccBII->second;
1855  if (!SuccBI.Incoming) {
1856  SuccBI.Dirty = true;
1857  if (SuccBII <= BII)
1858  Repeat = true;
1859  if (!MoveBracketsToSucc) {
1860  MoveBracketsToSucc = &SuccBI;
1861  } else {
1862  SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
1863  }
1864  } else if (SuccBI.Incoming->merge(*Brackets)) {
1865  SuccBI.Dirty = true;
1866  if (SuccBII <= BII)
1867  Repeat = true;
1868  }
1869  }
1870  if (MoveBracketsToSucc)
1871  MoveBracketsToSucc->Incoming = std::move(Brackets);
1872  }
1873  }
1874  } while (Repeat);
1875 
1876  if (ST->hasScalarStores()) {
1877  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1878  bool HaveScalarStores = false;
1879 
1880  for (MachineBasicBlock &MBB : MF) {
1881  for (MachineInstr &MI : MBB) {
1882  if (!HaveScalarStores && TII->isScalarStore(MI))
1883  HaveScalarStores = true;
1884 
1885  if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1886  MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1887  EndPgmBlocks.push_back(&MBB);
1888  }
1889  }
1890 
1891  if (HaveScalarStores) {
1892  // If scalar writes are used, the cache must be flushed or else the next
1893  // wave to reuse the same scratch memory can be clobbered.
1894  //
1895  // Insert s_dcache_wb at wave termination points if there were any scalar
1896  // stores, and only if the cache hasn't already been flushed. This could
1897  // be improved by looking across blocks for flushes in postdominating
1898  // blocks from the stores but an explicitly requested flush is probably
1899  // very rare.
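  // SeenDCacheWB tracks whether a writeback is already pending at the end of
  // the block: it is set by an S_DCACHE_WB and cleared by any later scalar
  // store, so the flush is only inserted where it is still missing.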
1900  for (MachineBasicBlock *MBB : EndPgmBlocks) {
1901  bool SeenDCacheWB = false;
1902 
1903  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
1904  I != E; ++I) {
1905  if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1906  SeenDCacheWB = true;
1907  else if (TII->isScalarStore(*I))
1908  SeenDCacheWB = false;
1909 
1910  // FIXME: It would be better to insert this before a waitcnt if any.
1911  if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1912  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1913  !SeenDCacheWB) {
1914  Modified = true;
1915  BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1916  }
1917  }
1918  }
1919  }
1920  }
1921 
1922  return Modified;
1923 }