1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
15 ///
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
23 //
24 //===----------------------------------------------------------------------===//
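// Example (illustrative sketch, an editorial addition; assumes only LDS reads
// are outstanding, so lgkmcnt completes in order): the counters used below are
// decremented as operations complete, so a weaker wait can suffice when only
// one event type is in flight:
//   ds_read_b32 v0, v10
//   ds_read_b32 v1, v11
//   s_waitcnt lgkmcnt(1)   ; the older read has completed; v0 is safe to use
//   s_waitcnt lgkmcnt(0)   ; both reads have completed; v1 is safe to use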
25 
26 #include "AMDGPU.h"
27 #include "GCNSubtarget.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "llvm/ADT/MapVector.h"
33 #include "llvm/InitializePasses.h"
36 using namespace llvm;
37 
38 #define DEBUG_TYPE "si-insert-waitcnts"
39 
40 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
41  "Force emit s_waitcnt expcnt(0) instrs");
42 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
43  "Force emit s_waitcnt lgkmcnt(0) instrs");
44 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
45  "Force emit s_waitcnt vmcnt(0) instrs");
46 
47 static cl::opt<bool> ForceEmitZeroFlag(
48  "amdgpu-waitcnt-forcezero",
49  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
50  cl::init(false), cl::Hidden);
51 
52 namespace {
53 
54 template <typename EnumT>
55 class enum_iterator
56  : public iterator_facade_base<enum_iterator<EnumT>,
57  std::forward_iterator_tag, const EnumT> {
58  EnumT Value;
59 public:
60  enum_iterator() = default;
61  enum_iterator(EnumT Value) : Value(Value) {}
62 
63  enum_iterator &operator++() {
64  Value = static_cast<EnumT>(Value + 1);
65  return *this;
66  }
67 
68  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
69 
70  EnumT operator*() const { return Value; }
71 };
72 
73 // Class of object that encapsulates latest instruction counter score
74 // associated with the operand. Used for determining whether
75 // an s_waitcnt instruction needs to be emitted.
76 
77 #define CNT_MASK(t) (1u << (t))
78 
79 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
80 
81 iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
82  return make_range(enum_iterator<InstCounterType>(VM_CNT),
83  enum_iterator<InstCounterType>(NUM_INST_CNTS));
84 }
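// Usage sketch: this helper lets clients loop over all hardware counters, as
// runOnMachineFunction does when resetting the force-emit flags:
//   for (auto T : inst_counter_types())
//     ForceEmitWaitcnt[T] = false;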
85 
86 using RegInterval = std::pair<int, int>;
87 
88 struct {
89  unsigned VmcntMax;
90  unsigned ExpcntMax;
91  unsigned LgkmcntMax;
92  unsigned VscntMax;
93 } HardwareLimits;
94 
95 struct {
96  unsigned VGPR0;
97  unsigned VGPRL;
98  unsigned SGPR0;
99  unsigned SGPRL;
100 } RegisterEncoding;
101 
102 enum WaitEventType {
103  VMEM_ACCESS, // vector-memory read & write
104  VMEM_READ_ACCESS, // vector-memory read
105  VMEM_WRITE_ACCESS,// vector-memory write
106  LDS_ACCESS, // lds read & write
107  GDS_ACCESS, // gds read & write
108  SQ_MESSAGE, // send message
109  SMEM_ACCESS, // scalar-memory read & write
110  EXP_GPR_LOCK, // export holding on its data src
111  GDS_GPR_LOCK, // GDS holding on its data and addr src
112  EXP_POS_ACCESS, // write to export position
113  EXP_PARAM_ACCESS, // write to export parameter
114  VMW_GPR_LOCK, // vector-memory write holding on its data src
115  NUM_WAIT_EVENTS,
116 };
117 
118 static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
119  (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
120  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
121  (1 << SQ_MESSAGE),
122  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
123  (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
124  (1 << VMEM_WRITE_ACCESS)
125 };
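// Illustrative check (sketch): membership of an event in a counter's mask is a
// single bit test against this table, e.g. SMEM_ACCESS is tracked by LGKM_CNT:
//   assert(WaitEventMaskForInst[LGKM_CNT] & (1 << SMEM_ACCESS));
// This is exactly how eventCounter() below maps an event back to its counter.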
126 
127 // The mapping is:
128 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
129 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
130 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
131 // We reserve a fixed number of VGPR slots in the scoring tables for
132 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
133 enum RegisterMapping {
134  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
135  AGPR_OFFSET = 226, // Maximum programmable ArchVGPRs across all targets.
136  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
137  NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
138  EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
139  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
140 };
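// Worked example (illustrative): with this mapping, the score slot for v3 is
// 3, the LDS placeholder lives at SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS (= 512),
// and the score slot for s5 is NUM_ALL_VGPRS + 5 (= 518).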
141 
142 // Enumerate different types of result-returning VMEM operations. Although
143 // s_waitcnt orders them all with a single vmcnt counter, in the absence of
144 // s_waitcnt only instructions of the same VmemType are guaranteed to write
145 // their results in order -- so there is no need to insert an s_waitcnt between
146 // two instructions of the same type that write the same vgpr.
147 enum VmemType {
148  // BUF instructions and MIMG instructions without a sampler.
149  VMEM_NOSAMPLER,
150  // MIMG instructions with a sampler.
151  VMEM_SAMPLER,
152 };
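// For example (sketch): two buffer_load_dword instructions writing the same
// vgpr are both VMEM_NOSAMPLER and return their results in order, so no
// s_waitcnt is needed between them for the WAW hazard; a buffer load followed
// by an image_sample into the same vgpr mixes VMEM_NOSAMPLER and VMEM_SAMPLER
// and does need one.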
153 
154 VmemType getVmemType(const MachineInstr &Inst) {
155  assert(SIInstrInfo::isVMEM(Inst));
156  if (!SIInstrInfo::isMIMG(Inst))
157  return VMEM_NOSAMPLER;
158  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
159  return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler
160  ? VMEM_SAMPLER
161  : VMEM_NOSAMPLER;
162 }
163 
164 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
165  switch (T) {
166  case VM_CNT:
167  Wait.VmCnt = std::min(Wait.VmCnt, Count);
168  break;
169  case EXP_CNT:
170  Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
171  break;
172  case LGKM_CNT:
173  Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
174  break;
175  case VS_CNT:
176  Wait.VsCnt = std::min(Wait.VsCnt, Count);
177  break;
178  default:
179  llvm_unreachable("bad InstCounterType");
180  }
181 }
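// Usage sketch (illustrative): addWait only ever tightens an existing
// requirement, since AMDGPU::Waitcnt fields default to ~0u ("no wait"):
//   AMDGPU::Waitcnt W;
//   addWait(W, LGKM_CNT, 2);   // W.LgkmCnt == 2
//   addWait(W, LGKM_CNT, 5);   // still 2: std::min(2u, 5u)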
182 
183 // This object maintains the current score brackets of each wait counter, and
184 // a per-register scoreboard for each wait counter.
185 //
186 // We also maintain the latest score for every event type that can change the
187 // waitcnt in order to know if there are multiple types of events within
188 // the brackets. When multiple types of events happen in the bracket, the
189 // wait count may get decreased out of order, therefore we need to put in
190 // an "s_waitcnt 0" before use.
191 class WaitcntBrackets {
192 public:
193  WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {}
194 
195  static unsigned getWaitCountMax(InstCounterType T) {
196  switch (T) {
197  case VM_CNT:
198  return HardwareLimits.VmcntMax;
199  case LGKM_CNT:
200  return HardwareLimits.LgkmcntMax;
201  case EXP_CNT:
202  return HardwareLimits.ExpcntMax;
203  case VS_CNT:
204  return HardwareLimits.VscntMax;
205  default:
206  break;
207  }
208  return 0;
209  }
210 
211  unsigned getScoreLB(InstCounterType T) const {
212  assert(T < NUM_INST_CNTS);
213  return ScoreLBs[T];
214  }
215 
216  unsigned getScoreUB(InstCounterType T) const {
217  assert(T < NUM_INST_CNTS);
218  return ScoreUBs[T];
219  }
220 
221  // Mapping from event to counter.
222  InstCounterType eventCounter(WaitEventType E) {
223  if (WaitEventMaskForInst[VM_CNT] & (1 << E))
224  return VM_CNT;
225  if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
226  return LGKM_CNT;
227  if (WaitEventMaskForInst[VS_CNT] & (1 << E))
228  return VS_CNT;
229  assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
230  return EXP_CNT;
231  }
232 
233  unsigned getRegScore(int GprNo, InstCounterType T) {
234  if (GprNo < NUM_ALL_VGPRS) {
235  return VgprScores[T][GprNo];
236  }
237  assert(T == LGKM_CNT);
238  return SgprScores[GprNo - NUM_ALL_VGPRS];
239  }
240 
241  bool merge(const WaitcntBrackets &Other);
242 
243  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
244  const MachineRegisterInfo *MRI,
245  const SIRegisterInfo *TRI, unsigned OpNo) const;
246 
247  bool counterOutOfOrder(InstCounterType T) const;
248  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
249  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
250  void determineWait(InstCounterType T, unsigned ScoreToWait,
251  AMDGPU::Waitcnt &Wait) const;
252  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
253  void applyWaitcnt(InstCounterType T, unsigned Count);
254  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
255  const MachineRegisterInfo *MRI, WaitEventType E,
256  MachineInstr &MI);
257 
258  bool hasPending() const { return PendingEvents != 0; }
259  bool hasPendingEvent(WaitEventType E) const {
260  return PendingEvents & (1 << E);
261  }
262 
263  bool hasMixedPendingEvents(InstCounterType T) const {
264  unsigned Events = PendingEvents & WaitEventMaskForInst[T];
265  // Return true if more than one bit is set in Events.
266  return Events & (Events - 1);
267  }
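  // For the check above (illustrative): Events == 0b0110 (two event types
  // pending) gives 0b0110 & 0b0101 == 0b0100 != 0, whereas any single-bit
  // value x satisfies x & (x - 1) == 0.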
268 
269  bool hasPendingFlat() const {
270  return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
271  LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
272  (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
273  LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
274  }
275 
276  void setPendingFlat() {
277  LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
278  LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
279  }
280 
281  // Return true if there might be pending writes to the specified vgpr by VMEM
282  // instructions with types different from V.
283  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
284  assert(GprNo < NUM_ALL_VGPRS);
285  return VgprVmemTypes[GprNo] & ~(1 << V);
286  }
287 
288  void clearVgprVmemTypes(int GprNo) {
289  assert(GprNo < NUM_ALL_VGPRS);
290  VgprVmemTypes[GprNo] = 0;
291  }
292 
293  void print(raw_ostream &);
294  void dump() { print(dbgs()); }
295 
296 private:
297  struct MergeInfo {
298  unsigned OldLB;
299  unsigned OtherLB;
300  unsigned MyShift;
301  unsigned OtherShift;
302  };
303  static bool mergeScore(const MergeInfo &M, unsigned &Score,
304  unsigned OtherScore);
305 
306  void setScoreLB(InstCounterType T, unsigned Val) {
307  assert(T < NUM_INST_CNTS);
308  ScoreLBs[T] = Val;
309  }
310 
311  void setScoreUB(InstCounterType T, unsigned Val) {
312  assert(T < NUM_INST_CNTS);
313  ScoreUBs[T] = Val;
314  if (T == EXP_CNT) {
315  unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
316  if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
317  ScoreLBs[T] = UB;
318  }
319  }
320 
321  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
322  if (GprNo < NUM_ALL_VGPRS) {
323  VgprUB = std::max(VgprUB, GprNo);
324  VgprScores[T][GprNo] = Val;
325  } else {
326  assert(T == LGKM_CNT);
327  SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
328  SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
329  }
330  }
331 
332  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
333  const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
334  unsigned OpNo, unsigned Val);
335 
336  const GCNSubtarget *ST = nullptr;
337  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
338  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
339  unsigned PendingEvents = 0;
340  // Remember the last flat memory operation.
341  unsigned LastFlat[NUM_INST_CNTS] = {0};
342  // wait_cnt scores for every vgpr.
343  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
344  int VgprUB = -1;
345  int SgprUB = -1;
346  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
347  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
348  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
349  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
350  // write to each vgpr.
351  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
352 };
353 
354 class SIInsertWaitcnts : public MachineFunctionPass {
355 private:
356  const GCNSubtarget *ST = nullptr;
357  const SIInstrInfo *TII = nullptr;
358  const SIRegisterInfo *TRI = nullptr;
359  const MachineRegisterInfo *MRI = nullptr;
360  AMDGPU::IsaVersion IV;
361 
362  DenseSet<MachineInstr *> TrackedWaitcntSet;
363  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
364  MachinePostDominatorTree *PDT;
365 
366  struct BlockInfo {
367  MachineBasicBlock *MBB;
368  std::unique_ptr<WaitcntBrackets> Incoming;
369  bool Dirty = true;
370 
371  explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
372  };
373 
374  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
375 
376  // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
377  // because of amdgpu-waitcnt-forcezero flag
378  bool ForceEmitZeroWaitcnts;
379  bool ForceEmitWaitcnt[NUM_INST_CNTS];
380 
381 public:
382  static char ID;
383 
384  SIInsertWaitcnts() : MachineFunctionPass(ID) {
385  (void)ForceExpCounter;
386  (void)ForceLgkmCounter;
387  (void)ForceVMCounter;
388  }
389 
390  bool runOnMachineFunction(MachineFunction &MF) override;
391 
392  StringRef getPassName() const override {
393  return "SI insert wait instructions";
394  }
395 
396  void getAnalysisUsage(AnalysisUsage &AU) const override {
397  AU.setPreservesCFG();
398  AU.addRequired<MachinePostDominatorTree>();
399  MachineFunctionPass::getAnalysisUsage(AU);
400  }
401 
402  bool isForceEmitWaitcnt() const {
403  for (auto T : inst_counter_types())
404  if (ForceEmitWaitcnt[T])
405  return true;
406  return false;
407  }
408 
409  void setForceEmitWaitcnt() {
410 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
411 // For debug builds, get the debug counter info and adjust if need be
412 #ifndef NDEBUG
413  if (DebugCounter::isCounterSet(ForceExpCounter) &&
414  DebugCounter::shouldExecute(ForceExpCounter)) {
415  ForceEmitWaitcnt[EXP_CNT] = true;
416  } else {
417  ForceEmitWaitcnt[EXP_CNT] = false;
418  }
419 
420  if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
421  DebugCounter::shouldExecute(ForceLgkmCounter)) {
422  ForceEmitWaitcnt[LGKM_CNT] = true;
423  } else {
424  ForceEmitWaitcnt[LGKM_CNT] = false;
425  }
426 
427  if (DebugCounter::isCounterSet(ForceVMCounter) &&
428  DebugCounter::shouldExecute(ForceVMCounter)) {
429  ForceEmitWaitcnt[VM_CNT] = true;
430  } else {
431  ForceEmitWaitcnt[VM_CNT] = false;
432  }
433 #endif // NDEBUG
434  }
435 
436  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
437  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
438  bool generateWaitcntInstBefore(MachineInstr &MI,
439  WaitcntBrackets &ScoreBrackets,
440  MachineInstr *OldWaitcntInstr);
441  void updateEventWaitcntAfter(MachineInstr &Inst,
442  WaitcntBrackets *ScoreBrackets);
443  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
444  WaitcntBrackets &ScoreBrackets);
445  bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
446  MachineInstr &OldWaitcntInstr,
447  AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
448 };
449 
450 } // end anonymous namespace
451 
452 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
453  const SIInstrInfo *TII,
454  const MachineRegisterInfo *MRI,
455  const SIRegisterInfo *TRI,
456  unsigned OpNo) const {
457  const MachineOperand &Op = MI->getOperand(OpNo);
458  if (!TRI->isInAllocatableClass(Op.getReg()))
459  return {-1, -1};
460 
461  // A use via a PW operand does not need a waitcnt.
462  // A partial write is not a WAW.
463  assert(!Op.getSubReg() || !Op.isUndef());
464 
465  RegInterval Result;
466 
467  unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
468 
469  if (TRI->isVectorRegister(*MRI, Op.getReg())) {
470  assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
471  Result.first = Reg - RegisterEncoding.VGPR0;
472  if (TRI->isAGPR(*MRI, Op.getReg()))
473  Result.first += AGPR_OFFSET;
474  assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
475  } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
476  assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
477  Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
478  assert(Result.first >= NUM_ALL_VGPRS &&
479  Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
480  }
481  // TODO: Handle TTMP
482  // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
483  else
484  return {-1, -1};
485 
486  const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
487  unsigned Size = TRI->getRegSizeInBits(*RC);
488  Result.second = Result.first + ((Size + 16) / 32);
489 
490  return Result;
491 }
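// Worked example (illustrative): for a 64-bit VGPR operand whose low register
// maps to slot 4, Size == 64 and the interval is [4, 4 + (64 + 16) / 32), i.e.
// [4, 6), covering both 32-bit halves.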
492 
493 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
494  const SIInstrInfo *TII,
495  const SIRegisterInfo *TRI,
496  const MachineRegisterInfo *MRI, unsigned OpNo,
497  unsigned Val) {
498  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
499  assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
500  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
501  setRegScore(RegNo, EXP_CNT, Val);
502  }
503 }
504 
505 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
506  const SIRegisterInfo *TRI,
507  const MachineRegisterInfo *MRI,
508  WaitEventType E, MachineInstr &Inst) {
509  InstCounterType T = eventCounter(E);
510  unsigned CurrScore = getScoreUB(T) + 1;
511  if (CurrScore == 0)
512  report_fatal_error("InsertWaitcnt score wraparound");
513  // PendingEvents and ScoreUB need to be updated regardless of whether this
514  // event changes the score of a register or not.
515  // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
516  PendingEvents |= 1 << E;
517  setScoreUB(T, CurrScore);
518 
519  if (T == EXP_CNT) {
520  // Put score on the source vgprs. If this is a store, just use those
521  // specific register(s).
522  if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
523  int AddrOpIdx =
524  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
525  // All GDS operations must protect their address register (same as
526  // export.)
527  if (AddrOpIdx != -1) {
528  setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
529  }
530 
531  if (Inst.mayStore()) {
532  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
533  AMDGPU::OpName::data0) != -1) {
534  setExpScore(
535  &Inst, TII, TRI, MRI,
536  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
537  CurrScore);
538  }
539  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
540  AMDGPU::OpName::data1) != -1) {
541  setExpScore(&Inst, TII, TRI, MRI,
542  AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
543  AMDGPU::OpName::data1),
544  CurrScore);
545  }
546  } else if (SIInstrInfo::isAtomicRet(Inst) &&
547  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
548  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
549  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
550  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
551  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
552  Inst.getOpcode() != AMDGPU::DS_APPEND &&
553  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
554  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
555  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
556  const MachineOperand &Op = Inst.getOperand(I);
557  if (Op.isReg() && !Op.isDef() &&
558  TRI->isVectorRegister(*MRI, Op.getReg())) {
559  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
560  }
561  }
562  }
563  } else if (TII->isFLAT(Inst)) {
564  if (Inst.mayStore()) {
565  setExpScore(
566  &Inst, TII, TRI, MRI,
567  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
568  CurrScore);
569  } else if (SIInstrInfo::isAtomicRet(Inst)) {
570  setExpScore(
571  &Inst, TII, TRI, MRI,
572  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
573  CurrScore);
574  }
575  } else if (TII->isMIMG(Inst)) {
576  if (Inst.mayStore()) {
577  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
578  } else if (SIInstrInfo::isAtomicRet(Inst)) {
579  setExpScore(
580  &Inst, TII, TRI, MRI,
581  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
582  CurrScore);
583  }
584  } else if (TII->isMTBUF(Inst)) {
585  if (Inst.mayStore()) {
586  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
587  }
588  } else if (TII->isMUBUF(Inst)) {
589  if (Inst.mayStore()) {
590  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
591  } else if (SIInstrInfo::isAtomicRet(Inst)) {
592  setExpScore(
593  &Inst, TII, TRI, MRI,
594  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
595  CurrScore);
596  }
597  } else {
598  if (TII->isEXP(Inst)) {
599  // For export the destination registers are really temps that
600  // can be used as the actual source after export patching, so
601  // we need to treat them like sources and set the EXP_CNT
602  // score.
603  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
604  MachineOperand &DefMO = Inst.getOperand(I);
605  if (DefMO.isReg() && DefMO.isDef() &&
606  TRI->isVGPR(*MRI, DefMO.getReg())) {
607  setRegScore(
608  TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
609  EXP_CNT, CurrScore);
610  }
611  }
612  }
613  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
614  MachineOperand &MO = Inst.getOperand(I);
615  if (MO.isReg() && !MO.isDef() &&
616  TRI->isVectorRegister(*MRI, MO.getReg())) {
617  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
618  }
619  }
620  }
621 #if 0 // TODO: check if this is handled by MUBUF code above.
622  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
623  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
624  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
625  MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
626  unsigned OpNo;//TODO: find the OpNo for this operand;
627  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
628  for (int RegNo = Interval.first; RegNo < Interval.second;
629  ++RegNo) {
630  setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
631  }
632 #endif
633  } else {
634  // Match the score to the destination registers.
635  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
636  auto &Op = Inst.getOperand(I);
637  if (!Op.isReg() || !Op.isDef())
638  continue;
639  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
640  if (T == VM_CNT) {
641  if (Interval.first >= NUM_ALL_VGPRS)
642  continue;
643  if (SIInstrInfo::isVMEM(Inst)) {
644  VmemType V = getVmemType(Inst);
645  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
646  VgprVmemTypes[RegNo] |= 1 << V;
647  }
648  }
649  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
650  setRegScore(RegNo, T, CurrScore);
651  }
652  }
653  if (TII->isDS(Inst) && Inst.mayStore()) {
654  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
655  }
656  }
657 }
658 
659 void WaitcntBrackets::print(raw_ostream &OS) {
660  OS << '\n';
661  for (auto T : inst_counter_types()) {
662  unsigned LB = getScoreLB(T);
663  unsigned UB = getScoreUB(T);
664 
665  switch (T) {
666  case VM_CNT:
667  OS << " VM_CNT(" << UB - LB << "): ";
668  break;
669  case LGKM_CNT:
670  OS << " LGKM_CNT(" << UB - LB << "): ";
671  break;
672  case EXP_CNT:
673  OS << " EXP_CNT(" << UB - LB << "): ";
674  break;
675  case VS_CNT:
676  OS << " VS_CNT(" << UB - LB << "): ";
677  break;
678  default:
679  OS << " UNKNOWN(" << UB - LB << "): ";
680  break;
681  }
682 
683  if (LB < UB) {
684  // Print vgpr scores.
685  for (int J = 0; J <= VgprUB; J++) {
686  unsigned RegScore = getRegScore(J, T);
687  if (RegScore <= LB)
688  continue;
689  unsigned RelScore = RegScore - LB - 1;
690  if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
691  OS << RelScore << ":v" << J << " ";
692  } else {
693  OS << RelScore << ":ds ";
694  }
695  }
696  // Also need to print sgpr scores for lgkm_cnt.
697  if (T == LGKM_CNT) {
698  for (int J = 0; J <= SgprUB; J++) {
699  unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
700  if (RegScore <= LB)
701  continue;
702  unsigned RelScore = RegScore - LB - 1;
703  OS << RelScore << ":s" << J << " ";
704  }
705  }
706  }
707  OS << '\n';
708  }
709  OS << '\n';
710 }
711 
712 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
713 /// whether a waitcnt instruction is needed at all.
714 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
715  simplifyWaitcnt(VM_CNT, Wait.VmCnt);
716  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
717  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
718  simplifyWaitcnt(VS_CNT, Wait.VsCnt);
719 }
720 
721 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
722  unsigned &Count) const {
723  const unsigned LB = getScoreLB(T);
724  const unsigned UB = getScoreUB(T);
725 
726  // The number of outstanding events for this type, T, can be calculated
727  // as (UB - LB). If the current Count is greater than or equal to the number
728  // of outstanding events, then the wait for this counter is redundant.
729  if (Count >= UB - LB)
730  Count = ~0u;
731 }
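// Worked example (illustrative): if UB - LB == 3 operations of type T are
// outstanding, a request to wait until "3 or more remain in flight" is already
// satisfied, so any Count >= 3 is reset to ~0u and dropped.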
732 
733 void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
734  AMDGPU::Waitcnt &Wait) const {
735  // If the score of src_operand falls within the bracket, we need an
736  // s_waitcnt instruction.
737  const unsigned LB = getScoreLB(T);
738  const unsigned UB = getScoreUB(T);
739  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
740  if ((T == VM_CNT || T == LGKM_CNT) &&
741  hasPendingFlat() &&
742  !ST->hasFlatLgkmVMemCountInOrder()) {
743  // If there is a pending FLAT operation, and this is a VMem or LGKM
744  // waitcnt and the target can report early completion, then we need
745  // to force a waitcnt 0.
746  addWait(Wait, T, 0);
747  } else if (counterOutOfOrder(T)) {
748  // The counter can get decremented out-of-order when there
749  // are multiple types of events in the bracket, so emit an s_waitcnt
750  // with a conservative value of 0 for the counter.
751  addWait(Wait, T, 0);
752  } else {
753  // If a counter has been maxed out avoid overflow by waiting for
754  // MAX(CounterType) - 1 instead.
755  unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
756  addWait(Wait, T, NeededWait);
757  }
758  }
759 }
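// Worked example (illustrative, in-order case): with LB == 10, UB == 14 and a
// register whose score is 12, NeededWait == UB - ScoreToWait == 2, i.e. wait
// until at most the two operations issued after the score-12 access remain
// outstanding, which guarantees that access has completed.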
760 
761 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
762  applyWaitcnt(VM_CNT, Wait.VmCnt);
763  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
764  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
765  applyWaitcnt(VS_CNT, Wait.VsCnt);
766 }
767 
768 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
769  const unsigned UB = getScoreUB(T);
770  if (Count >= UB)
771  return;
772  if (Count != 0) {
773  if (counterOutOfOrder(T))
774  return;
775  setScoreLB(T, std::max(getScoreLB(T), UB - Count));
776  } else {
777  setScoreLB(T, UB);
778  PendingEvents &= ~WaitEventMaskForInst[T];
779  }
780 }
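// For example (sketch): applyWaitcnt(VM_CNT, 0) retires everything by raising
// the lower bound to the upper bound and clearing the pending VM_CNT events;
// applyWaitcnt(VM_CNT, 2) only raises the lower bound to UB - 2 (and is a
// no-op if VM_CNT may complete out of order).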
781 
782 // Where there are multiple types of event in the bracket of a counter,
783 // the decrement may go out of order.
784 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
785  // Scalar memory read always can go out of order.
786  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
787  return true;
788  return hasMixedPendingEvents(T);
789 }
790 
791 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
792  false)
793 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
794 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
795  false)
796 
797 char SIInsertWaitcnts::ID = 0;
798 
799 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
800 
801 FunctionPass *llvm::createSIInsertWaitcntsPass() {
802  return new SIInsertWaitcnts();
803 }
804 
805 /// Combine consecutive waitcnt instructions that precede \p MI and follow
806 /// \p OldWaitcntInstr, and apply any extra waits from waitcnts that were added
807 /// by previous passes. Currently this pass conservatively assumes that these
808 /// preexisting waitcnt are required for correctness.
809 bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
810  MachineInstr &OldWaitcntInstr,
811  AMDGPU::Waitcnt &Wait,
812  const MachineInstr *MI) {
813  bool Modified = false;
814  MachineInstr *WaitcntInstr = nullptr;
815  MachineInstr *WaitcntVsCntInstr = nullptr;
816  for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
817  &*II != MI; II = NextI, ++NextI) {
818  if (II->isMetaInstruction())
819  continue;
820 
821  if (II->getOpcode() == AMDGPU::S_WAITCNT) {
822  // Conservatively update required wait if this waitcnt was added in an
823  // earlier pass. In this case it will not exist in the tracked waitcnt
824  // set.
825  if (!TrackedWaitcntSet.count(&*II)) {
826  unsigned IEnc = II->getOperand(0).getImm();
827  AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
828  Wait = Wait.combined(OldWait);
829  }
830 
831  // Merge consecutive waitcnt of the same type by erasing multiples.
832  if (!WaitcntInstr) {
833  WaitcntInstr = &*II;
834  } else {
835  II->eraseFromParent();
836  Modified = true;
837  }
838 
839  } else {
840  assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
841  assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
842  if (!TrackedWaitcntSet.count(&*II)) {
843  unsigned OldVSCnt =
844  TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
845  Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
846  }
847 
848  if (!WaitcntVsCntInstr) {
849  WaitcntVsCntInstr = &*II;
850  } else {
851  II->eraseFromParent();
852  Modified = true;
853  }
854  }
855  }
856 
857  // Update the encoding of the merged waitcnt with the required wait.
858  if (WaitcntInstr) {
859  if (Wait.hasWaitExceptVsCnt()) {
860  unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
861  unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
862  if (OldEnc != NewEnc) {
863  WaitcntInstr->getOperand(0).setImm(NewEnc);
864  Modified = true;
865  }
866  ScoreBrackets.applyWaitcnt(Wait);
867  Wait.VmCnt = ~0u;
868  Wait.LgkmCnt = ~0u;
869  Wait.ExpCnt = ~0u;
870 
871  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
872  << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr
873  << '\n');
874  } else {
875  WaitcntInstr->eraseFromParent();
876  Modified = true;
877  }
878  }
879 
880  if (WaitcntVsCntInstr) {
881  if (Wait.hasWaitVsCnt()) {
882  assert(ST->hasVscnt());
883  unsigned OldVSCnt =
884  TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
885  ->getImm();
886  if (Wait.VsCnt != OldVSCnt) {
887  TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
888  ->setImm(Wait.VsCnt);
889  Modified = true;
890  }
891  ScoreBrackets.applyWaitcnt(Wait);
892  Wait.VsCnt = ~0u;
893 
894  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
895  << "Old Instr: " << MI
896  << "New Instr: " << *WaitcntVsCntInstr << '\n');
897  } else {
898  WaitcntVsCntInstr->eraseFromParent();
899  Modified = true;
900  }
901  }
902 
903  return Modified;
904 }
905 
906 static bool readsVCCZ(const MachineInstr &MI) {
907  unsigned Opc = MI.getOpcode();
908  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
909  !MI.getOperand(1).isUndef();
910 }
911 
912 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
913 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
914  // Currently all conventions wait, but this may not always be the case.
915  //
916  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
917  // sense to omit the wait and do it in the caller.
918  return true;
919 }
920 
921 /// \returns true if the callee is expected to wait for any outstanding waits
922 /// before returning.
923 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
924  return true;
925 }
926 
927 /// Generate s_waitcnt instruction to be placed before cur_Inst.
928 /// Instructions of a given type are returned in order,
929 /// but instructions of different types can complete out of order.
930 /// We rely on this in-order completion
931 /// and simply assign a score to the memory access instructions.
932 /// We keep track of the active "score bracket" to determine
933 /// if an access of a memory read requires an s_waitcnt
934 /// and if so what the value of each counter is.
935 /// The "score bracket" is bound by the lower bound and upper bound
936 /// scores (*_score_LB and *_score_ub respectively).
937 bool SIInsertWaitcnts::generateWaitcntInstBefore(
938  MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
939  MachineInstr *OldWaitcntInstr) {
940  setForceEmitWaitcnt();
941 
942  if (MI.isMetaInstruction())
943  return false;
944 
945  AMDGPU::Waitcnt Wait;
946  bool Modified = false;
947 
948  // See if this instruction has a forced S_WAITCNT VM.
949  // TODO: Handle other cases of NeedsWaitcntVmBefore()
950  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
951  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
952  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
953  MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
954  MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
955  Wait.VmCnt = 0;
956  }
957 
958  // All waits must be resolved at call return.
959  // NOTE: this could be improved with knowledge of all call sites or
960  // with knowledge of the called routines.
961  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
962  MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
963  (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
964  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
965  }
966  // Resolve vm waits before gs-done.
967  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
968  MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
969  ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
970  AMDGPU::SendMsg::ID_GS_DONE)) {
971  Wait.VmCnt = 0;
972  }
973 #if 0 // TODO: the following blocks of logic when we have fence.
974  else if (MI.getOpcode() == SC_FENCE) {
975  const unsigned int group_size =
976  context->shader_info->GetMaxThreadGroupSize();
977  // group_size == 0 means thread group size is unknown at compile time
978  const bool group_is_multi_wave =
979  (group_size == 0 || group_size > target_info->GetWaveFrontSize());
980  const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
981 
982  for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
983  SCRegType src_type = Inst->GetSrcType(i);
984  switch (src_type) {
985  case SCMEM_LDS:
986  if (group_is_multi_wave ||
987  context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
988  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
989  ScoreBrackets->getScoreUB(LGKM_CNT));
990  // LDS may have to wait for VM_CNT after buffer load to LDS
991  if (target_info->HasBufferLoadToLDS()) {
992  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
993  ScoreBrackets->getScoreUB(VM_CNT));
994  }
995  }
996  break;
997 
998  case SCMEM_GDS:
999  if (group_is_multi_wave || fence_is_global) {
1000  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1001  ScoreBrackets->getScoreUB(EXP_CNT));
1002  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1003  ScoreBrackets->getScoreUB(LGKM_CNT));
1004  }
1005  break;
1006 
1007  case SCMEM_UAV:
1008  case SCMEM_TFBUF:
1009  case SCMEM_RING:
1010  case SCMEM_SCATTER:
1011  if (group_is_multi_wave || fence_is_global) {
1012  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1013  ScoreBrackets->getScoreUB(EXP_CNT));
1014  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1015  ScoreBrackets->getScoreUB(VM_CNT));
1016  }
1017  break;
1018 
1019  case SCMEM_SCRATCH:
1020  default:
1021  break;
1022  }
1023  }
1024  }
1025 #endif
1026 
1027  // Export & GDS instructions do not read the EXEC mask until after the export
1028  // is granted (which can occur well after the instruction is issued).
1029  // The shader program must flush all EXP operations on the export-count
1030  // before overwriting the EXEC mask.
1031  else {
1032  if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1033  // Export and GDS are tracked individually, either may trigger a waitcnt
1034  // for EXEC.
1035  if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1036  ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1037  ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1038  ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1039  Wait.ExpCnt = 0;
1040  }
1041  }
1042 
1043  if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1044  // The function is going to insert a wait on everything in its prolog.
1045  // This still needs to be careful if the call target is a load (e.g. a GOT
1046  // load). We also need to check WAW dependency with the saved PC.
1047  Wait = AMDGPU::Waitcnt();
1048 
1049  int CallAddrOpIdx =
1050  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1051 
1052  if (MI.getOperand(CallAddrOpIdx).isReg()) {
1053  RegInterval CallAddrOpInterval =
1054  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
1055 
1056  for (int RegNo = CallAddrOpInterval.first;
1057  RegNo < CallAddrOpInterval.second; ++RegNo)
1058  ScoreBrackets.determineWait(
1059  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1060 
1061  int RtnAddrOpIdx =
1062  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1063  if (RtnAddrOpIdx != -1) {
1064  RegInterval RtnAddrOpInterval =
1065  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
1066 
1067  for (int RegNo = RtnAddrOpInterval.first;
1068  RegNo < RtnAddrOpInterval.second; ++RegNo)
1069  ScoreBrackets.determineWait(
1070  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1071  }
1072  }
1073  } else {
1074  // FIXME: Should not be relying on memoperands.
1075  // Look at the source operands of every instruction to see if
1076  // any of them results from a previous memory operation that affects
1077  // its current usage. If so, an s_waitcnt instruction needs to be
1078  // emitted.
1079  // If the source operand was defined by a load, add the s_waitcnt
1080  // instruction.
1081  //
1082  // Two cases are handled for destination operands:
1083  // 1) If the destination operand was defined by a load, add the s_waitcnt
1084  // instruction to guarantee the right WAW order.
1085  // 2) If a destination operand was used by a recent export/store instruction,
1086  // add an s_waitcnt on exp_cnt to guarantee the WAR order.
1087  for (const MachineMemOperand *Memop : MI.memoperands()) {
1088  const Value *Ptr = Memop->getValue();
1089  if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1090  addWait(Wait, LGKM_CNT, 0);
1091  if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1092  SLoadAddresses.erase(Ptr);
1093  }
1094  unsigned AS = Memop->getAddrSpace();
1095  if (AS != AMDGPUAS::LOCAL_ADDRESS)
1096  continue;
1097  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1098  // VM_CNT is only relevant to vgpr or LDS.
1099  ScoreBrackets.determineWait(
1100  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1101  if (Memop->isStore()) {
1102  ScoreBrackets.determineWait(
1103  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1104  }
1105  }
1106 
1107  // Loop over use and def operands.
1108  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1109  MachineOperand &Op = MI.getOperand(I);
1110  if (!Op.isReg())
1111  continue;
1112  RegInterval Interval =
1113  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
1114 
1115  const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1116  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1117  if (IsVGPR) {
1118  // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1119  // previous write and this write are the same type of VMEM
1120  // instruction, in which case they're guaranteed to write their
1121  // results in order anyway.
1122  if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
1123  ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1124  getVmemType(MI))) {
1125  ScoreBrackets.determineWait(
1126  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1127  ScoreBrackets.clearVgprVmemTypes(RegNo);
1128  }
1129  if (Op.isDef()) {
1130  ScoreBrackets.determineWait(
1131  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1132  }
1133  }
1134  ScoreBrackets.determineWait(
1135  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1136  }
1137  }
1138  }
1139  }
1140 
1141  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1142  // occurs before the instruction. Doing it here prevents any additional
1143  // S_WAITCNTs from being emitted if the instruction was marked as
1144  // requiring a WAITCNT beforehand.
1145  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1146  !ST->hasAutoWaitcntBeforeBarrier()) {
1147  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1148  }
1149 
1150  // TODO: Remove this work-around, enable the assert for Bug 457939
1151  // after fixing the scheduler. Also, the Shader Compiler code is
1152  // independent of target.
1153  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1154  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1155  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1156  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1157  Wait.LgkmCnt = 0;
1158  }
1159  }
1160 
1161  // Verify that the wait is actually needed.
1162  ScoreBrackets.simplifyWaitcnt(Wait);
1163 
1164  if (ForceEmitZeroWaitcnts)
1165  Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
1166 
1167  if (ForceEmitWaitcnt[VM_CNT])
1168  Wait.VmCnt = 0;
1169  if (ForceEmitWaitcnt[EXP_CNT])
1170  Wait.ExpCnt = 0;
1171  if (ForceEmitWaitcnt[LGKM_CNT])
1172  Wait.LgkmCnt = 0;
1173  if (ForceEmitWaitcnt[VS_CNT])
1174  Wait.VsCnt = 0;
1175 
1176  if (OldWaitcntInstr) {
1177  // Try to merge the required wait with preexisting waitcnt instructions.
1178  // Also erase redundant waitcnt.
1179  Modified =
1180  applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
1181  } else {
1182  // Update waitcnt brackets after determining the required wait.
1183  ScoreBrackets.applyWaitcnt(Wait);
1184  }
1185 
1186  // Build new waitcnt instructions unless no wait is needed or the old waitcnt
1187  // instruction was modified to handle the required wait.
1188  if (Wait.hasWaitExceptVsCnt()) {
1189  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1190  auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1191  MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1192  .addImm(Enc);
1193  TrackedWaitcntSet.insert(SWaitInst);
1194  Modified = true;
1195 
1196  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1197  << "Old Instr: " << MI
1198  << "New Instr: " << *SWaitInst << '\n');
1199  }
1200 
1201  if (Wait.hasWaitVsCnt()) {
1202  assert(ST->hasVscnt());
1203 
1204  auto SWaitInst =
1205  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1206  TII->get(AMDGPU::S_WAITCNT_VSCNT))
1207  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1208  .addImm(Wait.VsCnt);
1209  TrackedWaitcntSet.insert(SWaitInst);
1210  Modified = true;
1211 
1212  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1213  << "Old Instr: " << MI
1214  << "New Instr: " << *SWaitInst << '\n');
1215  }
1216 
1217  return Modified;
1218 }
1219 
1220 // This is a flat memory operation. Check to see if it has memory tokens other
1221 // than LDS. Other address spaces supported by flat memory operations involve
1222 // global memory.
1223 bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1224  assert(TII->isFLAT(MI));
1225 
1226  // All flat instructions use the VMEM counter.
1227  assert(TII->usesVM_CNT(MI));
1228 
1229  // If there are no memory operands then conservatively assume the flat
1230  // operation may access VMEM.
1231  if (MI.memoperands_empty())
1232  return true;
1233 
1234  // See if any memory operand specifies an address space that involves VMEM.
1236  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
1236  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1237  // (GDS) address space is not supported by flat operations. Therefore, simply
1238  // return true unless only the LDS address space is found.
1239  for (const MachineMemOperand *Memop : MI.memoperands()) {
1240  unsigned AS = Memop->getAddrSpace();
1241  assert(AS != AMDGPUAS::REGION_ADDRESS);
1242  if (AS != AMDGPUAS::LOCAL_ADDRESS)
1243  return true;
1244  }
1245 
1246  return false;
1247 }
1248 
1249 // This is a flat memory operation. Check to see if it has memory tokens for
1250 // either LDS or FLAT.
1251 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1252  assert(TII->isFLAT(MI));
1253 
1254  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
1255  if (!TII->usesLGKM_CNT(MI))
1256  return false;
1257 
1258  // If in tgsplit mode then there can be no use of LDS.
1259  if (ST->isTgSplitEnabled())
1260  return false;
1261 
1262  // If there are no memory operands then conservatively assume the flat
1263  // operation may access LDS.
1264  if (MI.memoperands_empty())
1265  return true;
1266 
1267  // See if any memory operand specifies an address space that involves LDS.
1268  for (const MachineMemOperand *Memop : MI.memoperands()) {
1269  unsigned AS = Memop->getAddrSpace();
1270  if (AS == AMDGPUAS::LOCAL_ADDRESS)
1271  return true;
1272  }
1273 
1274  return false;
1275 }
1276 
1277 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1278  WaitcntBrackets *ScoreBrackets) {
1279  // Now look at the instruction opcode. If it is a memory access
1280  // instruction, update the upper-bound of the appropriate counter's
1281  // bracket and the destination operand scores.
1282  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1283  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1284  if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1285  TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1286  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1287  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1288  } else {
1289  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1290  }
1291  } else if (TII->isFLAT(Inst)) {
1292  assert(Inst.mayLoadOrStore());
1293 
1294  int FlatASCount = 0;
1295 
1296  if (mayAccessVMEMThroughFlat(Inst)) {
1297  ++FlatASCount;
1298  if (!ST->hasVscnt())
1299  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1300  else if (Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst))
1301  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1302  else
1303  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1304  }
1305 
1306  if (mayAccessLDSThroughFlat(Inst)) {
1307  ++FlatASCount;
1308  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1309  }
1310 
1311  // A Flat memory operation must access at least one address space.
1312  assert(FlatASCount);
1313 
1314  // This is a flat memory operation that accesses both VMEM and LDS, so note
1315  // it - it will require that both the VM and LGKM be flushed to zero if it is
1316  // pending when a VM or LGKM dependency occurs.
1317  if (FlatASCount > 1)
1318  ScoreBrackets->setPendingFlat();
1319  } else if (SIInstrInfo::isVMEM(Inst) &&
1320  // TODO: get a better carve out.
1321  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1322  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1323  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
1324  Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
1325  Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
1326  if (!ST->hasVscnt())
1327  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1328  else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ||
1329  /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
1330  (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
1331  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1332  else if (Inst.mayStore())
1333  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1334 
1335  if (ST->vmemWriteNeedsExpWaitcnt() &&
1336  (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
1337  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1338  }
1339  } else if (TII->isSMRD(Inst)) {
1340  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1341  } else if (Inst.isCall()) {
1342  if (callWaitsOnFunctionReturn(Inst)) {
1343  // Act as a wait on everything
1344  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1345  } else {
1346  // May need to wait for anything.
1347  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1348  }
1349  } else if (SIInstrInfo::isEXP(Inst)) {
1350  unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1351  if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
1352  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1353  else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
1354  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1355  else
1356  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1357  } else {
1358  switch (Inst.getOpcode()) {
1359  case AMDGPU::S_SENDMSG:
1360  case AMDGPU::S_SENDMSGHALT:
1361  ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1362  break;
1363  case AMDGPU::S_MEMTIME:
1364  case AMDGPU::S_MEMREALTIME:
1365  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1366  break;
1367  }
1368  }
1369 }
1370 
1371 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
1372  unsigned OtherScore) {
1373  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1374  unsigned OtherShifted =
1375  OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1376  Score = std::max(MyShifted, OtherShifted);
1377  return OtherShifted > MyShifted;
1378 }
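// Worked example (illustrative): suppose this bracket has LB == 4, UB == 6 and
// Other has LB == 0, UB == 3. merge() below picks NewUB == 4 + max(2, 3) == 7,
// so MyShift == 1 and OtherShift == 4; a local score of 5 becomes 6 and an
// incoming score of 2 also becomes 6, preserving each side's age relative to
// its own bracket, while scores at or below their old LB collapse to 0.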
1379 
1380 /// Merge the pending events and associated score brackets of \p Other into
1381 /// this brackets status.
1382 ///
1383 /// Returns whether the merge resulted in a change that requires tighter waits
1384 /// (i.e. the merged brackets strictly dominate the original brackets).
1385 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1386  bool StrictDom = false;
1387 
1388  VgprUB = std::max(VgprUB, Other.VgprUB);
1389  SgprUB = std::max(SgprUB, Other.SgprUB);
1390 
1391  for (auto T : inst_counter_types()) {
1392  // Merge event flags for this counter
1393  const bool OldOutOfOrder = counterOutOfOrder(T);
1394  const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
1395  const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1396  if (OtherEvents & ~OldEvents)
1397  StrictDom = true;
1398  PendingEvents |= OtherEvents;
1399 
1400  // Merge scores for this counter
1401  const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
1402  const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1403  const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
1404  if (NewUB < ScoreLBs[T])
1405  report_fatal_error("waitcnt score overflow");
1406 
1407  MergeInfo M;
1408  M.OldLB = ScoreLBs[T];
1409  M.OtherLB = Other.ScoreLBs[T];
1410  M.MyShift = NewUB - ScoreUBs[T];
1411  M.OtherShift = NewUB - Other.ScoreUBs[T];
1412 
1413  ScoreUBs[T] = NewUB;
1414 
1415  StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1416 
1417  bool RegStrictDom = false;
1418  for (int J = 0; J <= VgprUB; J++) {
1419  RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1420  }
1421 
1422  if (T == VM_CNT) {
1423  for (int J = 0; J <= VgprUB; J++) {
1424  unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
1425  RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
1426  VgprVmemTypes[J] = NewVmemTypes;
1427  }
1428  }
1429 
1430  if (T == LGKM_CNT) {
1431  for (int J = 0; J <= SgprUB; J++) {
1432  RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1433  }
1434  }
1435 
1436  if (RegStrictDom && !OldOutOfOrder)
1437  StrictDom = true;
1438  }
1439 
1440  return StrictDom;
1441 }
1442 
1443 // Generate s_waitcnt instructions where needed.
1444 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1445  MachineBasicBlock &Block,
1446  WaitcntBrackets &ScoreBrackets) {
1447  bool Modified = false;
1448 
1449  LLVM_DEBUG({
1450  dbgs() << "*** Block" << Block.getNumber() << " ***";
1451  ScoreBrackets.dump();
1452  });
1453 
1454  // Track the correctness of vccz through this basic block. There are two
1455  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
1456  // ST->partialVCCWritesUpdateVCCZ().
1457  bool VCCZCorrect = true;
1458  if (ST->hasReadVCCZBug()) {
1459  // vccz could be incorrect at a basic block boundary if a predecessor wrote
1460  // to vcc and then issued an smem load.
1461  VCCZCorrect = false;
1462  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
1463  // vccz could be incorrect at a basic block boundary if a predecessor wrote
1464  // to vcc_lo or vcc_hi.
1465  VCCZCorrect = false;
1466  }
1467 
1468  // Walk over the instructions.
1469  MachineInstr *OldWaitcntInstr = nullptr;
1470 
1471  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
1472  E = Block.instr_end();
1473  Iter != E;) {
1474  MachineInstr &Inst = *Iter;
1475 
1476  // Track pre-existing waitcnts that were added in earlier iterations or by
1477  // the memory legalizer.
1478  if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1479  (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1480  Inst.getOperand(0).isReg() &&
1481  Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
1482  if (!OldWaitcntInstr)
1483  OldWaitcntInstr = &Inst;
1484  ++Iter;
1485  continue;
1486  }
1487 
1488  // Generate an s_waitcnt instruction to be placed before Inst, if needed.
1489  Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1490  OldWaitcntInstr = nullptr;
1491 
1492  // Restore vccz if it's not known to be correct already.
1493  bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
1494 
1495  // Don't examine operands unless we need to track vccz correctness.
1496  if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
1497  if (Inst.definesRegister(AMDGPU::VCC_LO) ||
1498  Inst.definesRegister(AMDGPU::VCC_HI)) {
1499  // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
1500  if (!ST->partialVCCWritesUpdateVCCZ())
1501  VCCZCorrect = false;
1502  } else if (Inst.definesRegister(AMDGPU::VCC)) {
1503  // There is a hardware bug on CI/SI where SMRD instruction may corrupt
1504  // vccz bit, so when we detect that an instruction may read from a
1505  // corrupt vccz bit, we need to:
1506  // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
1507  // operations to complete.
1508  // 2. Restore the correct value of vccz by writing the current value
1509  // of vcc back to vcc.
1510  if (ST->hasReadVCCZBug() &&
1511  ScoreBrackets.getScoreLB(LGKM_CNT) <
1512  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1513  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1514  // Writes to vcc while there's an outstanding smem read may get
1515  // clobbered as soon as any read completes.
1516  VCCZCorrect = false;
1517  } else {
1518  // Writes to vcc will fix any incorrect value in vccz.
1519  VCCZCorrect = true;
1520  }
1521  }
1522  }
1523 
1524  if (TII->isSMRD(Inst)) {
1525  for (const MachineMemOperand *Memop : Inst.memoperands()) {
1526  const Value *Ptr = Memop->getValue();
1527  SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
1528  }
1529  if (ST->hasReadVCCZBug()) {
1530  // This smem read could complete and clobber vccz at any time.
1531  VCCZCorrect = false;
1532  }
1533  }
1534 
1535  updateEventWaitcntAfter(Inst, &ScoreBrackets);
1536 
1537 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1538  // If this instruction generates a S_SETVSKIP because it is an
1539  // indexed resource, and we are on Tahiti, then it will also force
1540  // an S_WAITCNT vmcnt(0)
1541  if (RequireCheckResourceType(Inst, context)) {
1542  // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1543  ScoreBrackets->setScoreLB(VM_CNT,
1544  ScoreBrackets->getScoreUB(VM_CNT));
1545  }
1546 #endif
1547 
1548  LLVM_DEBUG({
1549  Inst.print(dbgs());
1550  ScoreBrackets.dump();
1551  });
1552 
1553  // TODO: Remove this work-around after fixing the scheduler and enable the
1554  // assert above.
1555  if (RestoreVCCZ) {
1556  // Restore the vccz bit. Any time a value is written to vcc, the vcc
1557  // bit is updated, so we can restore the bit by reading the value of
1558  // vcc and then writing it back to the register.
1559  BuildMI(Block, Inst, Inst.getDebugLoc(),
1560  TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1561  TRI->getVCC())
1562  .addReg(TRI->getVCC());
1563  VCCZCorrect = true;
1564  Modified = true;
1565  }
1566 
1567  ++Iter;
1568  }
1569 
1570  return Modified;
1571 }
1572 
1573 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1574  ST = &MF.getSubtarget<GCNSubtarget>();
1575  TII = ST->getInstrInfo();
1576  TRI = &TII->getRegisterInfo();
1577  MRI = &MF.getRegInfo();
1578  IV = AMDGPU::getIsaVersion(ST->getCPU());
1579  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1580  PDT = &getAnalysis<MachinePostDominatorTree>();
1581 
1582  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1583  for (auto T : inst_counter_types())
1584  ForceEmitWaitcnt[T] = false;
1585 
1586  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1587  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1588  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1589  HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
1590 
1591  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
1592  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
1593  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1594  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1595 
1596  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1597  RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1;
1598  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1599  RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1;
1600 
1601  TrackedWaitcntSet.clear();
1602  BlockInfos.clear();
1603  bool Modified = false;
1604 
1605  if (!MFI->isEntryFunction()) {
1606  // Wait for any outstanding memory operations that the input registers may
1607  // depend on. We can't track them and it's better to do the wait after the
1608  // costly call sequence.
1609 
1610  // TODO: Could insert earlier and schedule more liberally with operations
1611  // that only use caller preserved registers.
1612  MachineBasicBlock &EntryBB = MF.front();
1613  MachineBasicBlock::iterator I = EntryBB.begin();
1614  for (MachineBasicBlock::iterator E = EntryBB.end();
1615  I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
1616  ;
1617  BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
1618  if (ST->hasVscnt())
1619  BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
1620  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1621  .addImm(0);
1622 
1623  Modified = true;
1624  }
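 // In effect this prepends roughly
 //   s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)   ; the zero immediate built above
 //   s_waitcnt_vscnt null, 0x0                 ; only if the target has vscnt
 // to the start of every non-entry (callable) function, after skipping any
 // PHIs and meta instructions.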
1625 
1626  // Keep iterating over the blocks in reverse post order, inserting and
1627  // updating s_waitcnt where needed, until a fixed point is reached.
1628  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
1629  BlockInfos.insert({MBB, BlockInfo(MBB)});
1630 
1631  std::unique_ptr<WaitcntBrackets> Brackets;
1632  bool Repeat;
1633  do {
1634  Repeat = false;
1635 
1636  for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
1637  ++BII) {
1638  BlockInfo &BI = BII->second;
1639  if (!BI.Dirty)
1640  continue;
1641 
1642  if (BI.Incoming) {
1643  if (!Brackets)
1644  Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
1645  else
1646  *Brackets = *BI.Incoming;
1647  } else {
1648  if (!Brackets)
1649  Brackets = std::make_unique<WaitcntBrackets>(ST);
1650  else
1651  *Brackets = WaitcntBrackets(ST);
1652  }
1653 
1654  Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1655  BI.Dirty = false;
1656 
1657  if (Brackets->hasPending()) {
1658  BlockInfo *MoveBracketsToSucc = nullptr;
1659  for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1660  auto SuccBII = BlockInfos.find(Succ);
1661  BlockInfo &SuccBI = SuccBII->second;
1662  if (!SuccBI.Incoming) {
1663  SuccBI.Dirty = true;
1664  if (SuccBII <= BII)
1665  Repeat = true;
1666  if (!MoveBracketsToSucc) {
1667  MoveBracketsToSucc = &SuccBI;
1668  } else {
1669  SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
1670  }
1671  } else if (SuccBI.Incoming->merge(*Brackets)) {
1672  SuccBI.Dirty = true;
1673  if (SuccBII <= BII)
1674  Repeat = true;
1675  }
1676  }
1677  if (MoveBracketsToSucc)
1678  MoveBracketsToSucc->Incoming = std::move(Brackets);
1679  }
1680  }
1681  } while (Repeat);
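 // The loop above is a forward dataflow fixed-point iteration: blocks are
 // visited in the reverse post order recorded in BlockInfos, each block's
 // outgoing bracket state is merged into its successors' Incoming state, and a
 // successor that sits at or before the current block (a back edge) requests
 // another sweep via Repeat.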
1682 
1683  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1684 
1685  bool HaveScalarStores = false;
1686 
1687  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1688  ++BI) {
1689  MachineBasicBlock &MBB = *BI;
1690 
1691  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1692  ++I) {
1693  if (!HaveScalarStores && TII->isScalarStore(*I))
1694  HaveScalarStores = true;
1695 
1696  if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1697  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1698  EndPgmBlocks.push_back(&MBB);
1699  }
1700  }
1701 
1702  if (HaveScalarStores) {
1703  // If scalar writes are used, the cache must be flushed or else the next
1704  // wave to reuse the same scratch memory can be clobbered.
1705  //
1706  // Insert s_dcache_wb at wave termination points if there were any scalar
1707  // stores, and only if the cache hasn't already been flushed. This could be
1708  // improved by looking across blocks for flushes in postdominating blocks
1709  // from the stores but an explicitly requested flush is probably very rare.
1710  for (MachineBasicBlock *MBB : EndPgmBlocks) {
1711  bool SeenDCacheWB = false;
1712 
1713  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1714  ++I) {
1715  if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1716  SeenDCacheWB = true;
1717  else if (TII->isScalarStore(*I))
1718  SeenDCacheWB = false;
1719 
1720  // FIXME: It would be better to insert this before a waitcnt if any.
1721  if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1722  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1723  !SeenDCacheWB) {
1724  Modified = true;
1725  BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1726  }
1727  }
1728  }
1729  }
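 // Illustratively, a wave that performed scalar stores therefore terminates
 // with a sequence along the lines of
 //   s_dcache_wb
 //   s_endpgm
 // unless a flush already appears between the last scalar store and the end of
 // the program.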
1730 
1731  return Modified;
1732 }