1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
15 ///
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
23 //
24 //===----------------------------------------------------------------------===//
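//
// Illustrative example: vmcnt counts outstanding vector-memory operations, so
// a use of an earlier load's result only needs a partial wait. For instance:
//
//   buffer_load_dword v0, ...   ; 1 outstanding VMEM op
//   buffer_load_dword v1, ...   ; 2 outstanding VMEM ops
//   s_waitcnt vmcnt(1)          ; at most 1 still outstanding => v0 is ready
//   v_add_f32 v2, v0, v0        ; safe to read v0
//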
25 
26 #include "AMDGPU.h"
27 #include "GCNSubtarget.h"
28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/MapVector.h"
32 #include "llvm/ADT/PostOrderIterator.h"
33 #include "llvm/CodeGen/MachinePostDominators.h"
34 #include "llvm/InitializePasses.h"
35 #include "llvm/Support/DebugCounter.h"
36 #include "llvm/Support/TargetParser.h"
37 using namespace llvm;
38 
39 #define DEBUG_TYPE "si-insert-waitcnts"
40 
41 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
42  "Force emit s_waitcnt expcnt(0) instrs");
43 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
44  "Force emit s_waitcnt lgkmcnt(0) instrs");
45 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
46  "Force emit s_waitcnt vmcnt(0) instrs");
47 
48 static cl::opt<bool> ForceEmitZeroFlag(
49  "amdgpu-waitcnt-forcezero",
50  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
51  cl::init(false), cl::Hidden);
52 
53 namespace {
54 
55 template <typename EnumT>
56 class enum_iterator
57  : public iterator_facade_base<enum_iterator<EnumT>,
58  std::forward_iterator_tag, const EnumT> {
59  EnumT Value;
60 public:
61  enum_iterator() = default;
62  enum_iterator(EnumT Value) : Value(Value) {}
63 
64  enum_iterator &operator++() {
65  Value = static_cast<EnumT>(Value + 1);
66  return *this;
67  }
68 
69  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
70 
71  EnumT operator*() const { return Value; }
72 };
73 
74 // Class of object that encapsulates latest instruction counter score
75 // associated with the operand. Used for determining whether
76 // an s_waitcnt instruction needs to be emitted.
77 
78 #define CNT_MASK(t) (1u << (t))
79 
80 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
81 
82 iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
83  return make_range(enum_iterator<InstCounterType>(VM_CNT),
84  enum_iterator<InstCounterType>(NUM_INST_CNTS));
85 }
86 
87 using RegInterval = std::pair<int, int>;
88 
89 struct {
90  unsigned VmcntMax;
91  unsigned ExpcntMax;
92  unsigned LgkmcntMax;
93  unsigned VscntMax;
94 } HardwareLimits;
95 
96 struct {
97  unsigned VGPR0;
98  unsigned VGPRL;
99  unsigned SGPR0;
100  unsigned SGPRL;
101 } RegisterEncoding;
102 
103 enum WaitEventType {
104  VMEM_ACCESS, // vector-memory read & write
105  VMEM_READ_ACCESS, // vector-memory read
106  VMEM_WRITE_ACCESS,// vector-memory write
107  LDS_ACCESS, // lds read & write
108  GDS_ACCESS, // gds read & write
109  SQ_MESSAGE, // send message
110  SMEM_ACCESS, // scalar-memory read & write
111  EXP_GPR_LOCK, // export holding on its data src
112  GDS_GPR_LOCK, // GDS holding on its data and addr src
113  EXP_POS_ACCESS, // write to export position
114  EXP_PARAM_ACCESS, // write to export parameter
115  VMW_GPR_LOCK, // vector-memory write holding on its data src
116  NUM_WAIT_EVENTS,
117 };
118 
119 static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
120  (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
121  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
122  (1 << SQ_MESSAGE),
123  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
124  (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
125  (1 << VMEM_WRITE_ACCESS)
126 };
127 
128 // The mapping is:
129 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
130 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
131 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
132 // We reserve a fixed number of VGPR slots in the scoring tables for
133 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
134 enum RegisterMapping {
135  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
136  AGPR_OFFSET = 226, // Maximum programmable ArchVGPRs across all targets.
137  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
138  NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
139  EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
140  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
141 };
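// For illustration: with the values above, NUM_ALL_VGPRS is 513, so VGPR slots
// occupy indices 0..511, the reserved DS slot (SQ_MAX_PGM_VGPRS +
// EXTRA_VGPR_LDS) is index 512, and the n-th SGPR is tracked at index
// NUM_ALL_VGPRS + n = 513 + n in the LGKM scoreboard.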
142 
143 // Enumerate different types of result-returning VMEM operations. Although
144 // s_waitcnt orders them all with a single vmcnt counter, in the absence of
145 // s_waitcnt only instructions of the same VmemType are guaranteed to write
146 // their results in order -- so there is no need to insert an s_waitcnt between
147 // two instructions of the same type that write the same vgpr.
148 enum VmemType {
149  // BUF instructions and MIMG instructions without a sampler.
150  VMEM_NOSAMPLER,
151  // MIMG instructions with a sampler.
152  VMEM_SAMPLER,
153 };
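// For example, two buffer loads writing the same vgpr are both VMEM_NOSAMPLER
// and complete in order, so no s_waitcnt is needed between them; a buffer load
// followed by a sampling MIMG write to the same vgpr mixes VMEM_NOSAMPLER with
// VMEM_SAMPLER and does require a wait.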
154 
155 VmemType getVmemType(const MachineInstr &Inst) {
156  assert(SIInstrInfo::isVMEM(Inst));
157  if (!SIInstrInfo::isMIMG(Inst))
158  return VMEM_NOSAMPLER;
159  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
160  return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler
161  ? VMEM_SAMPLER
162  : VMEM_NOSAMPLER;
163 }
164 
165 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
166  switch (T) {
167  case VM_CNT:
168  Wait.VmCnt = std::min(Wait.VmCnt, Count);
169  break;
170  case EXP_CNT:
171  Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
172  break;
173  case LGKM_CNT:
174  Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
175  break;
176  case VS_CNT:
177  Wait.VsCnt = std::min(Wait.VsCnt, Count);
178  break;
179  default:
180  llvm_unreachable("bad InstCounterType");
181  }
182 }
183 
184 // This object maintains the current score brackets of each wait counter, and
185 // a per-register scoreboard for each wait counter.
186 //
187 // We also maintain the latest score for every event type that can change the
188 // waitcnt in order to know if there are multiple types of events within
189 // the brackets. When multiple types of event happen in the bracket,
190 // the wait count may get decremented out of order, therefore we need to put in
191 // "s_waitcnt 0" before use.
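//
// Worked example: if three VMEM loads have been issued, the VM_CNT bracket is
// (LB=0, UB=3) and each destination vgpr remembers the score of the load that
// last wrote it. A later read of a vgpr with score 1 needs at most
// s_waitcnt vmcnt(UB - score) = vmcnt(2): the two younger loads may still be
// outstanding, but the load that produced the value has completed.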
192 class WaitcntBrackets {
193 public:
194  WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {}
195 
196  static unsigned getWaitCountMax(InstCounterType T) {
197  switch (T) {
198  case VM_CNT:
199  return HardwareLimits.VmcntMax;
200  case LGKM_CNT:
201  return HardwareLimits.LgkmcntMax;
202  case EXP_CNT:
203  return HardwareLimits.ExpcntMax;
204  case VS_CNT:
205  return HardwareLimits.VscntMax;
206  default:
207  break;
208  }
209  return 0;
210  }
211 
212  unsigned getScoreLB(InstCounterType T) const {
213  assert(T < NUM_INST_CNTS);
214  return ScoreLBs[T];
215  }
216 
217  unsigned getScoreUB(InstCounterType T) const {
218  assert(T < NUM_INST_CNTS);
219  return ScoreUBs[T];
220  }
221 
222  // Mapping from event to counter.
223  InstCounterType eventCounter(WaitEventType E) {
224  if (WaitEventMaskForInst[VM_CNT] & (1 << E))
225  return VM_CNT;
226  if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
227  return LGKM_CNT;
228  if (WaitEventMaskForInst[VS_CNT] & (1 << E))
229  return VS_CNT;
230  assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
231  return EXP_CNT;
232  }
233 
234  unsigned getRegScore(int GprNo, InstCounterType T) {
235  if (GprNo < NUM_ALL_VGPRS) {
236  return VgprScores[T][GprNo];
237  }
238  assert(T == LGKM_CNT);
239  return SgprScores[GprNo - NUM_ALL_VGPRS];
240  }
241 
242  bool merge(const WaitcntBrackets &Other);
243 
244  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
245  const MachineRegisterInfo *MRI,
246  const SIRegisterInfo *TRI, unsigned OpNo) const;
247 
248  bool counterOutOfOrder(InstCounterType T) const;
249  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
250  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
251  void determineWait(InstCounterType T, unsigned ScoreToWait,
252  AMDGPU::Waitcnt &Wait) const;
253  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
254  void applyWaitcnt(InstCounterType T, unsigned Count);
255  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
256  const MachineRegisterInfo *MRI, WaitEventType E,
257  MachineInstr &MI);
258 
259  bool hasPending() const { return PendingEvents != 0; }
260  bool hasPendingEvent(WaitEventType E) const {
261  return PendingEvents & (1 << E);
262  }
263 
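  // Note: the expression below clears the lowest set bit, so it is non-zero
  // exactly when at least two different event types are pending for counter T.
  // E.g. Events = 0b0110 gives 0b0110 & 0b0101 = 0b0100 != 0.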
264  bool hasMixedPendingEvents(InstCounterType T) const {
265  unsigned Events = PendingEvents & WaitEventMaskForInst[T];
266  // Return true if more than one bit is set in Events.
267  return Events & (Events - 1);
268  }
269 
270  bool hasPendingFlat() const {
271  return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
272  LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
273  (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
274  LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
275  }
276 
277  void setPendingFlat() {
278  LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
279  LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
280  }
281 
282  // Return true if there might be pending writes to the specified vgpr by VMEM
283  // instructions with types different from V.
284  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
285  assert(GprNo < NUM_ALL_VGPRS);
286  return VgprVmemTypes[GprNo] & ~(1 << V);
287  }
288 
289  void clearVgprVmemTypes(int GprNo) {
290  assert(GprNo < NUM_ALL_VGPRS);
291  VgprVmemTypes[GprNo] = 0;
292  }
293 
294  void print(raw_ostream &);
295  void dump() { print(dbgs()); }
296 
297 private:
298  struct MergeInfo {
299  unsigned OldLB;
300  unsigned OtherLB;
301  unsigned MyShift;
302  unsigned OtherShift;
303  };
304  static bool mergeScore(const MergeInfo &M, unsigned &Score,
305  unsigned OtherScore);
306 
307  void setScoreLB(InstCounterType T, unsigned Val) {
308  assert(T < NUM_INST_CNTS);
309  ScoreLBs[T] = Val;
310  }
311 
312  void setScoreUB(InstCounterType T, unsigned Val) {
313  assert(T < NUM_INST_CNTS);
314  ScoreUBs[T] = Val;
315  if (T == EXP_CNT) {
316  unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
317  if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
318  ScoreLBs[T] = UB;
319  }
320  }
321 
322  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
323  if (GprNo < NUM_ALL_VGPRS) {
324  VgprUB = std::max(VgprUB, GprNo);
325  VgprScores[T][GprNo] = Val;
326  } else {
327  assert(T == LGKM_CNT);
328  SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
329  SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
330  }
331  }
332 
333  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
334  const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
335  unsigned OpNo, unsigned Val);
336 
337  const GCNSubtarget *ST = nullptr;
338  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
339  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
340  unsigned PendingEvents = 0;
341  // Remember the last flat memory operation.
342  unsigned LastFlat[NUM_INST_CNTS] = {0};
343  // wait_cnt scores for every vgpr.
344  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
345  int VgprUB = -1;
346  int SgprUB = -1;
347  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
348  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
349  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
350  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
351  // write to each vgpr.
352  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
353 };
354 
355 class SIInsertWaitcnts : public MachineFunctionPass {
356 private:
357  const GCNSubtarget *ST = nullptr;
358  const SIInstrInfo *TII = nullptr;
359  const SIRegisterInfo *TRI = nullptr;
360  const MachineRegisterInfo *MRI = nullptr;
361  AMDGPU::IsaVersion IV;
362 
363  DenseSet<MachineInstr *> TrackedWaitcntSet;
364  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
365  MachinePostDominatorTree *PDT;
366 
367  struct BlockInfo {
368  MachineBasicBlock *MBB;
369  std::unique_ptr<WaitcntBrackets> Incoming;
370  bool Dirty = true;
371 
372  explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
373  };
374 
375  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
376 
377  // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
378  // because of amdgpu-waitcnt-forcezero flag
379  bool ForceEmitZeroWaitcnts;
380  bool ForceEmitWaitcnt[NUM_INST_CNTS];
381 
382 public:
383  static char ID;
384 
385  SIInsertWaitcnts() : MachineFunctionPass(ID) {
386  (void)ForceExpCounter;
387  (void)ForceLgkmCounter;
388  (void)ForceVMCounter;
389  }
390 
391  bool runOnMachineFunction(MachineFunction &MF) override;
392 
393  StringRef getPassName() const override {
394  return "SI insert wait instructions";
395  }
396 
397  void getAnalysisUsage(AnalysisUsage &AU) const override {
398  AU.setPreservesCFG();
399  AU.addRequired<MachinePostDominatorTree>();
400  MachineFunctionPass::getAnalysisUsage(AU);
401  }
402 
403  bool isForceEmitWaitcnt() const {
404  for (auto T : inst_counter_types())
405  if (ForceEmitWaitcnt[T])
406  return true;
407  return false;
408  }
409 
410  void setForceEmitWaitcnt() {
411 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
412 // For debug builds, get the debug counter info and adjust if need be
413 #ifndef NDEBUG
414  if (DebugCounter::isCounterSet(ForceExpCounter) &&
415  DebugCounter::shouldExecute(ForceExpCounter)) {
416  ForceEmitWaitcnt[EXP_CNT] = true;
417  } else {
418  ForceEmitWaitcnt[EXP_CNT] = false;
419  }
420 
421  if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
422  DebugCounter::shouldExecute(ForceLgkmCounter)) {
423  ForceEmitWaitcnt[LGKM_CNT] = true;
424  } else {
425  ForceEmitWaitcnt[LGKM_CNT] = false;
426  }
427 
428  if (DebugCounter::isCounterSet(ForceVMCounter) &&
429  DebugCounter::shouldExecute(ForceVMCounter)) {
430  ForceEmitWaitcnt[VM_CNT] = true;
431  } else {
432  ForceEmitWaitcnt[VM_CNT] = false;
433  }
434 #endif // NDEBUG
435  }
436 
437  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
438  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
439  bool generateWaitcntInstBefore(MachineInstr &MI,
440  WaitcntBrackets &ScoreBrackets,
441  MachineInstr *OldWaitcntInstr);
442  void updateEventWaitcntAfter(MachineInstr &Inst,
443  WaitcntBrackets *ScoreBrackets);
444  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
445  WaitcntBrackets &ScoreBrackets);
446  bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
447  MachineInstr &OldWaitcntInstr,
448  AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
449 };
450 
451 } // end anonymous namespace
452 
453 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
454  const SIInstrInfo *TII,
455  const MachineRegisterInfo *MRI,
456  const SIRegisterInfo *TRI,
457  unsigned OpNo) const {
458  const MachineOperand &Op = MI->getOperand(OpNo);
459  if (!TRI->isInAllocatableClass(Op.getReg()))
460  return {-1, -1};
461 
462  // A use via a PW operand does not need a waitcnt.
463  // A partial write is not a WAW.
464  assert(!Op.getSubReg() || !Op.isUndef());
465 
466  RegInterval Result;
467 
468  unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
469 
470  if (TRI->isVectorRegister(*MRI, Op.getReg())) {
471  assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
472  Result.first = Reg - RegisterEncoding.VGPR0;
473  if (TRI->isAGPR(*MRI, Op.getReg()))
474  Result.first += AGPR_OFFSET;
475  assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
476  } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
477  assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
478  Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
479  assert(Result.first >= NUM_ALL_VGPRS &&
480  Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
481  }
482  // TODO: Handle TTMP
483  // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
484  else
485  return {-1, -1};
486 
487  const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
488  unsigned Size = TRI->getRegSizeInBits(*RC);
489  Result.second = Result.first + ((Size + 16) / 32);
490 
491  return Result;
492 }
493 
494 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
495  const SIInstrInfo *TII,
496  const SIRegisterInfo *TRI,
497  const MachineRegisterInfo *MRI, unsigned OpNo,
498  unsigned Val) {
499  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
500  assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
501  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
502  setRegScore(RegNo, EXP_CNT, Val);
503  }
504 }
505 
506 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
507  const SIRegisterInfo *TRI,
508  const MachineRegisterInfo *MRI,
509  WaitEventType E, MachineInstr &Inst) {
510  InstCounterType T = eventCounter(E);
511  unsigned CurrScore = getScoreUB(T) + 1;
512  if (CurrScore == 0)
513  report_fatal_error("InsertWaitcnt score wraparound");
514  // PendingEvents and ScoreUB need to be updated regardless of whether this
515  // event changes the score of a register or not.
516  // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
517  PendingEvents |= 1 << E;
518  setScoreUB(T, CurrScore);
519 
520  if (T == EXP_CNT) {
521  // Put score on the source vgprs. If this is a store, just use those
522  // specific register(s).
523  if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
524  int AddrOpIdx =
525  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
526  // All GDS operations must protect their address register (same as
527  // export.)
528  if (AddrOpIdx != -1) {
529  setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
530  }
531 
532  if (Inst.mayStore()) {
533  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
534  AMDGPU::OpName::data0) != -1) {
535  setExpScore(
536  &Inst, TII, TRI, MRI,
537  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
538  CurrScore);
539  }
540  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
541  AMDGPU::OpName::data1) != -1) {
542  setExpScore(&Inst, TII, TRI, MRI,
543  AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
544  AMDGPU::OpName::data1),
545  CurrScore);
546  }
547  } else if (SIInstrInfo::isAtomicRet(Inst) &&
548  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
549  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
550  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
551  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
552  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
553  Inst.getOpcode() != AMDGPU::DS_APPEND &&
554  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
555  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
556  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
557  const MachineOperand &Op = Inst.getOperand(I);
558  if (Op.isReg() && !Op.isDef() &&
559  TRI->isVectorRegister(*MRI, Op.getReg())) {
560  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
561  }
562  }
563  }
564  } else if (TII->isFLAT(Inst)) {
565  if (Inst.mayStore()) {
566  setExpScore(
567  &Inst, TII, TRI, MRI,
568  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
569  CurrScore);
570  } else if (SIInstrInfo::isAtomicRet(Inst)) {
571  setExpScore(
572  &Inst, TII, TRI, MRI,
573  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
574  CurrScore);
575  }
576  } else if (TII->isMIMG(Inst)) {
577  if (Inst.mayStore()) {
578  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
579  } else if (SIInstrInfo::isAtomicRet(Inst)) {
580  setExpScore(
581  &Inst, TII, TRI, MRI,
582  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
583  CurrScore);
584  }
585  } else if (TII->isMTBUF(Inst)) {
586  if (Inst.mayStore()) {
587  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
588  }
589  } else if (TII->isMUBUF(Inst)) {
590  if (Inst.mayStore()) {
591  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
592  } else if (SIInstrInfo::isAtomicRet(Inst)) {
593  setExpScore(
594  &Inst, TII, TRI, MRI,
595  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
596  CurrScore);
597  }
598  } else {
599  if (TII->isEXP(Inst)) {
600  // For export the destination registers are really temps that
601  // can be used as the actual source after export patching, so
602  // we need to treat them like sources and set the EXP_CNT
603  // score.
604  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
605  MachineOperand &DefMO = Inst.getOperand(I);
606  if (DefMO.isReg() && DefMO.isDef() &&
607  TRI->isVGPR(*MRI, DefMO.getReg())) {
608  setRegScore(
609  TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
610  EXP_CNT, CurrScore);
611  }
612  }
613  }
614  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
615  MachineOperand &MO = Inst.getOperand(I);
616  if (MO.isReg() && !MO.isDef() &&
617  TRI->isVectorRegister(*MRI, MO.getReg())) {
618  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
619  }
620  }
621  }
622 #if 0 // TODO: check if this is handled by MUBUF code above.
623  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
624  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
625  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
626  MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
627  unsigned OpNo;//TODO: find the OpNo for this operand;
628  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
629  for (int RegNo = Interval.first; RegNo < Interval.second;
630  ++RegNo) {
631  setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
632  }
633 #endif
634  } else {
635  // Match the score to the destination registers.
636  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
637  auto &Op = Inst.getOperand(I);
638  if (!Op.isReg() || !Op.isDef())
639  continue;
640  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
641  if (T == VM_CNT) {
642  if (Interval.first >= NUM_ALL_VGPRS)
643  continue;
644  if (SIInstrInfo::isVMEM(Inst)) {
645  VmemType V = getVmemType(Inst);
646  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
647  VgprVmemTypes[RegNo] |= 1 << V;
648  }
649  }
650  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
651  setRegScore(RegNo, T, CurrScore);
652  }
653  }
654  if (TII->isDS(Inst) && Inst.mayStore()) {
655  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
656  }
657  }
658 }
659 
660 void WaitcntBrackets::print(raw_ostream &OS) {
661  OS << '\n';
662  for (auto T : inst_counter_types()) {
663  unsigned LB = getScoreLB(T);
664  unsigned UB = getScoreUB(T);
665 
666  switch (T) {
667  case VM_CNT:
668  OS << " VM_CNT(" << UB - LB << "): ";
669  break;
670  case LGKM_CNT:
671  OS << " LGKM_CNT(" << UB - LB << "): ";
672  break;
673  case EXP_CNT:
674  OS << " EXP_CNT(" << UB - LB << "): ";
675  break;
676  case VS_CNT:
677  OS << " VS_CNT(" << UB - LB << "): ";
678  break;
679  default:
680  OS << " UNKNOWN(" << UB - LB << "): ";
681  break;
682  }
683 
684  if (LB < UB) {
685  // Print vgpr scores.
686  for (int J = 0; J <= VgprUB; J++) {
687  unsigned RegScore = getRegScore(J, T);
688  if (RegScore <= LB)
689  continue;
690  unsigned RelScore = RegScore - LB - 1;
691  if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
692  OS << RelScore << ":v" << J << " ";
693  } else {
694  OS << RelScore << ":ds ";
695  }
696  }
697  // Also need to print sgpr scores for lgkm_cnt.
698  if (T == LGKM_CNT) {
699  for (int J = 0; J <= SgprUB; J++) {
700  unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
701  if (RegScore <= LB)
702  continue;
703  unsigned RelScore = RegScore - LB - 1;
704  OS << RelScore << ":s" << J << " ";
705  }
706  }
707  }
708  OS << '\n';
709  }
710  OS << '\n';
711 }
712 
713 /// Simplify the waitcnt, in the sense of removing redundant counts: any count
714 /// that is already guaranteed by the score brackets is reset to ~0u (no wait).
715 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
716  simplifyWaitcnt(VM_CNT, Wait.VmCnt);
717  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
718  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
719  simplifyWaitcnt(VS_CNT, Wait.VsCnt);
720 }
721 
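// Worked example: if a counter has LB=2 and UB=5, there are three outstanding
// events, so a requested count of 3 (or more) is already guaranteed and is
// dropped by resetting it to ~0u below.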
722 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
723  unsigned &Count) const {
724  const unsigned LB = getScoreLB(T);
725  const unsigned UB = getScoreUB(T);
726 
727  // The number of outstanding events for this type, T, can be calculated
728  // as (UB - LB). If the current Count is greater than or equal to the number
729  // of outstanding events, then the wait for this counter is redundant.
730  if (Count >= UB - LB)
731  Count = ~0u;
732 }
733 
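// Worked example: with LB=2, UB=5 and a register score of 4, the score lies
// inside the bracket, so a wait is required; in the simple in-order case the
// needed count is UB - score = 1, i.e. "wait until at most one younger
// operation is still outstanding".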
734 void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
735  AMDGPU::Waitcnt &Wait) const {
736  // If the score of src_operand falls within the bracket, we need an
737  // s_waitcnt instruction.
738  const unsigned LB = getScoreLB(T);
739  const unsigned UB = getScoreUB(T);
740  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
741  if ((T == VM_CNT || T == LGKM_CNT) &&
742  hasPendingFlat() &&
743  !ST->hasFlatLgkmVMemCountInOrder()) {
744  // If there is a pending FLAT operation, and this is a VMem or LGKM
745  // waitcnt and the target can report early completion, then we need
746  // to force a waitcnt 0.
747  addWait(Wait, T, 0);
748  } else if (counterOutOfOrder(T)) {
749  // Counter can get decremented out-of-order when there
750  // are multiple types of event in the bracket. Emit an s_waitcnt
751  // with a conservative value of 0 for the counter.
752  addWait(Wait, T, 0);
753  } else {
754  // If a counter has been maxed out avoid overflow by waiting for
755  // MAX(CounterType) - 1 instead.
756  unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
757  addWait(Wait, T, NeededWait);
758  }
759  }
760 }
761 
762 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
763  applyWaitcnt(VM_CNT, Wait.VmCnt);
764  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
765  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
766  applyWaitcnt(VS_CNT, Wait.VsCnt);
767 }
768 
769 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
770  const unsigned UB = getScoreUB(T);
771  if (Count >= UB)
772  return;
773  if (Count != 0) {
774  if (counterOutOfOrder(T))
775  return;
776  setScoreLB(T, std::max(getScoreLB(T), UB - Count));
777  } else {
778  setScoreLB(T, UB);
779  PendingEvents &= ~WaitEventMaskForInst[T];
780  }
781 }
782 
783 // Where there are multiple types of event in the bracket of a counter,
784 // the decrement may go out of order.
785 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
786  // Scalar memory reads can always complete out of order.
787  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
788  return true;
789  return hasMixedPendingEvents(T);
790 }
791 
792 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
793  false)
794 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
795 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
796  false)
797 
798 char SIInsertWaitcnts::ID = 0;
799 
800 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
801 
802 FunctionPass *llvm::createSIInsertWaitcntsPass() {
803  return new SIInsertWaitcnts();
804 }
805 
806 /// Combine consecutive waitcnt instructions that precede \p MI and follow
807 /// \p OldWaitcntInstr and apply any extra waits from waitcnts that were added
808 /// by previous passes. Currently this pass conservatively assumes that these
809 /// preexisting waitcnts are required for correctness.
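/// For example, if the memory legalizer left "s_waitcnt vmcnt(0)" here and this
/// pass computes an additional lgkmcnt(0) requirement, the two are combined
/// into a single "s_waitcnt vmcnt(0) lgkmcnt(0)" and any other consecutive
/// waitcnt of the same kind is erased.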
810 bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
811  MachineInstr &OldWaitcntInstr,
812  AMDGPU::Waitcnt &Wait,
813  const MachineInstr *MI) {
814  bool Modified = false;
815  MachineInstr *WaitcntInstr = nullptr;
816  MachineInstr *WaitcntVsCntInstr = nullptr;
817  for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
818  &*II != MI; II = NextI, ++NextI) {
819  if (II->isMetaInstruction())
820  continue;
821 
822  if (II->getOpcode() == AMDGPU::S_WAITCNT) {
823  // Conservatively update required wait if this waitcnt was added in an
824  // earlier pass. In this case it will not exist in the tracked waitcnt
825  // set.
826  if (!TrackedWaitcntSet.count(&*II)) {
827  unsigned IEnc = II->getOperand(0).getImm();
828  AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
829  Wait = Wait.combined(OldWait);
830  }
831 
832  // Merge consecutive waitcnt of the same type by erasing multiples.
833  if (!WaitcntInstr) {
834  WaitcntInstr = &*II;
835  } else {
836  II->eraseFromParent();
837  Modified = true;
838  }
839 
840  } else {
841  assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
842  assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
843  if (!TrackedWaitcntSet.count(&*II)) {
844  unsigned OldVSCnt =
845  TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
846  Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
847  }
848 
849  if (!WaitcntVsCntInstr) {
850  WaitcntVsCntInstr = &*II;
851  } else {
852  II->eraseFromParent();
853  Modified = true;
854  }
855  }
856  }
857 
858  // Update the encoding of the merged waitcnt with the required wait.
859  if (WaitcntInstr) {
860  if (Wait.hasWaitExceptVsCnt()) {
861  unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
862  unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
863  if (OldEnc != NewEnc) {
864  WaitcntInstr->getOperand(0).setImm(NewEnc);
865  Modified = true;
866  }
867  ScoreBrackets.applyWaitcnt(Wait);
868  Wait.VmCnt = ~0u;
869  Wait.LgkmCnt = ~0u;
870  Wait.ExpCnt = ~0u;
871 
872  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
873  << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr
874  << '\n');
875  } else {
876  WaitcntInstr->eraseFromParent();
877  Modified = true;
878  }
879  }
880 
881  if (WaitcntVsCntInstr) {
882  if (Wait.hasWaitVsCnt()) {
883  assert(ST->hasVscnt());
884  unsigned OldVSCnt =
885  TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
886  ->getImm();
887  if (Wait.VsCnt != OldVSCnt) {
888  TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
889  ->setImm(Wait.VsCnt);
890  Modified = true;
891  }
892  ScoreBrackets.applyWaitcnt(Wait);
893  Wait.VsCnt = ~0u;
894 
895  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
896  << "Old Instr: " << MI
897  << "New Instr: " << *WaitcntVsCntInstr << '\n');
898  } else {
899  WaitcntVsCntInstr->eraseFromParent();
900  Modified = true;
901  }
902  }
903 
904  return Modified;
905 }
906 
907 static bool readsVCCZ(const MachineInstr &MI) {
908  unsigned Opc = MI.getOpcode();
909  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
910  !MI.getOperand(1).isUndef();
911 }
912 
913 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
914 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
915  // Currently all conventions wait, but this may not always be the case.
916  //
917  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
918  // sense to omit the wait and do it in the caller.
919  return true;
920 }
921 
922 /// \returns true if the callee is expected to wait for any outstanding waits
923 /// before returning.
924 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
925  return true;
926 }
927 
928 /// Generate s_waitcnt instruction to be placed before cur_Inst.
929 /// Instructions of a given type are returned in order,
930 /// but instructions of different types can complete out of order.
931 /// We rely on this in-order completion
932 /// and simply assign a score to the memory access instructions.
933 /// We keep track of the active "score bracket" to determine
934 /// if an access of a memory read requires an s_waitcnt
935 /// and if so what the value of each counter is.
936 /// The "score bracket" is bound by the lower bound and upper bound
937 /// scores (*_score_LB and *_score_ub respectively).
938 bool SIInsertWaitcnts::generateWaitcntInstBefore(
939  MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
940  MachineInstr *OldWaitcntInstr) {
941  setForceEmitWaitcnt();
942 
943  if (MI.isMetaInstruction())
944  return false;
945 
946  AMDGPU::Waitcnt Wait;
947  bool Modified = false;
948 
949  // FIXME: This should have already been handled by the memory legalizer.
950  // Removing this currently doesn't affect any lit tests, but we need to
951  // verify that nothing was relying on this. The number of buffer invalidates
952  // being handled here should not be expanded.
953  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
954  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
955  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
956  MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
957  MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
958  Wait.VmCnt = 0;
959  }
960 
961  // All waits must be resolved at call return.
962  // NOTE: this could be improved with knowledge of all call sites or
963  // with knowledge of the called routines.
964  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
965  MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
966  (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
967  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
968  }
969  // Resolve vm waits before gs-done.
970  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
971  MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
972  ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
973  AMDGPU::SendMsg::ID_GS_DONE)) {
974  Wait.VmCnt = 0;
975  }
976 #if 0 // TODO: the following blocks of logic when we have fence.
977  else if (MI.getOpcode() == SC_FENCE) {
978  const unsigned int group_size =
979  context->shader_info->GetMaxThreadGroupSize();
980  // group_size == 0 means thread group size is unknown at compile time
981  const bool group_is_multi_wave =
982  (group_size == 0 || group_size > target_info->GetWaveFrontSize());
983  const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
984 
985  for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
986  SCRegType src_type = Inst->GetSrcType(i);
987  switch (src_type) {
988  case SCMEM_LDS:
989  if (group_is_multi_wave ||
990  context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
991  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
992  ScoreBrackets->getScoreUB(LGKM_CNT));
993  // LDS may have to wait for VM_CNT after buffer load to LDS
994  if (target_info->HasBufferLoadToLDS()) {
995  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
996  ScoreBrackets->getScoreUB(VM_CNT));
997  }
998  }
999  break;
1000 
1001  case SCMEM_GDS:
1002  if (group_is_multi_wave || fence_is_global) {
1003  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1004  ScoreBrackets->getScoreUB(EXP_CNT));
1005  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1006  ScoreBrackets->getScoreUB(LGKM_CNT));
1007  }
1008  break;
1009 
1010  case SCMEM_UAV:
1011  case SCMEM_TFBUF:
1012  case SCMEM_RING:
1013  case SCMEM_SCATTER:
1014  if (group_is_multi_wave || fence_is_global) {
1015  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1016  ScoreBrackets->getScoreUB(EXP_CNT));
1017  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1018  ScoreBrackets->getScoreUB(VM_CNT));
1019  }
1020  break;
1021 
1022  case SCMEM_SCRATCH:
1023  default:
1024  break;
1025  }
1026  }
1027  }
1028 #endif
1029 
1030  // Export & GDS instructions do not read the EXEC mask until after the export
1031  // is granted (which can occur well after the instruction is issued).
1032  // The shader program must flush all EXP operations on the export-count
1033  // before overwriting the EXEC mask.
1034  else {
1035  if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1036  // Export and GDS are tracked individually, either may trigger a waitcnt
1037  // for EXEC.
1038  if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1039  ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1040  ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1041  ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1042  Wait.ExpCnt = 0;
1043  }
1044  }
1045 
1046  if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1047  // The function is going to insert a wait on everything in its prolog.
1048  // This still needs to be careful if the call target is a load (e.g. a GOT
1049  // load). We also need to check WAW dependency with the saved PC.
1050  Wait = AMDGPU::Waitcnt();
1051 
1052  int CallAddrOpIdx =
1053  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1054 
1055  if (MI.getOperand(CallAddrOpIdx).isReg()) {
1056  RegInterval CallAddrOpInterval =
1057  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
1058 
1059  for (int RegNo = CallAddrOpInterval.first;
1060  RegNo < CallAddrOpInterval.second; ++RegNo)
1061  ScoreBrackets.determineWait(
1062  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1063 
1064  int RtnAddrOpIdx =
1065  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1066  if (RtnAddrOpIdx != -1) {
1067  RegInterval RtnAddrOpInterval =
1068  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
1069 
1070  for (int RegNo = RtnAddrOpInterval.first;
1071  RegNo < RtnAddrOpInterval.second; ++RegNo)
1072  ScoreBrackets.determineWait(
1073  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1074  }
1075  }
1076  } else {
1077  // FIXME: Should not be relying on memoperands.
1078  // Look at the source operands of every instruction to see if
1079  // any of them results from a previous memory operation that affects
1080  // its current usage. If so, an s_waitcnt instruction needs to be
1081  // emitted.
1082  // If the source operand was defined by a load, add the s_waitcnt
1083  // instruction.
1084  //
1085  // Two cases are handled for destination operands:
1086  // 1) If the destination operand was defined by a load, add the s_waitcnt
1087  // instruction to guarantee the right WAW order.
1088  // 2) If a destination operand was used by a recent export/store instruction,
1089  // add an s_waitcnt on exp_cnt to guarantee the WAR order.
1090  for (const MachineMemOperand *Memop : MI.memoperands()) {
1091  const Value *Ptr = Memop->getValue();
1092  if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1093  addWait(Wait, LGKM_CNT, 0);
1094  if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1095  SLoadAddresses.erase(Ptr);
1096  }
1097  unsigned AS = Memop->getAddrSpace();
1098  if (AS != AMDGPUAS::LOCAL_ADDRESS)
1099  continue;
1100  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1101  // VM_CNT is only relevant to vgpr or LDS.
1102  ScoreBrackets.determineWait(
1103  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1104  if (Memop->isStore()) {
1105  ScoreBrackets.determineWait(
1106  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1107  }
1108  }
1109 
1110  // Loop over use and def operands.
1111  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1112  MachineOperand &Op = MI.getOperand(I);
1113  if (!Op.isReg())
1114  continue;
1115  RegInterval Interval =
1116  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
1117 
1118  const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1119  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1120  if (IsVGPR) {
1121  // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1122  // previous write and this write are the same type of VMEM
1123  // instruction, in which case they're guaranteed to write their
1124  // results in order anyway.
1125  if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
1126  ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1127  getVmemType(MI))) {
1128  ScoreBrackets.determineWait(
1129  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1130  ScoreBrackets.clearVgprVmemTypes(RegNo);
1131  }
1132  if (Op.isDef()) {
1133  ScoreBrackets.determineWait(
1134  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1135  }
1136  }
1137  ScoreBrackets.determineWait(
1138  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1139  }
1140  }
1141  }
1142  }
1143 
1144  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1145  // occurs before the instruction. Doing it here prevents any additional
1146  // S_WAITCNTs from being emitted if the instruction was marked as
1147  // requiring a WAITCNT beforehand.
1148  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1149  !ST->hasAutoWaitcntBeforeBarrier()) {
1150  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1151  }
1152 
1153  // TODO: Remove this work-around, enable the assert for Bug 457939
1154  // after fixing the scheduler. Also, the Shader Compiler code is
1155  // independent of target.
1156  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1157  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1158  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1159  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1160  Wait.LgkmCnt = 0;
1161  }
1162  }
1163 
1164  // Verify that the wait is actually needed.
1165  ScoreBrackets.simplifyWaitcnt(Wait);
1166 
1167  if (ForceEmitZeroWaitcnts)
1168  Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
1169 
1170  if (ForceEmitWaitcnt[VM_CNT])
1171  Wait.VmCnt = 0;
1172  if (ForceEmitWaitcnt[EXP_CNT])
1173  Wait.ExpCnt = 0;
1174  if (ForceEmitWaitcnt[LGKM_CNT])
1175  Wait.LgkmCnt = 0;
1176  if (ForceEmitWaitcnt[VS_CNT])
1177  Wait.VsCnt = 0;
1178 
1179  if (OldWaitcntInstr) {
1180  // Try to merge the required wait with preexisting waitcnt instructions.
1181  // Also erase redundant waitcnt.
1182  Modified =
1183  applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
1184  } else {
1185  // Update waitcnt brackets after determining the required wait.
1186  ScoreBrackets.applyWaitcnt(Wait);
1187  }
1188 
1189  // Build new waitcnt instructions unless no wait is needed or the old waitcnt
1190  // instruction was modified to handle the required wait.
1191  if (Wait.hasWaitExceptVsCnt()) {
1192  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1193  auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1194  MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1195  .addImm(Enc);
1196  TrackedWaitcntSet.insert(SWaitInst);
1197  Modified = true;
1198 
1199  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1200  << "Old Instr: " << MI
1201  << "New Instr: " << *SWaitInst << '\n');
1202  }
1203 
1204  if (Wait.hasWaitVsCnt()) {
1205  assert(ST->hasVscnt());
1206 
1207  auto SWaitInst =
1208  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1209  TII->get(AMDGPU::S_WAITCNT_VSCNT))
1210  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1211  .addImm(Wait.VsCnt);
1212  TrackedWaitcntSet.insert(SWaitInst);
1213  Modified = true;
1214 
1215  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1216  << "Old Instr: " << MI
1217  << "New Instr: " << *SWaitInst << '\n');
1218  }
1219 
1220  return Modified;
1221 }
1222 
1223 // This is a flat memory operation. Check to see if it has memory tokens other
1224 // than LDS. Other address spaces supported by flat memory operations involve
1225 // global memory.
1226 bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1227  assert(TII->isFLAT(MI));
1228 
1229  // All flat instructions use the VMEM counter.
1230  assert(TII->usesVM_CNT(MI));
1231 
1232  // If there are no memory operands then conservatively assume the flat
1233  // operation may access VMEM.
1234  if (MI.memoperands_empty())
1235  return true;
1236 
1237  // See if any memory operand specifies an address space that involves VMEM.
1238  // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
1239  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1240  // (GDS) address space is not supported by flat operations. Therefore, simply
1241  // return true unless only the LDS address space is found.
1242  for (const MachineMemOperand *Memop : MI.memoperands()) {
1243  unsigned AS = Memop->getAddrSpace();
1244  assert(AS != AMDGPUAS::REGION_ADDRESS);
1245  if (AS != AMDGPUAS::LOCAL_ADDRESS)
1246  return true;
1247  }
1248 
1249  return false;
1250 }
1251 
1252 // This is a flat memory operation. Check to see if it has memory tokens for
1253 // either LDS or FLAT.
1254 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1255  assert(TII->isFLAT(MI));
1256 
1257  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
1258  if (!TII->usesLGKM_CNT(MI))
1259  return false;
1260 
1261  // If in tgsplit mode then there can be no use of LDS.
1262  if (ST->isTgSplitEnabled())
1263  return false;
1264 
1265  // If there are no memory operands then conservatively assume the flat
1266  // operation may access LDS.
1267  if (MI.memoperands_empty())
1268  return true;
1269 
1270  // See if any memory operand specifies an address space that involves LDS.
1271  for (const MachineMemOperand *Memop : MI.memoperands()) {
1272  unsigned AS = Memop->getAddrSpace();
1273  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1274  return true;
1275  }
1276 
1277  return false;
1278 }
1279 
1280 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1281  WaitcntBrackets *ScoreBrackets) {
1282  // Now look at the instruction opcode. If it is a memory access
1283  // instruction, update the upper-bound of the appropriate counter's
1284  // bracket and the destination operand scores.
1285  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1286  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1287  if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1288  TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1289  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1290  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1291  } else {
1292  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1293  }
1294  } else if (TII->isFLAT(Inst)) {
1295  assert(Inst.mayLoadOrStore());
1296 
1297  int FlatASCount = 0;
1298 
1299  if (mayAccessVMEMThroughFlat(Inst)) {
1300  ++FlatASCount;
1301  if (!ST->hasVscnt())
1302  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1303  else if (Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst))
1304  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1305  else
1306  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1307  }
1308 
1309  if (mayAccessLDSThroughFlat(Inst)) {
1310  ++FlatASCount;
1311  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1312  }
1313 
1314  // A Flat memory operation must access at least one address space.
1315  assert(FlatASCount);
1316 
1317  // This is a flat memory operation that accesses both VMEM and LDS, so note
1318  // it - it will require that both the VM and LGKM counters be flushed to zero
1319  // if it is pending when a VM or LGKM dependency occurs.
1320  if (FlatASCount > 1)
1321  ScoreBrackets->setPendingFlat();
1322  } else if (SIInstrInfo::isVMEM(Inst) &&
1323  !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
1324  if (!ST->hasVscnt())
1325  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1326  else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ||
1327  /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
1328  (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
1329  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1330  else if (Inst.mayStore())
1331  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1332 
1333  if (ST->vmemWriteNeedsExpWaitcnt() &&
1334  (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
1335  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1336  }
1337  } else if (TII->isSMRD(Inst)) {
1338  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1339  } else if (Inst.isCall()) {
1340  if (callWaitsOnFunctionReturn(Inst)) {
1341  // Act as a wait on everything
1342  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1343  } else {
1344  // May need to wait for anything.
1345  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1346  }
1347  } else if (SIInstrInfo::isEXP(Inst)) {
1348  unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1349  if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
1350  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1351  else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
1352  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1353  else
1354  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1355  } else {
1356  switch (Inst.getOpcode()) {
1357  case AMDGPU::S_SENDMSG:
1358  case AMDGPU::S_SENDMSGHALT:
1359  ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1360  break;
1361  case AMDGPU::S_MEMTIME:
1362  case AMDGPU::S_MEMREALTIME:
1363  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1364  break;
1365  }
1366  }
1367 }
1368 
1369 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
1370  unsigned OtherScore) {
1371  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1372  unsigned OtherShifted =
1373  OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1374  Score = std::max(MyShifted, OtherShifted);
1375  return OtherShifted > MyShifted;
1376 }
1377 
1378 /// Merge the pending events and associated score brackets of \p Other into
1379 /// this bracket's status.
1380 ///
1381 /// Returns whether the merge resulted in a change that requires tighter waits
1382 /// (i.e. the merged brackets strictly dominate the original brackets).
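/// For example, if this block's bracket is (LB=0, UB=2) and the other's is
/// (LB=0, UB=5), the merged UB is 5 and this block's in-bracket scores are
/// shifted up by MyShift = 5 - 2 = 3, preserving each score's distance from UB
/// before the maximum of the two sides is taken.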
1383 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1384  bool StrictDom = false;
1385 
1386  VgprUB = std::max(VgprUB, Other.VgprUB);
1387  SgprUB = std::max(SgprUB, Other.SgprUB);
1388 
1389  for (auto T : inst_counter_types()) {
1390  // Merge event flags for this counter
1391  const bool OldOutOfOrder = counterOutOfOrder(T);
1392  const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
1393  const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1394  if (OtherEvents & ~OldEvents)
1395  StrictDom = true;
1396  PendingEvents |= OtherEvents;
1397 
1398  // Merge scores for this counter
1399  const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
1400  const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1401  const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
1402  if (NewUB < ScoreLBs[T])
1403  report_fatal_error("waitcnt score overflow");
1404 
1405  MergeInfo M;
1406  M.OldLB = ScoreLBs[T];
1407  M.OtherLB = Other.ScoreLBs[T];
1408  M.MyShift = NewUB - ScoreUBs[T];
1409  M.OtherShift = NewUB - Other.ScoreUBs[T];
1410 
1411  ScoreUBs[T] = NewUB;
1412 
1413  StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1414 
1415  bool RegStrictDom = false;
1416  for (int J = 0; J <= VgprUB; J++) {
1417  RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1418  }
1419 
1420  if (T == VM_CNT) {
1421  for (int J = 0; J <= VgprUB; J++) {
1422  unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
1423  RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
1424  VgprVmemTypes[J] = NewVmemTypes;
1425  }
1426  }
1427 
1428  if (T == LGKM_CNT) {
1429  for (int J = 0; J <= SgprUB; J++) {
1430  RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1431  }
1432  }
1433 
1434  if (RegStrictDom && !OldOutOfOrder)
1435  StrictDom = true;
1436  }
1437 
1438  return StrictDom;
1439 }
1440 
1441 // Generate s_waitcnt instructions where needed.
1442 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1443  MachineBasicBlock &Block,
1444  WaitcntBrackets &ScoreBrackets) {
1445  bool Modified = false;
1446 
1447  LLVM_DEBUG({
1448  dbgs() << "*** Block" << Block.getNumber() << " ***";
1449  ScoreBrackets.dump();
1450  });
1451 
1452  // Track the correctness of vccz through this basic block. There are two
1453  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
1454  // ST->partialVCCWritesUpdateVCCZ().
1455  bool VCCZCorrect = true;
1456  if (ST->hasReadVCCZBug()) {
1457  // vccz could be incorrect at a basic block boundary if a predecessor wrote
1458  // to vcc and then issued an smem load.
1459  VCCZCorrect = false;
1460  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
1461  // vccz could be incorrect at a basic block boundary if a predecessor wrote
1462  // to vcc_lo or vcc_hi.
1463  VCCZCorrect = false;
1464  }
1465 
1466  // Walk over the instructions.
1467  MachineInstr *OldWaitcntInstr = nullptr;
1468 
1469  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
1470  E = Block.instr_end();
1471  Iter != E;) {
1472  MachineInstr &Inst = *Iter;
1473 
1474  // Track pre-existing waitcnts that were added in earlier iterations or by
1475  // the memory legalizer.
1476  if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1477  (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1478  Inst.getOperand(0).isReg() &&
1479  Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
1480  if (!OldWaitcntInstr)
1481  OldWaitcntInstr = &Inst;
1482  ++Iter;
1483  continue;
1484  }
1485 
1486  // Generate an s_waitcnt instruction to be placed before Inst, if needed.
1487  Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1488  OldWaitcntInstr = nullptr;
1489 
1490  // Restore vccz if it's not known to be correct already.
1491  bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
1492 
1493  // Don't examine operands unless we need to track vccz correctness.
1494  if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
1495  if (Inst.definesRegister(AMDGPU::VCC_LO) ||
1496  Inst.definesRegister(AMDGPU::VCC_HI)) {
1497  // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
1498  if (!ST->partialVCCWritesUpdateVCCZ())
1499  VCCZCorrect = false;
1500  } else if (Inst.definesRegister(AMDGPU::VCC)) {
1501  // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
1502  // the vccz bit, so when we detect that an instruction may read from a
1503  // corrupt vccz bit, we need to:
1504  // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
1505  // operations to complete.
1506  // 2. Restore the correct value of vccz by writing the current value
1507  // of vcc back to vcc.
1508  if (ST->hasReadVCCZBug() &&
1509  ScoreBrackets.getScoreLB(LGKM_CNT) <
1510  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1511  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1512  // Writes to vcc while there's an outstanding smem read may get
1513  // clobbered as soon as any read completes.
1514  VCCZCorrect = false;
1515  } else {
1516  // Writes to vcc will fix any incorrect value in vccz.
1517  VCCZCorrect = true;
1518  }
1519  }
1520  }
1521 
1522  if (TII->isSMRD(Inst)) {
1523  for (const MachineMemOperand *Memop : Inst.memoperands()) {
1524  // No need to handle invariant loads when avoiding WAR conflicts, as
1525  // there cannot be a vector store to the same memory location.
1526  if (!Memop->isInvariant()) {
1527  const Value *Ptr = Memop->getValue();
1528  SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
1529  }
1530  }
1531  if (ST->hasReadVCCZBug()) {
1532  // This smem read could complete and clobber vccz at any time.
1533  VCCZCorrect = false;
1534  }
1535  }
1536 
1537  updateEventWaitcntAfter(Inst, &ScoreBrackets);
1538 
1539 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1540  // If this instruction generates a S_SETVSKIP because it is an
1541  // indexed resource, and we are on Tahiti, then it will also force
1542  // an S_WAITCNT vmcnt(0)
1543  if (RequireCheckResourceType(Inst, context)) {
1544  // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1545  ScoreBrackets->setScoreLB(VM_CNT,
1546  ScoreBrackets->getScoreUB(VM_CNT));
1547  }
1548 #endif
1549 
1550  LLVM_DEBUG({
1551  Inst.print(dbgs());
1552  ScoreBrackets.dump();
1553  });
1554 
1555  // TODO: Remove this work-around after fixing the scheduler and enable the
1556  // assert above.
1557  if (RestoreVCCZ) {
1558  // Restore the vccz bit. Any time a value is written to vcc, the vcc
1559  // bit is updated, so we can restore the bit by reading the value of
1560  // vcc and then writing it back to the register.
1561  BuildMI(Block, Inst, Inst.getDebugLoc(),
1562  TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1563  TRI->getVCC())
1564  .addReg(TRI->getVCC());
1565  VCCZCorrect = true;
1566  Modified = true;
1567  }
1568 
1569  ++Iter;
1570  }
1571 
1572  return Modified;
1573 }
1574 
1575 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1576  ST = &MF.getSubtarget<GCNSubtarget>();
1577  TII = ST->getInstrInfo();
1578  TRI = &TII->getRegisterInfo();
1579  MRI = &MF.getRegInfo();
1580  IV = AMDGPU::getIsaVersion(ST->getCPU());
1581  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1582  PDT = &getAnalysis<MachinePostDominatorTree>();
1583 
1584  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1585  for (auto T : inst_counter_types())
1586  ForceEmitWaitcnt[T] = false;
1587 
1588  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1589  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1590  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1591  HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
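  // Editor's note (illustrative, not in the original source): on gfx9, for
  // example, these limits come out to roughly VmcntMax = 63, ExpcntMax = 7 and
  // LgkmcntMax = 15, i.e. the widest counts the s_waitcnt encoding can hold.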
1592 
1593  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
1594  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
1595  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1596  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1597 
1598  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1599  RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1;
1600  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1601  RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1;
1602 
1603  TrackedWaitcntSet.clear();
1604  BlockInfos.clear();
1605  bool Modified = false;
1606 
1607  if (!MFI->isEntryFunction()) {
1608  // Wait for any outstanding memory operations that the input registers may
1609  // depend on. We can't track them and it's better to do the wait after the
1610  // costly call sequence.
1611 
1612  // TODO: Could insert earlier and schedule more liberally with operations
1613  // that only use caller preserved registers.
1614  MachineBasicBlock &EntryBB = MF.front();
1615  MachineBasicBlock::iterator I = EntryBB.begin();
1616  for (MachineBasicBlock::iterator E = EntryBB.end();
1617  I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
1618  ;
1619  BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
1620  if (ST->hasVscnt())
1621  BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
1622  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1623  .addImm(0);
1624 
1625  Modified = true;
1626  }
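  // Editor's note (illustrative, not in the original source): for a non-entry
  // function the two builders above produce a prologue along the lines of
  //   s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  //   s_waitcnt_vscnt null, 0x0
  // with the second wait emitted only on subtargets that have a separate
  // vscnt counter.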
1627 
1628  // Keep iterating over the blocks in reverse post order, inserting and
1629  // updating s_waitcnt where needed, until a fixed point is reached.
1630  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
1631  BlockInfos.insert({MBB, BlockInfo(MBB)});
1632 
1633  std::unique_ptr<WaitcntBrackets> Brackets;
1634  bool Repeat;
1635  do {
1636  Repeat = false;
1637 
1638  for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
1639  ++BII) {
1640  BlockInfo &BI = BII->second;
1641  if (!BI.Dirty)
1642  continue;
1643 
1644  if (BI.Incoming) {
1645  if (!Brackets)
1646  Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
1647  else
1648  *Brackets = *BI.Incoming;
1649  } else {
1650  if (!Brackets)
1651  Brackets = std::make_unique<WaitcntBrackets>(ST);
1652  else
1653  *Brackets = WaitcntBrackets(ST);
1654  }
1655 
1656  Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1657  BI.Dirty = false;
1658 
1659  if (Brackets->hasPending()) {
1660  BlockInfo *MoveBracketsToSucc = nullptr;
1661  for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1662  auto SuccBII = BlockInfos.find(Succ);
1663  BlockInfo &SuccBI = SuccBII->second;
1664  if (!SuccBI.Incoming) {
1665  SuccBI.Dirty = true;
1666  if (SuccBII <= BII)
1667  Repeat = true;
1668  if (!MoveBracketsToSucc) {
1669  MoveBracketsToSucc = &SuccBI;
1670  } else {
1671  SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
1672  }
1673  } else if (SuccBI.Incoming->merge(*Brackets)) {
1674  SuccBI.Dirty = true;
1675  if (SuccBII <= BII)
1676  Repeat = true;
1677  }
1678  }
1679  if (MoveBracketsToSucc)
1680  MoveBracketsToSucc->Incoming = std::move(Brackets);
1681  }
1682  }
1683  } while (Repeat);
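  // Editor's summary (not part of the original source): the loop above is a
  // forward dataflow fixed-point iteration over reverse post order. A block is
  // marked Dirty whenever a predecessor publishes new outgoing bracket state,
  // and Repeat is set only when that successor sits at or before the current
  // block in the RPO ordering, i.e. the update travelled along a back edge.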
1684 
1685  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1686 
1687  bool HaveScalarStores = false;
1688 
1689  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1690  ++BI) {
1691  MachineBasicBlock &MBB = *BI;
1692 
1693  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1694  ++I) {
1695  if (!HaveScalarStores && TII->isScalarStore(*I))
1696  HaveScalarStores = true;
1697 
1698  if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1699  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1700  EndPgmBlocks.push_back(&MBB);
1701  }
1702  }
1703 
1704  if (HaveScalarStores) {
1705  // If scalar writes are used, the cache must be flushed or else the next
1706  // wave to reuse the same scratch memory can be clobbered.
1707  //
1708  // Insert s_dcache_wb at wave termination points if there were any scalar
1709  // stores, and only if the cache hasn't already been flushed. This could be
1710  // improved by looking across blocks for flushes in postdominating blocks
1711  // from the stores, but an explicitly requested flush is probably very rare.
1712  for (MachineBasicBlock *MBB : EndPgmBlocks) {
1713  bool SeenDCacheWB = false;
1714 
1715  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1716  ++I) {
1717  if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1718  SeenDCacheWB = true;
1719  else if (TII->isScalarStore(*I))
1720  SeenDCacheWB = false;
1721 
1722  // FIXME: It would be better to insert this before a waitcnt if any.
1723  if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1724  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1725  !SeenDCacheWB) {
1726  Modified = true;
1727  BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1728  }
1729  }
1730  }
1731  }
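  // Editor's sketch (illustrative, not in the original source): when a scalar
  // store is still unflushed at a wave terminator, the rewrite above turns
  //   s_endpgm
  // into
  //   s_dcache_wb
  //   s_endpgm
  // so a later wave reusing the same scratch backing memory sees the stored
  // data.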
1732 
1733  return Modified;
1734 }
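// Editor's usage sketch (not part of the original source): the pass is exposed
// through createSIInsertWaitcntsPass(), defined earlier in this file, and the
// AMDGPU target pass config schedules it late in codegen, roughly as
//   addPass(createSIInsertWaitcntsPass());
// The exact insertion point in the target's pass setup is an assumption here,
// not something this file shows.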