SIInsertWaitcnts.cpp (LLVM 10.0.0svn)
1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
15 ///
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
23 //
24 //===----------------------------------------------------------------------===//
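//
// A minimal illustration of the effect (simplified, hypothetical code, not
// taken from a real compilation): the result of a vector-memory load must not
// be read until the corresponding counter has drained, so the pass inserts a
// wait between the load and its first use:
//
//   buffer_load_dword v0, v1, s[0:3], 0 offen  ; in flight, vmcnt pending
//   s_waitcnt vmcnt(0)                         ; inserted by this pass
//   v_add_f32 v2, v0, v0                       ; first use of v0
//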
25 
26 #include "AMDGPU.h"
27 #include "AMDGPUSubtarget.h"
28 #include "SIDefines.h"
29 #include "SIInstrInfo.h"
30 #include "SIMachineFunctionInfo.h"
31 #include "SIRegisterInfo.h"
32 #include "Utils/AMDGPUBaseInfo.h"
33 #include "llvm/ADT/DenseMap.h"
34 #include "llvm/ADT/DenseSet.h"
35 #include "llvm/ADT/PostOrderIterator.h"
36 #include "llvm/ADT/STLExtras.h"
37 #include "llvm/ADT/SmallVector.h"
38 #include "llvm/CodeGen/MachineBasicBlock.h"
39 #include "llvm/CodeGen/MachineFunction.h"
40 #include "llvm/CodeGen/MachineFunctionPass.h"
41 #include "llvm/CodeGen/MachineInstr.h"
42 #include "llvm/CodeGen/MachineInstrBuilder.h"
43 #include "llvm/CodeGen/MachineMemOperand.h"
44 #include "llvm/CodeGen/MachineOperand.h"
45 #include "llvm/CodeGen/MachineRegisterInfo.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/Pass.h"
48 #include "llvm/Support/Debug.h"
49 #include "llvm/Support/DebugCounter.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Support/raw_ostream.h"
52 #include <algorithm>
53 #include <cassert>
54 #include <cstdint>
55 #include <cstring>
56 #include <memory>
57 #include <utility>
58 #include <vector>
59 
60 using namespace llvm;
61 
62 #define DEBUG_TYPE "si-insert-waitcnts"
63 
64 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
65  "Force emit s_waitcnt expcnt(0) instrs");
66 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
67  "Force emit s_waitcnt lgkmcnt(0) instrs");
68 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
69  "Force emit s_waitcnt vmcnt(0) instrs");
70 
71 static cl::opt<bool> ForceEmitZeroFlag(
72  "amdgpu-waitcnt-forcezero",
73  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
74  cl::init(false), cl::Hidden);
75 
76 namespace {
77 
78 template <typename EnumT>
79 class enum_iterator
80  : public iterator_facade_base<enum_iterator<EnumT>,
81  std::forward_iterator_tag, const EnumT> {
82  EnumT Value;
83 public:
84  enum_iterator() = default;
85  enum_iterator(EnumT Value) : Value(Value) {}
86 
87  enum_iterator &operator++() {
88  Value = static_cast<EnumT>(Value + 1);
89  return *this;
90  }
91 
92  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
93 
94  EnumT operator*() const { return Value; }
95 };
96 
97 // Class of object that encapsulates latest instruction counter score
98 // associated with the operand. Used for determining whether
99 // an s_waitcnt instruction needs to be emitted.
100 
101 #define CNT_MASK(t) (1u << (t))
102 
103 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
104 
105 iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
106  return make_range(enum_iterator<InstCounterType>(VM_CNT),
107  enum_iterator<InstCounterType>(NUM_INST_CNTS));
108 }
109 
110 using RegInterval = std::pair<signed, signed>;
111 
112 struct {
113  uint32_t VmcntMax;
114  uint32_t ExpcntMax;
115  uint32_t LgkmcntMax;
116  uint32_t VscntMax;
117  int32_t NumVGPRsMax;
118  int32_t NumSGPRsMax;
119 } HardwareLimits;
120 
121 struct {
122  unsigned VGPR0;
123  unsigned VGPRL;
124  unsigned SGPR0;
125  unsigned SGPRL;
126 } RegisterEncoding;
127 
128 enum WaitEventType {
129  VMEM_ACCESS, // vector-memory read & write
130  VMEM_READ_ACCESS, // vector-memory read
131  VMEM_WRITE_ACCESS,// vector-memory write
132  LDS_ACCESS, // lds read & write
133  GDS_ACCESS, // gds read & write
134  SQ_MESSAGE, // send message
135  SMEM_ACCESS, // scalar-memory read & write
136  EXP_GPR_LOCK, // export holding on its data src
137  GDS_GPR_LOCK, // GDS holding on its data and addr src
138  EXP_POS_ACCESS, // write to export position
139  EXP_PARAM_ACCESS, // write to export parameter
140  VMW_GPR_LOCK, // vector-memory write holding on its data src
141  NUM_WAIT_EVENTS,
142 };
143 
144 static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
145  (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
146  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
147  (1 << SQ_MESSAGE),
148  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
149  (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
150  (1 << VMEM_WRITE_ACCESS)
151 };
152 
153 // The mapping is:
154 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
155 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
156 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
157 // We reserve a fixed number of VGPR slots in the scoring tables for
158 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
159 enum RegisterMapping {
160  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
161  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
162  NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
163  EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
164  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
165 };
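//
// For illustration (hypothetical register numbers): with this mapping, v5
// occupies slot 5 in the scoring tables, the extra LDS token occupies slot
// SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS = 256, and s3 occupies slot
// NUM_ALL_VGPRS + 3 = 260.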
166 
167 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
168  switch (T) {
169  case VM_CNT:
170  Wait.VmCnt = std::min(Wait.VmCnt, Count);
171  break;
172  case EXP_CNT:
173  Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
174  break;
175  case LGKM_CNT:
176  Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
177  break;
178  case VS_CNT:
179  Wait.VsCnt = std::min(Wait.VsCnt, Count);
180  break;
181  default:
182  llvm_unreachable("bad InstCounterType");
183  }
184 }
185 
186 // This object maintains the current score brackets of each wait counter, and
187 // a per-register scoreboard for each wait counter.
188 //
189 // We also maintain the latest score for every event type that can change the
190 // waitcnt in order to know if there are multiple types of events within
191 // the brackets. When multiple types of events happen within the brackets,
192 // the wait count may get decremented out of order, therefore we need to put
193 // in an "s_waitcnt 0" before the use.
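//
// Sketch of the bracket arithmetic with hypothetical numbers: if the VM_CNT
// bracket is ScoreLB = 4, ScoreUB = 7, then three vector-memory operations are
// still outstanding. A register whose VM_CNT score is 6 becomes safe once at
// most UB - 6 = 1 of them remains in flight, i.e. after "s_waitcnt vmcnt(1)"
// (see determineWait below).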
194 class WaitcntBrackets {
195 public:
196  WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
197  for (auto T : inst_counter_types())
198  memset(VgprScores[T], 0, sizeof(VgprScores[T]));
199  }
200 
201  static uint32_t getWaitCountMax(InstCounterType T) {
202  switch (T) {
203  case VM_CNT:
204  return HardwareLimits.VmcntMax;
205  case LGKM_CNT:
206  return HardwareLimits.LgkmcntMax;
207  case EXP_CNT:
208  return HardwareLimits.ExpcntMax;
209  case VS_CNT:
210  return HardwareLimits.VscntMax;
211  default:
212  break;
213  }
214  return 0;
215  }
216 
217  uint32_t getScoreLB(InstCounterType T) const {
218  assert(T < NUM_INST_CNTS);
219  if (T >= NUM_INST_CNTS)
220  return 0;
221  return ScoreLBs[T];
222  }
223 
224  uint32_t getScoreUB(InstCounterType T) const {
225  assert(T < NUM_INST_CNTS);
226  if (T >= NUM_INST_CNTS)
227  return 0;
228  return ScoreUBs[T];
229  }
230 
231  // Mapping from event to counter.
232  InstCounterType eventCounter(WaitEventType E) {
233  if (WaitEventMaskForInst[VM_CNT] & (1 << E))
234  return VM_CNT;
235  if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
236  return LGKM_CNT;
237  if (WaitEventMaskForInst[VS_CNT] & (1 << E))
238  return VS_CNT;
239  assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
240  return EXP_CNT;
241  }
242 
243  uint32_t getRegScore(int GprNo, InstCounterType T) {
244  if (GprNo < NUM_ALL_VGPRS) {
245  return VgprScores[T][GprNo];
246  }
247  assert(T == LGKM_CNT);
248  return SgprScores[GprNo - NUM_ALL_VGPRS];
249  }
250 
251  void clear() {
252  memset(ScoreLBs, 0, sizeof(ScoreLBs));
253  memset(ScoreUBs, 0, sizeof(ScoreUBs));
254  PendingEvents = 0;
255  memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
256  for (auto T : inst_counter_types())
257  memset(VgprScores[T], 0, sizeof(VgprScores[T]));
258  memset(SgprScores, 0, sizeof(SgprScores));
259  }
260 
261  bool merge(const WaitcntBrackets &Other);
262 
263  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
264  const MachineRegisterInfo *MRI,
265  const SIRegisterInfo *TRI, unsigned OpNo,
266  bool Def) const;
267 
268  int32_t getMaxVGPR() const { return VgprUB; }
269  int32_t getMaxSGPR() const { return SgprUB; }
270 
271  bool counterOutOfOrder(InstCounterType T) const;
272  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
273  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
274  void determineWait(InstCounterType T, uint32_t ScoreToWait,
275  AMDGPU::Waitcnt &Wait) const;
276  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
277  void applyWaitcnt(InstCounterType T, unsigned Count);
278  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
279  const MachineRegisterInfo *MRI, WaitEventType E,
280  MachineInstr &MI);
281 
282  bool hasPending() const { return PendingEvents != 0; }
283  bool hasPendingEvent(WaitEventType E) const {
284  return PendingEvents & (1 << E);
285  }
286 
287  bool hasPendingFlat() const {
288  return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
289  LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
290  (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
291  LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
292  }
293 
294  void setPendingFlat() {
295  LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
296  LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
297  }
298 
299  void print(raw_ostream &);
300  void dump() { print(dbgs()); }
301 
302 private:
303  struct MergeInfo {
304  uint32_t OldLB;
305  uint32_t OtherLB;
306  uint32_t MyShift;
307  uint32_t OtherShift;
308  };
309  static bool mergeScore(const MergeInfo &M, uint32_t &Score,
310  uint32_t OtherScore);
311 
312  void setScoreLB(InstCounterType T, uint32_t Val) {
313  assert(T < NUM_INST_CNTS);
314  if (T >= NUM_INST_CNTS)
315  return;
316  ScoreLBs[T] = Val;
317  }
318 
319  void setScoreUB(InstCounterType T, uint32_t Val) {
320  assert(T < NUM_INST_CNTS);
321  if (T >= NUM_INST_CNTS)
322  return;
323  ScoreUBs[T] = Val;
324  if (T == EXP_CNT) {
325  uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
326  if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
327  ScoreLBs[T] = UB;
328  }
329  }
330 
331  void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
332  if (GprNo < NUM_ALL_VGPRS) {
333  if (GprNo > VgprUB) {
334  VgprUB = GprNo;
335  }
336  VgprScores[T][GprNo] = Val;
337  } else {
338  assert(T == LGKM_CNT);
339  if (GprNo - NUM_ALL_VGPRS > SgprUB) {
340  SgprUB = GprNo - NUM_ALL_VGPRS;
341  }
342  SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
343  }
344  }
345 
346  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
347  const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
348  unsigned OpNo, uint32_t Val);
349 
350  const GCNSubtarget *ST = nullptr;
351  uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
352  uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
353  uint32_t PendingEvents = 0;
354  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
355  // Remember the last flat memory operation.
356  uint32_t LastFlat[NUM_INST_CNTS] = {0};
357  // wait_cnt scores for every vgpr.
358  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
359  int32_t VgprUB = 0;
360  int32_t SgprUB = 0;
361  uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
362  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
363  uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
364 };
365 
366 class SIInsertWaitcnts : public MachineFunctionPass {
367 private:
368  const GCNSubtarget *ST = nullptr;
369  const SIInstrInfo *TII = nullptr;
370  const SIRegisterInfo *TRI = nullptr;
371  const MachineRegisterInfo *MRI = nullptr;
372  AMDGPU::IsaVersion IV;
373 
374  DenseSet<MachineInstr *> TrackedWaitcntSet;
375  DenseSet<MachineInstr *> VCCZBugHandledSet;
376 
377  struct BlockInfo {
378  MachineBasicBlock *MBB;
379  std::unique_ptr<WaitcntBrackets> Incoming;
380  bool Dirty = true;
381 
382  explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
383  };
384 
385  std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
386  DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;
387 
388  // ForceEmitZeroWaitcnts: force all waitcnt instrs to be s_waitcnt 0
389  // because of amdgpu-waitcnt-forcezero flag
390  bool ForceEmitZeroWaitcnts;
391  bool ForceEmitWaitcnt[NUM_INST_CNTS];
392 
393 public:
394  static char ID;
395 
396  SIInsertWaitcnts() : MachineFunctionPass(ID) {
397  (void)ForceExpCounter;
398  (void)ForceLgkmCounter;
399  (void)ForceVMCounter;
400  }
401 
402  bool runOnMachineFunction(MachineFunction &MF) override;
403 
404  StringRef getPassName() const override {
405  return "SI insert wait instructions";
406  }
407 
408  void getAnalysisUsage(AnalysisUsage &AU) const override {
409  AU.setPreservesCFG();
410  MachineFunctionPass::getAnalysisUsage(AU);
411  }
412 
413  bool isForceEmitWaitcnt() const {
414  for (auto T : inst_counter_types())
415  if (ForceEmitWaitcnt[T])
416  return true;
417  return false;
418  }
419 
420  void setForceEmitWaitcnt() {
421 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
422 // For debug builds, get the debug counter info and adjust if need be
423 #ifndef NDEBUG
424  if (DebugCounter::isCounterSet(ForceExpCounter) &&
425  DebugCounter::shouldExecute(ForceExpCounter)) {
426  ForceEmitWaitcnt[EXP_CNT] = true;
427  } else {
428  ForceEmitWaitcnt[EXP_CNT] = false;
429  }
430 
431  if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
432  DebugCounter::shouldExecute(ForceLgkmCounter)) {
433  ForceEmitWaitcnt[LGKM_CNT] = true;
434  } else {
435  ForceEmitWaitcnt[LGKM_CNT] = false;
436  }
437 
438  if (DebugCounter::isCounterSet(ForceVMCounter) &&
439  DebugCounter::shouldExecute(ForceVMCounter)) {
440  ForceEmitWaitcnt[VM_CNT] = true;
441  } else {
442  ForceEmitWaitcnt[VM_CNT] = false;
443  }
444 #endif // NDEBUG
445  }
446 
447  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
448  bool generateWaitcntInstBefore(MachineInstr &MI,
449  WaitcntBrackets &ScoreBrackets,
450  MachineInstr *OldWaitcntInstr);
451  void updateEventWaitcntAfter(MachineInstr &Inst,
452  WaitcntBrackets *ScoreBrackets);
453  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
454  WaitcntBrackets &ScoreBrackets);
455 };
456 
457 } // end anonymous namespace
458 
459 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
460  const SIInstrInfo *TII,
461  const MachineRegisterInfo *MRI,
462  const SIRegisterInfo *TRI,
463  unsigned OpNo, bool Def) const {
464  const MachineOperand &Op = MI->getOperand(OpNo);
465  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
466  (Def && !Op.isDef()) || TRI->isAGPR(*MRI, Op.getReg()))
467  return {-1, -1};
468 
469  // A use via a PW operand does not need a waitcnt.
470  // A partial write is not a WAW.
471  assert(!Op.getSubReg() || !Op.isUndef());
472 
473  RegInterval Result;
474  const MachineRegisterInfo &MRIA = *MRI;
475 
476  unsigned Reg = TRI->getEncodingValue(Op.getReg());
477 
478  if (TRI->isVGPR(MRIA, Op.getReg())) {
479  assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
480  Result.first = Reg - RegisterEncoding.VGPR0;
481  assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
482  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
483  assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
484  Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
485  assert(Result.first >= NUM_ALL_VGPRS &&
486  Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
487  }
488  // TODO: Handle TTMP
489  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
490  else
491  return {-1, -1};
492 
493  const MachineInstr &MIA = *MI;
494  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
495  unsigned Size = TRI->getRegSizeInBits(*RC);
496  Result.second = Result.first + (Size / 32);
497 
498  return Result;
499 }
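// For example (hypothetical operand): a 64-bit VGPR pair such as v[4:5] has
// Size = 64, so the half-open interval returned above is [4, 6) and both
// 32-bit halves get scored.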
500 
501 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
502  const SIInstrInfo *TII,
503  const SIRegisterInfo *TRI,
504  const MachineRegisterInfo *MRI, unsigned OpNo,
505  uint32_t Val) {
506  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
507  LLVM_DEBUG({
508  const MachineOperand &Opnd = MI->getOperand(OpNo);
509  assert(TRI->isVGPR(*MRI, Opnd.getReg()));
510  });
511  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
512  setRegScore(RegNo, EXP_CNT, Val);
513  }
514 }
515 
516 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
517  const SIRegisterInfo *TRI,
518  const MachineRegisterInfo *MRI,
519  WaitEventType E, MachineInstr &Inst) {
520  const MachineRegisterInfo &MRIA = *MRI;
521  InstCounterType T = eventCounter(E);
522  uint32_t CurrScore = getScoreUB(T) + 1;
523  if (CurrScore == 0)
524  report_fatal_error("InsertWaitcnt score wraparound");
525  // PendingEvents and ScoreUB need to be updated regardless of whether this
526  // event changes the score of a register or not.
527  // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
528  if (!hasPendingEvent(E)) {
529  if (PendingEvents & WaitEventMaskForInst[T])
530  MixedPendingEvents[T] = true;
531  PendingEvents |= 1 << E;
532  }
533  setScoreUB(T, CurrScore);
534 
535  if (T == EXP_CNT) {
536  // Put score on the source vgprs. If this is a store, just use those
537  // specific register(s).
538  if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
539  int AddrOpIdx =
540  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
541  // All GDS operations must protect their address register (same as
542  // export.)
543  if (AddrOpIdx != -1) {
544  setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
545  }
546 
547  if (Inst.mayStore()) {
548  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
549  AMDGPU::OpName::data0) != -1) {
550  setExpScore(
551  &Inst, TII, TRI, MRI,
552  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
553  CurrScore);
554  }
555  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
556  AMDGPU::OpName::data1) != -1) {
557  setExpScore(&Inst, TII, TRI, MRI,
558  AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
559  AMDGPU::OpName::data1),
560  CurrScore);
561  }
562  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
563  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
564  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
565  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
566  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
567  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
568  Inst.getOpcode() != AMDGPU::DS_APPEND &&
569  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
570  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
571  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
572  const MachineOperand &Op = Inst.getOperand(I);
573  if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
574  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
575  }
576  }
577  }
578  } else if (TII->isFLAT(Inst)) {
579  if (Inst.mayStore()) {
580  setExpScore(
581  &Inst, TII, TRI, MRI,
582  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
583  CurrScore);
584  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
585  setExpScore(
586  &Inst, TII, TRI, MRI,
587  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
588  CurrScore);
589  }
590  } else if (TII->isMIMG(Inst)) {
591  if (Inst.mayStore()) {
592  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
593  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
594  setExpScore(
595  &Inst, TII, TRI, MRI,
596  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
597  CurrScore);
598  }
599  } else if (TII->isMTBUF(Inst)) {
600  if (Inst.mayStore()) {
601  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
602  }
603  } else if (TII->isMUBUF(Inst)) {
604  if (Inst.mayStore()) {
605  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
606  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
607  setExpScore(
608  &Inst, TII, TRI, MRI,
609  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
610  CurrScore);
611  }
612  } else {
613  if (TII->isEXP(Inst)) {
614  // For export the destination registers are really temps that
615  // can be used as the actual source after export patching, so
616  // we need to treat them like sources and set the EXP_CNT
617  // score.
618  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
619  MachineOperand &DefMO = Inst.getOperand(I);
620  if (DefMO.isReg() && DefMO.isDef() &&
621  TRI->isVGPR(MRIA, DefMO.getReg())) {
622  setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
623  CurrScore);
624  }
625  }
626  }
627  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
628  MachineOperand &MO = Inst.getOperand(I);
629  if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
630  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
631  }
632  }
633  }
634 #if 0 // TODO: check if this is handled by MUBUF code above.
635  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
636  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
637  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
638  MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
639  unsigned OpNo;//TODO: find the OpNo for this operand;
640  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
641  for (signed RegNo = Interval.first; RegNo < Interval.second;
642  ++RegNo) {
643  setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
644  }
645 #endif
646  } else {
647  // Match the score to the destination registers.
648  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
649  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
650  if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
651  continue;
652  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
653  setRegScore(RegNo, T, CurrScore);
654  }
655  }
656  if (TII->isDS(Inst) && Inst.mayStore()) {
657  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
658  }
659  }
660 }
661 
662 void WaitcntBrackets::print(raw_ostream &OS) {
663  OS << '\n';
664  for (auto T : inst_counter_types()) {
665  uint32_t LB = getScoreLB(T);
666  uint32_t UB = getScoreUB(T);
667 
668  switch (T) {
669  case VM_CNT:
670  OS << " VM_CNT(" << UB - LB << "): ";
671  break;
672  case LGKM_CNT:
673  OS << " LGKM_CNT(" << UB - LB << "): ";
674  break;
675  case EXP_CNT:
676  OS << " EXP_CNT(" << UB - LB << "): ";
677  break;
678  case VS_CNT:
679  OS << " VS_CNT(" << UB - LB << "): ";
680  break;
681  default:
682  OS << " UNKNOWN(" << UB - LB << "): ";
683  break;
684  }
685 
686  if (LB < UB) {
687  // Print vgpr scores.
688  for (int J = 0; J <= getMaxVGPR(); J++) {
689  uint32_t RegScore = getRegScore(J, T);
690  if (RegScore <= LB)
691  continue;
692  uint32_t RelScore = RegScore - LB - 1;
693  if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
694  OS << RelScore << ":v" << J << " ";
695  } else {
696  OS << RelScore << ":ds ";
697  }
698  }
699  // Also need to print sgpr scores for lgkm_cnt.
700  if (T == LGKM_CNT) {
701  for (int J = 0; J <= getMaxSGPR(); J++) {
702  uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
703  if (RegScore <= LB)
704  continue;
705  uint32_t RelScore = RegScore - LB - 1;
706  OS << RelScore << ":s" << J << " ";
707  }
708  }
709  }
710  OS << '\n';
711  }
712  OS << '\n';
713 }
714 
715 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
716 /// whether a waitcnt instruction is needed at all.
717 bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
718  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
719  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
720  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
721  simplifyWaitcnt(VS_CNT, Wait.VsCnt);
722 }
723 
724 bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
725  unsigned &Count) const {
726  const uint32_t LB = getScoreLB(T);
727  const uint32_t UB = getScoreUB(T);
728  if (Count < UB && UB - Count > LB)
729  return true;
730 
731  Count = ~0u;
732  return false;
733 }
734 
735 void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
736  AMDGPU::Waitcnt &Wait) const {
737  // If the score of src_operand falls within the bracket, we need an
738  // s_waitcnt instruction.
739  const uint32_t LB = getScoreLB(T);
740  const uint32_t UB = getScoreUB(T);
741  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
742  if ((T == VM_CNT || T == LGKM_CNT) &&
743  hasPendingFlat() &&
744  !ST->hasFlatLgkmVMemCountInOrder()) {
745  // If there is a pending FLAT operation, and this is a VMem or LGKM
746  // waitcnt and the target can report early completion, then we need
747  // to force a waitcnt 0.
748  addWait(Wait, T, 0);
749  } else if (counterOutOfOrder(T)) {
750  // The counter can get decremented out-of-order when there
751  // are multiple types of events in the bracket. Also emit an s_waitcnt
752  // with a conservative value of 0 for the counter.
753  addWait(Wait, T, 0);
754  } else {
755  addWait(Wait, T, UB - ScoreToWait);
756  }
757  }
758 }
759 
760 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
761  applyWaitcnt(VM_CNT, Wait.VmCnt);
762  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
763  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
764  applyWaitcnt(VS_CNT, Wait.VsCnt);
765 }
766 
767 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
768  const uint32_t UB = getScoreUB(T);
769  if (Count >= UB)
770  return;
771  if (Count != 0) {
772  if (counterOutOfOrder(T))
773  return;
774  setScoreLB(T, std::max(getScoreLB(T), UB - Count));
775  } else {
776  setScoreLB(T, UB);
777  MixedPendingEvents[T] = false;
778  PendingEvents &= ~WaitEventMaskForInst[T];
779  }
780 }
781 
782 // Where there are multiple types of events in the bracket of a counter,
783 // the decrement may go out of order.
784 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
785  // Scalar memory read always can go out of order.
786  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
787  return true;
788  return MixedPendingEvents[T];
789 }
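// Illustration: with two SMEM loads outstanding, lgkmcnt(1) cannot guarantee
// that any particular one has completed, because scalar memory results may
// return out of order; only lgkmcnt(0) is a safe wait, which is why callers
// of counterOutOfOrder() fall back to a conservative value of 0.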
790 
791 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
792  false)
793 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
794  false)
795 
796 char SIInsertWaitcnts::ID = 0;
797 
798 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
799 
800 FunctionPass *llvm::createSIInsertWaitcntsPass() {
801  return new SIInsertWaitcnts();
802 }
803 
804 static bool readsVCCZ(const MachineInstr &MI) {
805  unsigned Opc = MI.getOpcode();
806  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
807  !MI.getOperand(1).isUndef();
808 }
809 
810 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
811 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
812  // Currently all conventions wait, but this may not always be the case.
813  //
814  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
815  // sense to omit the wait and do it in the caller.
816  return true;
817 }
818 
819 /// \returns true if the callee is expected to wait for any outstanding waits
820 /// before returning.
821 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
822  return true;
823 }
824 
825 /// Generate s_waitcnt instruction to be placed before cur_Inst.
826 /// Instructions of a given type are returned in order,
827 /// but instructions of different types can complete out of order.
828 /// We rely on this in-order completion
829 /// and simply assign a score to the memory access instructions.
830 /// We keep track of the active "score bracket" to determine
831 /// if an access of a memory read requires an s_waitcnt
832 /// and if so what the value of each counter is.
833 /// The "score bracket" is bound by the lower bound and upper bound
834 /// scores (*_score_LB and *_score_ub respectively).
835 bool SIInsertWaitcnts::generateWaitcntInstBefore(
836  MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
837  MachineInstr *OldWaitcntInstr) {
838  setForceEmitWaitcnt();
839  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
840 
841  if (MI.isDebugInstr())
842  return false;
843 
844  AMDGPU::Waitcnt Wait;
845 
846  // See if this instruction has a forced S_WAITCNT VM.
847  // TODO: Handle other cases of NeedsWaitcntVmBefore()
848  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
849  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
850  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
851  MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
852  MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
853  Wait.VmCnt = 0;
854  }
855 
856  // All waits must be resolved at call return.
857  // NOTE: this could be improved with knowledge of all call sites or
858  // with knowledge of the called routines.
859  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
860  MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
861  (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
862  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
863  }
864  // Resolve vm waits before gs-done.
865  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
866  MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
867  ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
868  AMDGPU::SendMsg::ID_GS_DONE)) {
869  Wait.VmCnt = 0;
870  }
871 #if 0 // TODO: the following blocks of logic when we have fence.
872  else if (MI.getOpcode() == SC_FENCE) {
873  const unsigned int group_size =
874  context->shader_info->GetMaxThreadGroupSize();
875  // group_size == 0 means thread group size is unknown at compile time
876  const bool group_is_multi_wave =
877  (group_size == 0 || group_size > target_info->GetWaveFrontSize());
878  const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
879 
880  for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
881  SCRegType src_type = Inst->GetSrcType(i);
882  switch (src_type) {
883  case SCMEM_LDS:
884  if (group_is_multi_wave ||
885  context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
886  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
887  ScoreBrackets->getScoreUB(LGKM_CNT));
888  // LDS may have to wait for VM_CNT after buffer load to LDS
889  if (target_info->HasBufferLoadToLDS()) {
890  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
891  ScoreBrackets->getScoreUB(VM_CNT));
892  }
893  }
894  break;
895 
896  case SCMEM_GDS:
897  if (group_is_multi_wave || fence_is_global) {
898  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
899  ScoreBrackets->getScoreUB(EXP_CNT));
900  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
901  ScoreBrackets->getScoreUB(LGKM_CNT));
902  }
903  break;
904 
905  case SCMEM_UAV:
906  case SCMEM_TFBUF:
907  case SCMEM_RING:
908  case SCMEM_SCATTER:
909  if (group_is_multi_wave || fence_is_global) {
910  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
911  ScoreBrackets->getScoreUB(EXP_CNT));
912  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
913  ScoreBrackets->getScoreUB(VM_CNT));
914  }
915  break;
916 
917  case SCMEM_SCRATCH:
918  default:
919  break;
920  }
921  }
922  }
923 #endif
924 
925  // Export & GDS instructions do not read the EXEC mask until after the export
926  // is granted (which can occur well after the instruction is issued).
927  // The shader program must flush all EXP operations on the export-count
928  // before overwriting the EXEC mask.
929  else {
930  if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
931  // Export and GDS are tracked individually, either may trigger a waitcnt
932  // for EXEC.
933  if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
934  ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
935  ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
936  ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
937  Wait.ExpCnt = 0;
938  }
939  }
940 
941  if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
942  // Don't bother waiting on anything except the call address. The function
943  // is going to insert a wait on everything in its prolog. This still needs
944  // to be careful if the call target is a load (e.g. a GOT load).
945  Wait = AMDGPU::Waitcnt();
946 
947  int CallAddrOpIdx =
948  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
949  RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI,
950  CallAddrOpIdx, false);
951  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
952  ScoreBrackets.determineWait(
953  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
954  }
955  } else {
956  // FIXME: Should not be relying on memoperands.
957  // Look at the source operands of every instruction to see if
958  // any of them results from a previous memory operation that affects
959  // its current usage. If so, an s_waitcnt instruction needs to be
960  // emitted.
961  // If the source operand was defined by a load, add the s_waitcnt
962  // instruction.
963  for (const MachineMemOperand *Memop : MI.memoperands()) {
964  unsigned AS = Memop->getAddrSpace();
965  if (AS != AMDGPUAS::LOCAL_ADDRESS)
966  continue;
967  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
968  // VM_CNT is only relevant to vgpr or LDS.
969  ScoreBrackets.determineWait(
970  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
971  }
972 
973  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
974  const MachineOperand &Op = MI.getOperand(I);
975  const MachineRegisterInfo &MRIA = *MRI;
976  RegInterval Interval =
977  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
978  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
979  if (TRI->isVGPR(MRIA, Op.getReg())) {
980  // VM_CNT is only relevant to vgpr or LDS.
981  ScoreBrackets.determineWait(
982  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
983  }
984  ScoreBrackets.determineWait(
985  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
986  }
987  }
988  // End of for loop that looks at all source operands to decide vm_wait_cnt
989  // and lgk_wait_cnt.
990 
991  // Two cases are handled for destination operands:
992  // 1) If the destination operand was defined by a load, add the s_waitcnt
993  // instruction to guarantee the right WAW order.
994  // 2) If a destination operand was used by a recent export/store instruction,
995  // add an s_waitcnt on exp_cnt to guarantee the WAR order.
996  if (MI.mayStore()) {
997  // FIXME: Should not be relying on memoperands.
998  for (const MachineMemOperand *Memop : MI.memoperands()) {
999  unsigned AS = Memop->getAddrSpace();
1000  if (AS != AMDGPUAS::LOCAL_ADDRESS)
1001  continue;
1002  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1003  ScoreBrackets.determineWait(
1004  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1005  ScoreBrackets.determineWait(
1006  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1007  }
1008  }
1009  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1010  MachineOperand &Def = MI.getOperand(I);
1011  const MachineRegisterInfo &MRIA = *MRI;
1012  RegInterval Interval =
1013  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
1014  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1015  if (TRI->isVGPR(MRIA, Def.getReg())) {
1016  ScoreBrackets.determineWait(
1017  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1018  ScoreBrackets.determineWait(
1019  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1020  }
1021  ScoreBrackets.determineWait(
1022  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1023  }
1024  } // End of for loop that looks at all dest operands.
1025  }
1026  }
1027 
1028  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1029  // occurs before the instruction. Doing it here prevents any additional
1030  // S_WAITCNTs from being emitted if the instruction was marked as
1031  // requiring a WAITCNT beforehand.
1032  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1033  !ST->hasAutoWaitcntBeforeBarrier()) {
1034  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
1035  }
1036 
1037  // TODO: Remove this work-around, enable the assert for Bug 457939
1038  // after fixing the scheduler. Also, the Shader Compiler code is
1039  // independent of target.
1040  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1041  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1042  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1043  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1044  Wait.LgkmCnt = 0;
1045  }
1046  }
1047 
1048  // Early-out if no wait is indicated.
1049  if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
1050  bool Modified = false;
1051  if (OldWaitcntInstr) {
1052  for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
1053  &*II != &MI; II = NextI, ++NextI) {
1054  if (II->isDebugInstr())
1055  continue;
1056 
1057  if (TrackedWaitcntSet.count(&*II)) {
1058  TrackedWaitcntSet.erase(&*II);
1059  II->eraseFromParent();
1060  Modified = true;
1061  } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
1062  int64_t Imm = II->getOperand(0).getImm();
1063  ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1064  } else {
1065  assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
1066  assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1067  ScoreBrackets.applyWaitcnt(
1068  AMDGPU::Waitcnt(0, 0, 0, II->getOperand(1).getImm()));
1069  }
1070  }
1071  }
1072  return Modified;
1073  }
1074 
1075  if (ForceEmitZeroWaitcnts)
1076  Wait = AMDGPU::Waitcnt::allZero(IV);
1077 
1078  if (ForceEmitWaitcnt[VM_CNT])
1079  Wait.VmCnt = 0;
1080  if (ForceEmitWaitcnt[EXP_CNT])
1081  Wait.ExpCnt = 0;
1082  if (ForceEmitWaitcnt[LGKM_CNT])
1083  Wait.LgkmCnt = 0;
1084  if (ForceEmitWaitcnt[VS_CNT])
1085  Wait.VsCnt = 0;
1086 
1087  ScoreBrackets.applyWaitcnt(Wait);
1088 
1089  AMDGPU::Waitcnt OldWait;
1090  bool Modified = false;
1091 
1092  if (OldWaitcntInstr) {
1093  for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
1094  &*II != &MI; II = NextI, NextI++) {
1095  if (II->isDebugInstr())
1096  continue;
1097 
1098  if (II->getOpcode() == AMDGPU::S_WAITCNT) {
1099  unsigned IEnc = II->getOperand(0).getImm();
1100  AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1101  OldWait = OldWait.combined(IWait);
1102  if (!TrackedWaitcntSet.count(&*II))
1103  Wait = Wait.combined(IWait);
1104  unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
1105  if (IEnc != NewEnc) {
1106  II->getOperand(0).setImm(NewEnc);
1107  Modified = true;
1108  }
1109  Wait.VmCnt = ~0u;
1110  Wait.LgkmCnt = ~0u;
1111  Wait.ExpCnt = ~0u;
1112  } else {
1113  assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
1114  assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1115 
1116  unsigned ICnt = II->getOperand(1).getImm();
1117  OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
1118  if (!TrackedWaitcntSet.count(&*II))
1119  Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
1120  if (Wait.VsCnt != ICnt) {
1121  II->getOperand(1).setImm(Wait.VsCnt);
1122  Modified = true;
1123  }
1124  Wait.VsCnt = ~0u;
1125  }
1126 
1127  LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
1128  << "Old Instr: " << MI << '\n'
1129  << "New Instr: " << *II << '\n');
1130 
1131  if (!Wait.hasWait())
1132  return Modified;
1133  }
1134  }
1135 
1136  if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
1137  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1138  auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1139  MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1140  .addImm(Enc);
1141  TrackedWaitcntSet.insert(SWaitInst);
1142  Modified = true;
1143 
1144  LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
1145  << "Old Instr: " << MI << '\n'
1146  << "New Instr: " << *SWaitInst << '\n');
1147  }
1148 
1149  if (Wait.VsCnt != ~0u) {
1150  assert(ST->hasVscnt());
1151 
1152  auto SWaitInst =
1153  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1154  TII->get(AMDGPU::S_WAITCNT_VSCNT))
1155  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1156  .addImm(Wait.VsCnt);
1157  TrackedWaitcntSet.insert(SWaitInst);
1158  Modified = true;
1159 
1160  LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
1161  << "Old Instr: " << MI << '\n'
1162  << "New Instr: " << *SWaitInst << '\n');
1163  }
1164 
1165  return Modified;
1166 }
1167 
1168 // This is a flat memory operation. Check to see if it has memory
1169 // tokens for both LDS and Memory, and if so mark it as a flat.
1170 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1171  if (MI.memoperands_empty())
1172  return true;
1173 
1174  for (const MachineMemOperand *Memop : MI.memoperands()) {
1175  unsigned AS = Memop->getAddrSpace();
1176  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1177  return true;
1178  }
1179 
1180  return false;
1181 }
1182 
1183 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1184  WaitcntBrackets *ScoreBrackets) {
1185  // Now look at the instruction opcode. If it is a memory access
1186  // instruction, update the upper-bound of the appropriate counter's
1187  // bracket and the destination operand scores.
1188  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1189  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1190  if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1191  TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1192  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1193  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1194  } else {
1195  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1196  }
1197  } else if (TII->isFLAT(Inst)) {
1198  assert(Inst.mayLoad() || Inst.mayStore());
1199 
1200  if (TII->usesVM_CNT(Inst)) {
1201  if (!ST->hasVscnt())
1202  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1203  else if (Inst.mayLoad() &&
1204  AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
1205  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1206  else
1207  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1208  }
1209 
1210  if (TII->usesLGKM_CNT(Inst)) {
1211  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1212 
1213  // This is a flat memory operation, so note it - it will require
1214  // that both the VM and LGKM be flushed to zero if it is pending when
1215  // a VM or LGKM dependency occurs.
1216  if (mayAccessLDSThroughFlat(Inst))
1217  ScoreBrackets->setPendingFlat();
1218  }
1219  } else if (SIInstrInfo::isVMEM(Inst) &&
1220  // TODO: get a better carve out.
1221  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1222  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1223  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
1224  Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
1225  Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
1226  if (!ST->hasVscnt())
1227  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1228  else if ((Inst.mayLoad() &&
1229  AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
1230  /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
1231  (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
1232  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1233  else if (Inst.mayStore())
1234  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1235 
1236  if (ST->vmemWriteNeedsExpWaitcnt() &&
1237  (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
1238  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1239  }
1240  } else if (TII->isSMRD(Inst)) {
1241  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1242  } else if (Inst.isCall()) {
1243  if (callWaitsOnFunctionReturn(Inst)) {
1244  // Act as a wait on everything
1245  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
1246  } else {
1247  // May need to wait for anything.
1248  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1249  }
1250  } else {
1251  switch (Inst.getOpcode()) {
1252  case AMDGPU::S_SENDMSG:
1253  case AMDGPU::S_SENDMSGHALT:
1254  ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1255  break;
1256  case AMDGPU::EXP:
1257  case AMDGPU::EXP_DONE: {
1258  int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1259  if (Imm >= 32 && Imm <= 63)
1260  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1261  else if (Imm >= 12 && Imm <= 15)
1262  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1263  else
1264  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1265  break;
1266  }
1267  case AMDGPU::S_MEMTIME:
1268  case AMDGPU::S_MEMREALTIME:
1269  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1270  break;
1271  default:
1272  break;
1273  }
1274  }
1275 }
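// Worked illustration of the classification above (hypothetical instructions):
// a ds_read_b32 is scored as LDS_ACCESS and counts against LGKM_CNT; a global
// load is VMEM_ACCESS (or VMEM_READ_ACCESS on targets with a separate vscnt)
// and counts against VM_CNT; a buffer store on such targets is
// VMEM_WRITE_ACCESS and counts against VS_CNT, per WaitEventMaskForInst.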
1276 
1277 bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
1278  uint32_t OtherScore) {
1279  uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1280  uint32_t OtherShifted =
1281  OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1282  Score = std::max(MyShifted, OtherShifted);
1283  return OtherShifted > MyShifted;
1284 }
1285 
1286 /// Merge the pending events and associated score brackets of \p Other into
1287 /// this brackets status.
1288 ///
1289 /// Returns whether the merge resulted in a change that requires tighter waits
1290 /// (i.e. the merged brackets strictly dominate the original brackets).
1291 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1292  bool StrictDom = false;
1293 
1294  for (auto T : inst_counter_types()) {
1295  // Merge event flags for this counter
1296  const bool OldOutOfOrder = counterOutOfOrder(T);
1297  const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
1298  const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1299  if (OtherEvents & ~OldEvents)
1300  StrictDom = true;
1301  if (Other.MixedPendingEvents[T] ||
1302  (OldEvents && OtherEvents && OldEvents != OtherEvents))
1303  MixedPendingEvents[T] = true;
1304  PendingEvents |= OtherEvents;
1305 
1306  // Merge scores for this counter
1307  const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
1308  const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1309  MergeInfo M;
1310  M.OldLB = ScoreLBs[T];
1311  M.OtherLB = Other.ScoreLBs[T];
1312  M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
1313  M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;
1314 
1315  const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
1316  if (NewUB < ScoreUBs[T])
1317  report_fatal_error("waitcnt score overflow");
1318  ScoreUBs[T] = NewUB;
1319  ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
1320 
1321  StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1322 
1323  bool RegStrictDom = false;
1324  for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
1325  J++) {
1326  RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1327  }
1328 
1329  if (T == LGKM_CNT) {
1330  for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
1331  J != E; J++) {
1332  RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1333  }
1334  }
1335 
1336  if (RegStrictDom && !OldOutOfOrder)
1337  StrictDom = true;
1338  }
1339 
1340  VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR());
1341  SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR());
1342 
1343  return StrictDom;
1344 }
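// Worked example of the merge shift arithmetic (hypothetical scores): if this
// block has LB = 10, UB = 12 (2 pending) and Other has LB = 2, UB = 7 (5
// pending), then MyShift = 3 and OtherShift = 12 - 7 + 3 = 8. Both sides now
// describe 5 pending ops: our UB becomes 15, a local register score of 11 maps
// to 14, an incoming score of 5 maps to 13, and the new LB is min(13, 10) = 10.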
1345 
1346 // Generate s_waitcnt instructions where needed.
1347 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1348  MachineBasicBlock &Block,
1349  WaitcntBrackets &ScoreBrackets) {
1350  bool Modified = false;
1351 
1352  LLVM_DEBUG({
1353  dbgs() << "*** Block" << Block.getNumber() << " ***";
1354  ScoreBrackets.dump();
1355  });
1356 
1357  // Walk over the instructions.
1358  MachineInstr *OldWaitcntInstr = nullptr;
1359 
1360  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
1361  E = Block.instr_end();
1362  Iter != E;) {
1363  MachineInstr &Inst = *Iter;
1364 
1365  // Track pre-existing waitcnts from earlier iterations.
1366  if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1367  (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1368  Inst.getOperand(0).isReg() &&
1369  Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
1370  if (!OldWaitcntInstr)
1371  OldWaitcntInstr = &Inst;
1372  ++Iter;
1373  continue;
1374  }
1375 
1376  bool VCCZBugWorkAround = false;
1377  if (readsVCCZ(Inst) &&
1378  (!VCCZBugHandledSet.count(&Inst))) {
1379  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1380  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1381  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1382  if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
1383  VCCZBugWorkAround = true;
1384  }
1385  }
1386 
1387  // Generate an s_waitcnt instruction to be placed before
1388  // cur_Inst, if needed.
1389  Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1390  OldWaitcntInstr = nullptr;
1391 
1392  updateEventWaitcntAfter(Inst, &ScoreBrackets);
1393 
1394 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1395  // If this instruction generates a S_SETVSKIP because it is an
1396  // indexed resource, and we are on Tahiti, then it will also force
1397  // an S_WAITCNT vmcnt(0)
1398  if (RequireCheckResourceType(Inst, context)) {
1399  // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1400  ScoreBrackets->setScoreLB(VM_CNT,
1401  ScoreBrackets->getScoreUB(VM_CNT));
1402  }
1403 #endif
1404 
1405  LLVM_DEBUG({
1406  Inst.print(dbgs());
1407  ScoreBrackets.dump();
1408  });
1409 
1410  // TODO: Remove this work-around after fixing the scheduler and enable the
1411  // assert above.
1412  if (VCCZBugWorkAround) {
1413  // Restore the vccz bit. Any time a value is written to vcc, the vcc
1414  // bit is updated, so we can restore the bit by reading the value of
1415  // vcc and then writing it back to the register.
1416  BuildMI(Block, Inst, Inst.getDebugLoc(),
1417  TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1418  TRI->getVCC())
1419  .addReg(TRI->getVCC());
1420  VCCZBugHandledSet.insert(&Inst);
1421  Modified = true;
1422  }
1423 
1424  ++Iter;
1425  }
1426 
1427  return Modified;
1428 }
1429 
1430 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1431  ST = &MF.getSubtarget<GCNSubtarget>();
1432  TII = ST->getInstrInfo();
1433  TRI = &TII->getRegisterInfo();
1434  MRI = &MF.getRegInfo();
1435  IV = AMDGPU::getIsaVersion(ST->getCPU());
1436  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1437 
1438  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1439  for (auto T : inst_counter_types())
1440  ForceEmitWaitcnt[T] = false;
1441 
1442  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1443  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1444  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1445  HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
1446 
1447  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1448  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1449  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1450  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1451 
1452  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1453  RegisterEncoding.VGPRL =
1454  RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1455  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1456  RegisterEncoding.SGPRL =
1457  RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1458 
1459  TrackedWaitcntSet.clear();
1460  VCCZBugHandledSet.clear();
1461  RpotIdxMap.clear();
1462  BlockInfos.clear();
1463 
1464  // Keep iterating over the blocks in reverse post order, inserting and
1465  // updating s_waitcnt where needed, until a fix point is reached.
1466  for (MachineBasicBlock *MBB :
1467  ReversePostOrderTraversal<MachineFunction *>(&MF)) {
1468  RpotIdxMap[MBB] = BlockInfos.size();
1469  BlockInfos.emplace_back(MBB);
1470  }
1471 
1472  std::unique_ptr<WaitcntBrackets> Brackets;
1473  bool Modified = false;
1474  bool Repeat;
1475  do {
1476  Repeat = false;
1477 
1478  for (BlockInfo &BI : BlockInfos) {
1479  if (!BI.Dirty)
1480  continue;
1481 
1482  unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
1483 
1484  if (BI.Incoming) {
1485  if (!Brackets)
1486  Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
1487  else
1488  *Brackets = *BI.Incoming;
1489  } else {
1490  if (!Brackets)
1491  Brackets = std::make_unique<WaitcntBrackets>(ST);
1492  else
1493  Brackets->clear();
1494  }
1495 
1496  Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1497  BI.Dirty = false;
1498 
1499  if (Brackets->hasPending()) {
1500  BlockInfo *MoveBracketsToSucc = nullptr;
1501  for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1502  unsigned SuccIdx = RpotIdxMap[Succ];
1503  BlockInfo &SuccBI = BlockInfos[SuccIdx];
1504  if (!SuccBI.Incoming) {
1505  SuccBI.Dirty = true;
1506  if (SuccIdx <= Idx)
1507  Repeat = true;
1508  if (!MoveBracketsToSucc) {
1509  MoveBracketsToSucc = &SuccBI;
1510  } else {
1511  SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
1512  }
1513  } else if (SuccBI.Incoming->merge(*Brackets)) {
1514  SuccBI.Dirty = true;
1515  if (SuccIdx <= Idx)
1516  Repeat = true;
1517  }
1518  }
1519  if (MoveBracketsToSucc)
1520  MoveBracketsToSucc->Incoming = std::move(Brackets);
1521  }
1522  }
1523  } while (Repeat);
1524 
1525  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1526 
1527  bool HaveScalarStores = false;
1528 
1529  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1530  ++BI) {
1531  MachineBasicBlock &MBB = *BI;
1532 
1533  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1534  ++I) {
1535  if (!HaveScalarStores && TII->isScalarStore(*I))
1536  HaveScalarStores = true;
1537 
1538  if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1539  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1540  EndPgmBlocks.push_back(&MBB);
1541  }
1542  }
1543 
1544  if (HaveScalarStores) {
1545  // If scalar writes are used, the cache must be flushed or else the next
1546  // wave to reuse the same scratch memory can be clobbered.
1547  //
1548  // Insert s_dcache_wb at wave termination points if there were any scalar
1549  // stores, and only if the cache hasn't already been flushed. This could be
1550  // improved by looking across blocks for flushes in postdominating blocks
1551  // from the stores but an explicitly requested flush is probably very rare.
1552  for (MachineBasicBlock *MBB : EndPgmBlocks) {
1553  bool SeenDCacheWB = false;
1554 
1555  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1556  ++I) {
1557  if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1558  SeenDCacheWB = true;
1559  else if (TII->isScalarStore(*I))
1560  SeenDCacheWB = false;
1561 
1562  // FIXME: It would be better to insert this before a waitcnt if any.
1563  if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1564  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1565  !SeenDCacheWB) {
1566  Modified = true;
1567  BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1568  }
1569  }
1570  }
1571  }
1572 
1573  if (!MFI->isEntryFunction()) {
1574  // Wait for any outstanding memory operations that the input registers may
1575  // depend on. We can't track them and it's better to do the wait after the
1576  // costly call sequence.
1577 
1578  // TODO: Could insert earlier and schedule more liberally with operations
1579  // that only use caller preserved registers.
1580  MachineBasicBlock &EntryBB = MF.front();
1581  if (ST->hasVscnt())
1582  BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
1583  TII->get(AMDGPU::S_WAITCNT_VSCNT))
1584  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1585  .addImm(0);
1586  BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1587  .addImm(0);
1588 
1589  Modified = true;
1590  }
1591 
1592  return Modified;
1593 }
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:284
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
self_iterator getIterator()
Definition: ilist_node.h:81
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:73
RegisterMapping
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
const MachineBasicBlock & front() const
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
bool isDebugInstr() const
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Address space for local memory.
Definition: AMDGPU.h:274
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:509
IsaVersion getIsaVersion(StringRef GPU)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
char & SIInsertWaitcntsID
Iterator for intrusive lists based on ilist_node.
MachineOperand class - Representation of each machine instruction operand.
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register...
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:301
int64_t getImm() const
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
A range adaptor for a pair of iterators.
static Waitcnt allZero(const IsaVersion &Version)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:225
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:256
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Representation of each machine instruction.
Definition: MachineInstr.h:64
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, false) INITIALIZE_PASS_END(SIInsertWaitcnts
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Interface definition for SIInstrInfo.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:444
#define DEBUG_TYPE
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
#define I(x, y, z)
Definition: MD5.cpp:58
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:340
uint32_t Size
Definition: Profile.cpp:46
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Address space for flat memory.
Definition: AMDGPU.h:269
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
Definition: MachineInstr.h:825
bool memoperands_empty() const
Return true if we don&#39;t have any memory operands which described the memory access done by this instr...
Definition: MachineInstr.h:564
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:45
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
unsigned getLgkmcntBitMask(const IsaVersion &Version)
ProcessInfo Wait(const ProcessInfo &PI, unsigned SecondsToWait, bool WaitUntilTerminates, std::string *ErrMsg=nullptr)
This function waits for the process specified by PI to finish.
bool operator==(uint64_t V1, const APInt &V2)
Definition: APInt.h:1975
Register getReg() const
getReg - Returns the register number.
#define LLVM_DEBUG(X)
Definition: Debug.h:122
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:416
WaitEventType
unsigned getVmcntBitMask(const IsaVersion &Version)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
const SIRegisterInfo * getRegisterInfo() const override