//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

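// For example, passing -amdgpu-mfma-padding-ratio=50 requests that roughly
// half of the latency between two neighboring MFMAs be filled with s_nops.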
static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
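  // Bound the look-ahead window for the bookkeeping deque below. The larger
  // value when AGPRs are in use reflects that MFMA hazards can require far
  // more wait states than the ordinary hazards covered by the smaller window.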
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  TSchedModel.init(&ST);
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isDGEMM(unsigned Opcode) {
  return AMDGPU::getMAIIsDGEMM(Opcode);
}

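// Returns true if the MAI instruction \p MI is issued to the XDL pipe, i.e.
// anything other than DGEMM and the ACCVGPR read/write pseudos; on gfx940
// the instruction table is consulted directly.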
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

  if (!SIInstrInfo::isMAI(MI) ||
      isDGEMM(Opcode) ||
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  if (!ST.hasGFX940Insts())
    return true;

  return AMDGPU::getMAIIsGFX940XDL(Opcode);
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
}

static bool isLdsDma(const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) &&
         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
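  // s_nop's immediate encodes (wait states - 1) and is limited to [0, 7], so
  // each s_nop covers at most 8 wait states; e.g. Quantity == 10 emits
  // s_nop 7 followed by s_nop 1.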
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

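// Returns the pipeline occupancy of MFMA \p MI in cycles, taken from the
// first write-proc-res entry of its resolved scheduling class.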
unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->Cycles;
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  CurrCycleInstr = MI;
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() && MI->readsRegister(AMDGPU::LDS_DIRECT)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };

typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;

// Search for a hazard in a block and its predecessors.
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // No need to look at parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    switch (IsHazard(State, *I)) {
    case HazardFound:
      return true;
    case HazardExpired:
      return false;
    default:
      // Continue search.
      break;
    }

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }

  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
                  Visited))
      return true;
  }

  return false;
}

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(
    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

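  // Scheduler mode: walk the bookkeeping deque instead of the MIR. Each deque
  // entry is one wait state; multi-cycle instructions were padded with
  // nullptr entries by AdvanceCycle() and processBundle().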
  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands?
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.
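  //
  // For example (hypothetical registers), in the soft clause
  //   s_load_dwordx2 s[4:5], s[0:1], 0x0
  //   s_load_dword   s6, s[4:5], 0x0
  // the second load's address register is written by the first; with
  // out-of-order completion or replay the address value it reads is
  // unpredictable, so the clause must be broken.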

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if
  // they use the same address. For now, just start a new clause whenever we
  // see a store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the
    // descriptor need some number of nops in between. We don't know how many
    // we need, but let's use 4. This wasn't discovered before probably
    // because the only case when this happens is when we expand a 64-bit
    // pointer into a full descriptor and use s_buffer_load_dword instead of
    // s_load_dword, which was probably never encountered in closed-source
    // land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1).
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for
  // them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      const SIInstrInfo *TII = ST.getInstrInfo();
      if (SIInstrInfo::isSDWA(MI)) {
        if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
          if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
            return false;
      } else {
        if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
            !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
                  ->getImm() &
              SISrcMods::DST_OP_SEL))
          return false;
      }
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
        Register Def = Dst->getReg();

        for (const MachineOperand &Use : VALU->explicit_uses()) {
          if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded =
        std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // See checkVALUHazards().
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded =
          std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixWMMAHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return (TII->isVOPC(MI) ||
            ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
           MI.modifiesRegister(AMDGPU::EXEC, TRI);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            MI.getOperand(0).getImm() == 0xffe3);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
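  // 0xffe3 is understood to be the DEPCTR encoding of vm_vsrc == 0, i.e.
  // wait until all outstanding VMEM/LDS accesses have read their register
  // sources.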
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xffe3);
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        return (Decoded.LgkmCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

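  // 0xfffe is understood to be the DEPCTR encoding of sa_sdst == 0, i.e. wait
  // for outstanding SALU SGPR writes, which is sufficient to order the prior
  // EXEC read against the upcoming write.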
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xfffe);
  return true;
}

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and VMEM
  // instructions need to appear in the same function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |=
          SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

static bool isStoreCountWaitZero(const MachineInstr &I) {
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}

bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(I))
      return false;
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
    // Cover both WAR and WAW.
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
    // Instructions which cause va_vdst==0 expire the hazard.
    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
           SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
  };
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) ? 1 : 0;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,
                                    IsExpiredFn, Visited, GetWaitStatesFn);

  // Transcendentals can execute in parallel to other VALUs.
  // This makes va_vdst count unusable with a mixture of VALU and TRANS.
  if (VisitedTrans)
    Count = 0;

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

  return true;
}

bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
        !SIInstrInfo::isDS(I))
      return false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            I.getOperand(0).getImm() == 0xffe3);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

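  // As in fixVMEMtoScalarWriteHazards(), 0xffe3 is understood to encode
  // vm_vsrc == 0.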
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xffe3);

  return true;
}

bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.isWave64())
    return false;
  if (!ST.hasVALUPartialForwardingHazard())
    return false;
  if (!SIInstrInfo::isVALU(*MI))
    return false;

  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Only applies with >= 2 unique VGPR sources.
  if (SrcVGPRs.size() <= 1)
    return false;

  // Look for the following pattern:
  //   Va <- VALU [PreExecPos]
  //   intv1
  //   Exec <- SALU [ExecPos]
  //   intv2
  //   Vb <- VALU [PostExecPos]
  //   intv3
  //   MI Va, Vb (WaitState = 0)
  //
  // Where:
  //   intv1 + intv2 <= 2 VALUs
  //   intv3 <= 4 VALUs
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  struct StateType {
    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();
    int VALUs = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection.
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed.
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire the hazard.
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         I.getOperand(0).getImm() == 0x0fff))
      return HazardExpired;

    // Track register writes.
    bool Changed = false;
    if (SIInstrInfo::isVALU(I)) {
      for (Register Src : SrcVGPRs) {
        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
          State.DefPos[Src] = State.VALUs;
          Changed = true;
        }
      }
    } else if (SIInstrInfo::isSALU(I)) {
      if (State.ExecPos == std::numeric_limits<int>::max()) {
        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
          State.ExecPos = State.VALUs;
          Changed = true;
        }
      }
    }

    // Early expiration: too many VALUs in intv3.
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

    // Only evaluate state if something changed.
    if (!Changed)
      return NoHazardFound;

    // Determine positions of VALUs pre/post exec change.
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else if (DefVALUs < State.ExecPos)
          PostExecPos = std::min(PostExecPos, DefVALUs);
      }
    }

    // Need a VALU post exec change.
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv3?
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv2?
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Need a VALU pre exec change.
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv1?
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv1 + intv2?
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    return HazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

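  // Hazard found: force va_vdst == 0 (0x0fff is understood to be its DEPCTR
  // encoding) so that all pending VALU results are written before MI issues.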
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0x0fff);

  return true;
}

bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;
  if (!SIInstrInfo::isVALU(*MI))
    return false;

  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Look for the following pattern:
  //   Va <- TRANS VALU
  //   intv
  //   MI Va (WaitState = 0)
  //
  // Where:
  //   intv <= 5 VALUs / 1 TRANS
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  struct StateType {
    int VALUs = 0;
    int TRANS = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection.
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed.
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire the hazard.
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         I.getOperand(0).getImm() == 0x0fff))
      return HazardExpired;

    // Track register writes.
    if (SIInstrInfo::isTRANS(I)) {
      for (Register Src : SrcVGPRs) {
        if (I.modifiesRegister(Src, &TRI)) {
          return HazardFound;
        }
      }
    }

    return NoHazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
    if (SIInstrInfo::isTRANS(MI))
      State.TRANS += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  // Hazard is observed - insert a wait on the va_vdst counter to ensure the
  // hazard is avoided (the mask 0x0fff achieves this).
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0x0fff);

  return true;
}

bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
  if (!SIInstrInfo::isWMMA(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) {
    if (!SIInstrInfo::isWMMA(I))
      return false;

    // Src0 or Src1 of the current wmma instruction overlaps with the dest of
    // the previous wmma.
    const Register CurSrc0Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    const Register CurSrc1Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    const Register PrevDstReg =
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
      return true;
    }

    // Src2 of the current wmma instruction overlaps with the dest of the
    // previous wmma.
    const MachineOperand *Src2 =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
    const Register CurSrc2Reg = Src2->isReg() ? Src2->getReg() : Register();

    if (CurSrc2Reg != AMDGPU::NoRegister &&
        TRI->regsOverlap(PrevDstReg, CurSrc2Reg)) {

      const MachineOperand *Src2Mods =
          TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
      const bool NoSrc2Mods =
          (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0;
      // Exception: there is no hazard if the wmma instructions are of the
      // same type and there is no input modifier on src2 of the current
      // instruction.
      return !(NoSrc2Mods && (TII->pseudoToMCOpcode(I.getOpcode()) ==
                              TII->pseudoToMCOpcode(MI->getOpcode())));
    }

    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

  return true;
}

bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;

  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

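  // On affected subtargets, a 64-bit shift whose amount operand is the last
  // VGPR of an allocation granule is believed to read a stale amount. The
  // workaround below swaps the amount into a known-safe scratch VGPR around
  // the shift, re-running the hazard recognizer on the inserted swaps.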
1772  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1773  if (!Amt->isReg())
1774  return false;
1775 
1776  Register AmtReg = Amt->getReg();
1777  const MachineRegisterInfo &MRI = MF.getRegInfo();
1778  // Check if this is a last VGPR in the allocation block.
1779  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1780  return false;
1781 
1782  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1783  return false;
1784 
1785  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1786  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1787  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1788  bool Overlapped = OverlappedSrc || OverlappedDst;
1789 
1790  assert(!OverlappedDst || !OverlappedSrc ||
1791  Src1->getReg() == MI->getOperand(0).getReg());
1792  assert(ST.needsAlignedVGPRs());
1793  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1794 
1795  Register NewReg;
1796  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1797  : AMDGPU::VGPR_32RegClass) {
1798  if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1799  NewReg = Reg;
1800  break;
1801  }
1802  }
1803 
1804  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1805  : NewReg;
1806  Register NewAmtLo;
1807 
1808  if (Overlapped)
1809  NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1810 
1811  DebugLoc DL = MI->getDebugLoc();
1812  MachineBasicBlock *MBB = MI->getParent();
1813  // Insert a full wait count because found register might be pending a wait.
1814  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1815  .addImm(0);
1816 
 1817  // Insert V_SWAP_B32 instruction(s) and run the hazard recognizer on them.
1818  if (Overlapped)
1819  runOnInstruction(
1820  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1821  .addDef(AmtReg - 1)
1822  .addReg(AmtReg - 1, RegState::Undef)
1823  .addReg(NewAmtLo, RegState::Undef));
1824  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1825  .addDef(AmtReg)
1826  .addReg(AmtReg, RegState::Undef)
1827  .addReg(NewAmt, RegState::Undef));
1828 
 1829  // Instructions emitted after the current instruction will be picked up by
 1830  // the hazard recognizer's parent loop in the normal course of processing.
1831  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1832  AmtReg)
1833  .addDef(NewAmt)
1834  .addReg(NewAmt)
1835  .addReg(AmtReg);
1836  if (Overlapped)
1837  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1838  AmtReg - 1)
1839  .addDef(NewAmtLo)
1840  .addReg(NewAmtLo)
1841  .addReg(AmtReg - 1);
1842 
 1843  // Re-running the hazard recognizer on the modified instruction is not
 1844  // necessary: the inserted V_SWAP_B32s already both read and write the new
 1845  // registers, so hazards related to those registers have already been handled.
1846  Amt->setReg(NewAmt);
1847  Amt->setIsKill(false);
 1848  // We do not update liveness, so the verifier may see it as undef.
1849  Amt->setIsUndef();
1850  if (OverlappedDst)
1851  MI->getOperand(0).setReg(NewReg);
1852  if (OverlappedSrc) {
1853  Src1->setReg(NewReg);
1854  Src1->setIsKill(false);
1855  Src1->setIsUndef();
1856  }
1857 
1858  return true;
1859 }
1860 
1861 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1862  int NSAtoVMEMWaitStates = 1;
1863 
1864  if (!ST.hasNSAtoVMEMBug())
1865  return 0;
1866 
 1867  if (!MI || SIInstrInfo::isMIMG(*MI))
 1868  return 0;
1869 
1870  const SIInstrInfo *TII = ST.getInstrInfo();
1871  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
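  // Per the mask below, only offsets with bit 1 or bit 2 set can trigger the
  // hazard.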
1872  if (!Offset || (Offset->getImm() & 6) == 0)
1873  return 0;
1874 
1875  auto IsHazardFn = [TII](const MachineInstr &I) {
1876  if (!SIInstrInfo::isMIMG(I))
1877  return false;
1878  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1879  return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1880  TII->getInstSizeInBytes(I) >= 16;
1881  };
1882 
1883  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1884 }
1885 
1886 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1887  int FPAtomicToDenormModeWaitStates = 3;
1888 
 1889  if (!ST.hasFPAtomicToDenormModeHazard())
 1890  return 0;
1891 
1892  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1893  return 0;
1894 
1895  auto IsHazardFn = [](const MachineInstr &I) {
 1896  if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
 1897  return false;
1898  return SIInstrInfo::isFPAtomic(I);
1899  };
1900 
1901  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1902  if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1903  return true;
1904 
1905  switch (MI.getOpcode()) {
1906  case AMDGPU::S_WAITCNT:
1907  case AMDGPU::S_WAITCNT_VSCNT:
1908  case AMDGPU::S_WAITCNT_VMCNT:
1909  case AMDGPU::S_WAITCNT_EXPCNT:
1910  case AMDGPU::S_WAITCNT_LGKMCNT:
1911  case AMDGPU::S_WAIT_IDLE:
1912  return true;
1913  default:
1914  break;
1915  }
1916 
1917  return false;
1918  };
1919 
1920  return FPAtomicToDenormModeWaitStates -
1921  ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1922 }
1923 
1924 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
 1925  assert(SIInstrInfo::isMFMA(*MI));
 1926 
1927  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1928 }
1929 
1930 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1931  // Early exit if no padding is requested.
1932  if (MFMAPaddingRatio == 0)
1933  return 0;
1934 
 1935  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 1936  if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
1937  return 0;
1938 
1939  int NeighborMFMALatency = 0;
1940  auto IsNeighboringMFMA = [&NeighborMFMALatency,
1941  this](const MachineInstr &MI) {
1942  if (!SIInstrInfo::isMFMA(MI))
1943  return false;
1944 
1945  NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1946  return true;
1947  };
1948 
1949  const int MaxMFMAPipelineWaitStates = 16;
1950  int WaitStatesSinceNeighborMFMA =
1951  getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
1952 
1953  int NeighborMFMAPaddingNeeded =
1954  (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1955  WaitStatesSinceNeighborMFMA;
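  // Worked example with illustrative numbers: NeighborMFMALatency = 16 and
  // MFMAPaddingRatio = 50 request 16 * 50 / 100 = 8 wait states; if 3 have
  // already elapsed, max(0, 8 - 3) = 5 s_nops of padding are still needed.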
1956 
1957  return std::max(0, NeighborMFMAPaddingNeeded);
1958 }
1959 
1960 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1961  int WaitStatesNeeded = 0;
1962  unsigned Opc = MI->getOpcode();
1963 
1964  auto IsVALUFn = [](const MachineInstr &MI) {
1965  return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
1966  };
1967 
1968  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1969  const int LegacyVALUWritesVGPRWaitStates = 2;
1970  const int VALUWritesExecWaitStates = 4;
1971  const int MaxWaitStates = 4;
1972 
1973  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1974  getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1975  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1976 
1977  if (WaitStatesNeeded < MaxWaitStates) {
1978  for (const MachineOperand &Use : MI->explicit_uses()) {
1979  const int MaxWaitStates = 2;
1980 
1981  if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1982  continue;
1983 
1984  int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1985  getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1986  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1987 
1988  if (WaitStatesNeeded == MaxWaitStates)
1989  break;
1990  }
1991  }
1992  }
1993 
1994  for (const MachineOperand &Op : MI->explicit_operands()) {
1995  if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1996  continue;
1997 
1998  if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1999  continue;
2000 
2001  const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2002  const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2003  const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2004  const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2005  const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2006  const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2007  const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2008  const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2009  const int MaxWaitStates = 18;
2010  Register Reg = Op.getReg();
2011  unsigned HazardDefLatency = 0;
2012 
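  // The producer's latency identifies the MFMA shape used in the constants
  // above: 2 passes for 4x4, 8 for 16x16, and 16 (the default case) for 32x32.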
2013  auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2014  this](const MachineInstr &MI) {
2015  if (!SIInstrInfo::isMFMA(MI))
2016  return false;
2017  Register DstReg = MI.getOperand(0).getReg();
2018  if (DstReg == Reg)
2019  return false;
2020  HazardDefLatency =
2021  std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2022  return TRI.regsOverlap(DstReg, Reg);
2023  };
2024 
2025  int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2026  MaxWaitStates);
2027  int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2028  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2029  int OpNo = MI->getOperandNo(&Op);
2030  if (OpNo == SrcCIdx) {
2031  NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2032  } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2033  switch (HazardDefLatency) {
2034  case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2035  break;
2036  case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2037  break;
2038  case 16: [[fallthrough]];
2039  default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2040  break;
2041  }
2042  } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2043  switch (HazardDefLatency) {
2044  case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2045  break;
2046  case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2047  break;
2048  case 16: [[fallthrough]];
2049  default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2050  break;
2051  }
2052  }
2053 
2054  int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2055  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2056 
2057  if (WaitStatesNeeded == MaxWaitStates)
2058  return WaitStatesNeeded; // Early exit.
2059 
2060  auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2061  if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2062  return false;
2063  Register DstReg = MI.getOperand(0).getReg();
2064  return TRI.regsOverlap(Reg, DstReg);
2065  };
2066 
2067  const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2068  const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2069  const int AccVGPRWriteAccVgprReadWaitStates = 3;
2070  NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2071  if (OpNo == SrcCIdx)
2072  NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2073  else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2074  NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2075 
2076  WaitStatesNeededForUse = NeedWaitStates -
2077  getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2078  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2079 
2080  if (WaitStatesNeeded == MaxWaitStates)
2081  return WaitStatesNeeded; // Early exit.
2082  }
2083 
2084  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2085  const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2086  const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2087  const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2088  const int MaxWaitStates = 13;
2089  Register DstReg = MI->getOperand(0).getReg();
2090  unsigned HazardDefLatency = 0;
2091 
2092  auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2093  this](const MachineInstr &MI) {
2094  if (!SIInstrInfo::isMFMA(MI))
2095  return false;
2096  Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2097  HazardDefLatency =
2098  std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2099  return TRI.regsOverlap(Reg, DstReg);
2100  };
2101 
2102  int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2103  int NeedWaitStates;
2104  switch (HazardDefLatency) {
2105  case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2106  break;
2107  case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2108  break;
2109  case 16: [[fallthrough]];
2110  default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2111  break;
2112  }
2113 
2114  int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2115  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2116  }
2117 
2118  // Pad neighboring MFMA with noops for better inter-wave performance.
2119  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2120 
2121  return WaitStatesNeeded;
2122 }
2123 
2124 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2125  int WaitStatesNeeded = 0;
2126  unsigned Opc = MI->getOpcode();
2127 
2128  auto IsLegacyVALUFn = [](const MachineInstr &MI) {
 2129  return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
 2130  };
2131 
2132  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
 2133  return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
 2134  !SIInstrInfo::isDOT(MI);
 2135  };
2136 
2137  if (!SIInstrInfo::isMFMA(*MI))
2138  return WaitStatesNeeded;
2139 
2140  const int VALUWritesExecWaitStates = 4;
2141  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2142  getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2143  VALUWritesExecWaitStates);
2144  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2145 
2146  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2147 
 2148  // Loop over the uses; this handles both DGEMM and S/HGEMM as the 2nd instruction.
2149  for (const MachineOperand &Use : MI->explicit_uses()) {
2150  const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2151  const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2152  const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3;
2153  const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5;
2154  const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4;
2155  const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9;
2156  const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8;
2157  const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17;
2158  const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16;
2159  const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2160  const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2161  const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2162  const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2163  const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2164  const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2165  const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2166  const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2167  const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2168  const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2169  const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4;
2170  const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6;
2171  const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10;
2172  const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18;
2173  const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5;
2174  const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7;
2175  const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11;
2176  const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19;
2177  const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2178  const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2179  const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2180  const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2181  const int MaxWaitStates = 19;
2182 
2183  if (!Use.isReg())
2184  continue;
2185  Register Reg = Use.getReg();
2186  bool FullReg;
2187  const MachineInstr *MI1;
2188 
2189  auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2190  this](const MachineInstr &MI) {
2191  if (!SIInstrInfo::isMFMA(MI))
2192  return false;
2193  Register DstReg = MI.getOperand(0).getReg();
2194  FullReg = (DstReg == Reg);
2195  MI1 = &MI;
2196  return TRI.regsOverlap(DstReg, Reg);
2197  };
2198 
2199  WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2200  getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2201  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2202 
2203  int NumWaitStates =
2204  getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2205  if (NumWaitStates == std::numeric_limits<int>::max())
2206  continue;
2207 
2208  int OpNo = MI->getOperandNo(&Use);
2209  unsigned Opc1 = MI1->getOpcode();
2210  int NeedWaitStates = 0;
2211  if (OpNo == SrcCIdx) {
2212  if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2213  NeedWaitStates = 0;
2214  } else if (FullReg) {
2215  if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2216  Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2217  (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2218  Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2219  NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2220  else if (ST.hasGFX940Insts() &&
2221  TSchedModel.computeInstrLatency(MI1) == 2)
2222  NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2223  } else {
2224  switch (Opc1) {
2225  case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2226  case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2227  case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2228  case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2229  if (!isXDL(ST, *MI))
2230  NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2231  break;
2232  case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2233  case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2234  if (!isXDL(ST, *MI))
2235  NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2236  break;
2237  default:
2238  if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1))
2239  break;
2240  switch (TSchedModel.computeInstrLatency(MI1)) {
2241  case 2:
2242  NeedWaitStates = ST.hasGFX940Insts()
2243  ? isXDL(ST, *MI1)
2244  ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates
2245  : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates
2246  : isDGEMM(Opc)
2247  ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2248  : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2249  break;
2250  case 4:
2251  assert(ST.hasGFX940Insts());
2252  NeedWaitStates = isXDL(ST, *MI1)
2253  ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates
2254  : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates;
2255  break;
2256  case 8:
2257  NeedWaitStates = ST.hasGFX940Insts()
2258  ? isXDL(ST, *MI1)
2259  ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates
2260  : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates
2261  : isDGEMM(Opc)
2262  ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2263  : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2264  break;
2265  case 16: [[fallthrough]];
2266  default:
2267  NeedWaitStates = ST.hasGFX940Insts()
2268  ? isXDL(ST, *MI1)
2269  ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates
2270  : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates
2271  : isDGEMM(Opc)
2272  ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2273  : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2274  }
2275  }
2276  }
2277  } else {
2278  switch (Opc1) {
2279  case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2280  case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2281  case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2282  case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2283  NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2284  break;
2285  case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2286  case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2287  NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2288  break;
2289  default:
2290  switch (TSchedModel.computeInstrLatency(MI1)) {
2291  case 2:
2292  NeedWaitStates = ST.hasGFX940Insts()
2293  ? isXDL(ST, *MI1)
2294  ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates
2295  : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates
2296  : SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2297  break;
2298  case 4:
2299  assert(ST.hasGFX940Insts());
2300  NeedWaitStates = isXDL(ST, *MI1)
2301  ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates
2302  : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates;
2303  break;
2304  case 8:
2305  NeedWaitStates = ST.hasGFX940Insts()
2306  ? isXDL(ST, *MI1)
2307  ? GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates
2308  : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates
2309  : SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2310  break;
2311  case 16: [[fallthrough]];
2312  default:
2313  NeedWaitStates = ST.hasGFX940Insts()
2314  ? isXDL(ST, *MI1)
2315  ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates
2316  : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates
2317  : SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2318  }
2319  }
2320  }
2321  if (WaitStatesNeeded >= NeedWaitStates)
2322  continue;
2323 
2324  WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2325  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2326 
2327  if (WaitStatesNeeded == MaxWaitStates)
2328  break;
2329  }
2330 
2331  return WaitStatesNeeded;
2332 }
2333 
2334 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
 2335  // On gfx90a+ the relevant hazards are checked in checkMAIVALUHazards().
2336  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2337  return 0;
2338 
2339  int WaitStatesNeeded = 0;
2340 
2341  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2342  return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2343  };
2344 
2345  for (const MachineOperand &Op : MI->explicit_uses()) {
2346  if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2347  continue;
2348 
2349  Register Reg = Op.getReg();
2350 
2351  const int AccVgprReadLdStWaitStates = 2;
2352  const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2353  const int MaxWaitStates = 2;
2354 
2355  int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2356  getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2357  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2358 
2359  if (WaitStatesNeeded == MaxWaitStates)
2360  return WaitStatesNeeded; // Early exit.
2361 
2362  auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2363  if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2364  MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2365  return false;
2366  auto IsVALUFn = [](const MachineInstr &MI) {
 2367  return SIInstrInfo::isVALU(MI);
 2368  };
2369  return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
 2370  std::numeric_limits<int>::max();
 2371  };
2372 
2373  WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2374  getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2375  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2376  }
2377 
2378  return WaitStatesNeeded;
2379 }
2380 
2381 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2382  if (!ST.hasGFX90AInsts())
2383  return 0;
2384 
2385  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2386  return isDGEMM(MI.getOpcode());
2387  };
2388 
2389  // This is checked in checkMAIHazards90A()
2390  if (SIInstrInfo::isMFMA(*MI))
2391  return 0;
2392 
2393  const MachineRegisterInfo &MRI = MF.getRegInfo();
2394 
2395  int WaitStatesNeeded = 0;
2396 
2397  bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2398  SIInstrInfo::isFLAT(*MI) ||
 2399  SIInstrInfo::isDS(*MI);
 2400  bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2401  bool IsVALU = SIInstrInfo::isVALU(*MI);
2402 
2403  const MachineInstr *MFMA = nullptr;
2404  unsigned Reg;
2405  auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2406  if (!SIInstrInfo::isMFMA(MI) ||
2407  !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2408  return false;
2409  MFMA = &MI;
2410  return true;
2411  };
2412 
2413  const MachineInstr *DOT = nullptr;
2414  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2415  if (!SIInstrInfo::isDOT(MI) ||
2416  !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2417  return false;
2418  DOT = &MI;
2419  return true;
2420  };
2421 
2422  bool DGEMMAfterVALUWrite = false;
2423  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2424  // Found DGEMM on reverse traversal to def.
2425  if (isDGEMM(MI.getOpcode()))
2426  DGEMMAfterVALUWrite = true;
2427 
 2428  // Only hazard if the register is defined by a VALU and a DGEMM is found
 2429  // after the def.
2430  if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2431  return false;
2432 
2433  return true;
2434  };
2435 
2436  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2437  AMDGPU::OpName::src2);
2438 
2439  if (IsMemOrExport || IsVALU) {
2440  const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2441  const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2442  const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2443  const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4;
2444  const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6;
2445  const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10;
2446  const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18;
2447  const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5;
2448  const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7;
2449  const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11;
2450  const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19;
2451  const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2452  const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2453  const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2454  const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2455  const int DotWriteSameDotReadSrcAB = 3;
2456  const int DotWriteDifferentVALURead = 3;
2457  const int DMFMABetweenVALUWriteVMEMRead = 2;
2458  const int MaxWaitStates = 19;
2459 
2460  for (const MachineOperand &Use : MI->explicit_uses()) {
2461  if (!Use.isReg())
2462  continue;
2463  Reg = Use.getReg();
2464 
2465  DOT = nullptr;
2466  int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2467  MaxWaitStates);
2468  if (DOT) {
2469  int NeedWaitStates = 0;
2470  if (DOT->getOpcode() == MI->getOpcode()) {
2471  if (&Use - &MI->getOperand(0) != SrcCIdx)
2472  NeedWaitStates = DotWriteSameDotReadSrcAB;
2473  } else {
2474  NeedWaitStates = DotWriteDifferentVALURead;
2475  }
2476 
2477  int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2478  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2479  }
2480 
 2481  // Workaround for a HW data hazard bug observed only on GFX90A. When a
 2482  // DGEMM instruction sits in-between a VALU and a VMEM instruction, the SQ
 2483  // incorrectly fails to insert the two wait states between the two
 2484  // instructions that are needed to avoid the data hazard.
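  // A hypothetical sequence exhibiting the bug (registers chosen for
  // illustration only):
  //   v_mov_b32 v0, v1              ; VALU writes v0
  //   v_mfma_f64_16x16x4f64 ...     ; DGEMM in between
  //   flat_store_dword v[2:3], v0   ; VMEM reads v0 too early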
2485  if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2486  DGEMMAfterVALUWrite = false;
2487  if (TRI.isVectorRegister(MRI, Reg)) {
2488  int WaitStatesNeededForUse =
2489  DMFMABetweenVALUWriteVMEMRead -
2490  getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2491  DMFMABetweenVALUWriteVMEMRead);
2492 
2493  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2494  }
2495  }
2496 
2497  MFMA = nullptr;
2498  WaitStatesSinceDef =
2499  getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2500  if (!MFMA)
2501  continue;
2502 
2503  unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2504  int NeedWaitStates = MaxWaitStates;
2505  switch (HazardDefLatency) {
2506  case 2:
2507  NeedWaitStates =
2508  ST.hasGFX940Insts()
2509  ? isXDL(ST, *MFMA)
2510  ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates
2511  : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates
2512  : SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2513  break;
2514  case 4:
2515  assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2516  NeedWaitStates =
2517  isDGEMM(MFMA->getOpcode())
2518  ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2519  : DMFMA4x4WriteVgprVALUReadWaitStates
2520  : isXDL(ST, *MFMA)
2521  ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates
2522  : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates;
2523  break;
2524  case 8:
2525  NeedWaitStates =
2526  ST.hasGFX940Insts()
2527  ? isXDL(ST, *MFMA)
2528  ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
2529  : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
2530  : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2531  break;
2532  case 16: [[fallthrough]];
2533  default:
2534  NeedWaitStates =
2535  isDGEMM(MFMA->getOpcode())
2536  ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
2537  : DMFMA16x16WriteVgprVALUReadWaitStates
2538  : ST.hasGFX940Insts()
2539  ? isXDL(ST, *MFMA)
2540  ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
2541  : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
2542  : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2543  break;
2544  }
2545 
2546  int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2547  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2548 
2549  if (WaitStatesNeeded == MaxWaitStates)
2550  break;
2551  }
2552  }
2553 
2554  unsigned Opc = MI->getOpcode();
2555  const int DMFMAToFMA64WaitStates = 2;
2556  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2557  Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2558  Opc == AMDGPU::V_FMAC_F64_dpp) &&
2559  WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2560  int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2561  getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2562  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2563  }
2564 
2565  if (!IsVALU && !IsMemOrExport)
2566  return WaitStatesNeeded;
2567 
2568  for (const MachineOperand &Def : MI->defs()) {
2569  const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2570  const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2571  const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2572  const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4;
2573  const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6;
2574  const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10;
2575  const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18;
2576  const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5;
2577  const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7;
2578  const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11;
2579  const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19;
2580  const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2581  const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2582  const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2583  const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2584  const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2585  const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2586  const int DotWriteDifferentVALUWrite = 3;
2587  const int MaxWaitStates = 19;
2588  const int MaxWarWaitStates = 15;
2589 
2590  Reg = Def.getReg();
2591 
2592  DOT = nullptr;
2593  int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2594  MaxWaitStates);
2595  if (DOT && DOT->getOpcode() != MI->getOpcode())
2596  WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2597  WaitStatesSinceDef);
2598 
2599  MFMA = nullptr;
2600  WaitStatesSinceDef =
2601  getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2602  if (MFMA) {
2603  int NeedWaitStates = MaxWaitStates;
2604  switch (TSchedModel.computeInstrLatency(MFMA)) {
2605  case 2:
2606  NeedWaitStates = ST.hasGFX940Insts()
2607  ? isXDL(ST, *MFMA)
2608  ? GFX940_XDL2PassWriteVgprVALUWawWaitStates
2609  : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates
2610  : SMFMA4x4WriteVgprVALUWawWaitStates;
2611  break;
2612  case 4:
2613  assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2614  NeedWaitStates = isDGEMM(MFMA->getOpcode())
2615  ? DMFMA4x4WriteVgprVALUWriteWaitStates
2616  : isXDL(ST, *MFMA)
2617  ? GFX940_XDL4PassWriteVgprVALUWawWaitStates
2618  : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
2619  break;
2620  case 8:
2621  NeedWaitStates = ST.hasGFX940Insts()
2622  ? isXDL(ST, *MFMA)
2623  ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
2624  : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
2625  : SMFMA16x16WriteVgprVALUWawWaitStates;
2626  break;
2627  case 16: [[fallthrough]];
2628  default:
2629  NeedWaitStates = isDGEMM(MFMA->getOpcode())
2630  ? DMFMA16x16WriteVgprVALUWriteWaitStates
2631  : ST.hasGFX940Insts()
2632  ? isXDL(ST, *MFMA)
2633  ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
2634  : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
2635  : SMFMA32x32WriteVgprVALUWawWaitStates;
2636  break;
2637  }
2638 
2639  int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2640  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2641 
2642  if (WaitStatesNeeded == MaxWaitStates)
2643  break;
2644  }
2645 
2646  auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2647  if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2648  !MI.readsRegister(Reg, &TRI))
2649  return false;
2650 
2651  if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2652  return false;
2653 
2654  const MachineOperand *SrcC =
2655  TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2656  assert(SrcC);
2657  if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2658  return false;
2659 
2660  MFMA = &MI;
2661  return true;
2662  };
2663 
2664  MFMA = nullptr;
2665  int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2666  MaxWarWaitStates);
2667  if (!MFMA)
2668  continue;
2669 
2670  unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2671  int NeedWaitStates = MaxWaitStates;
2672  switch (HazardDefLatency) {
2673  case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2674  break;
2675  case 4: assert(ST.hasGFX940Insts());
2676  NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2677  break;
2678  case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2679  break;
2680  case 16: [[fallthrough]];
2681  default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2682  break;
2683  }
2684 
2685  int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2686  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2687  }
2688 
2689  return WaitStatesNeeded;
2690 }
2691 
 2692 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
 2693  if (!SU->isInstr())
2694  return false;
2695 
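  // Prefer scheduling another candidate if this MFMA would issue while a
  // previous MFMA may still be in the pipeline.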
2696  const MachineInstr *MAI = nullptr;
2697 
2698  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2699  MAI = nullptr;
2700  if (SIInstrInfo::isMFMA(MI))
2701  MAI = &MI;
2702  return MAI != nullptr;
2703  };
2704 
2705  MachineInstr *MI = SU->getInstr();
2706  if (IsMFMAFn(*MI)) {
2707  int W = getWaitStatesSince(IsMFMAFn, 16);
2708  if (MAI)
2709  return W < (int)TSchedModel.computeInstrLatency(MAI);
2710  }
2711 
2712  return false;
2713 }
2714 
2715 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2716  if (!ST.isWave64())
2717  return false;
2718  if (!ST.hasVALUMaskWriteHazard())
2719  return false;
2720  if (!SIInstrInfo::isSALU(*MI))
2721  return false;
2722 
2723  // The hazard sequence is three instructions:
2724  // 1. VALU reads SGPR as mask
2725  // 2. SALU writes SGPR
2726  // 3. SALU reads SGPR
2727  // The hazard can expire if the distance between 2 and 3 is sufficient.
 2728  // In practice this happens <10% of the time, hence we always assume the
 2729  // hazard exists when 1 and 2 are present, to avoid the search.
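  // A hypothetical sequence exhibiting the hazard (registers chosen for
  // illustration only):
  //   v_cndmask_b32 v0, v1, v2, s[2:3]   ; (1) VALU reads SGPR pair as mask
  //   s_mov_b64 s[2:3], exec             ; (2) SALU writes the same SGPRs
  //   s_and_b64 s[4:5], s[2:3], s[6:7]   ; (3) SALU reads them again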
2730 
2731  const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2732  if (!SDSTOp || !SDSTOp->isReg())
2733  return false;
2734 
2735  const Register HazardReg = SDSTOp->getReg();
2736  if (HazardReg == AMDGPU::EXEC ||
2737  HazardReg == AMDGPU::EXEC_LO ||
2738  HazardReg == AMDGPU::EXEC_HI ||
2739  HazardReg == AMDGPU::M0)
2740  return false;
2741 
2742  auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2743  switch (I.getOpcode()) {
2744  case AMDGPU::V_ADDC_U32_e32:
2745  case AMDGPU::V_ADDC_U32_dpp:
2746  case AMDGPU::V_CNDMASK_B16_e32:
2747  case AMDGPU::V_CNDMASK_B16_dpp:
2748  case AMDGPU::V_CNDMASK_B32_e32:
2749  case AMDGPU::V_CNDMASK_B32_dpp:
2750  case AMDGPU::V_DIV_FMAS_F32_e64:
2751  case AMDGPU::V_DIV_FMAS_F64_e64:
2752  case AMDGPU::V_SUBB_U32_e32:
2753  case AMDGPU::V_SUBB_U32_dpp:
2754  case AMDGPU::V_SUBBREV_U32_e32:
2755  case AMDGPU::V_SUBBREV_U32_dpp:
2756  // These implicitly read VCC as mask source.
2757  return HazardReg == AMDGPU::VCC ||
2758  HazardReg == AMDGPU::VCC_LO ||
2759  HazardReg == AMDGPU::VCC_HI;
2760  case AMDGPU::V_ADDC_U32_e64:
2761  case AMDGPU::V_ADDC_U32_e64_dpp:
2762  case AMDGPU::V_CNDMASK_B16_e64:
2763  case AMDGPU::V_CNDMASK_B16_e64_dpp:
2764  case AMDGPU::V_CNDMASK_B32_e64:
2765  case AMDGPU::V_CNDMASK_B32_e64_dpp:
2766  case AMDGPU::V_SUBB_U32_e64:
2767  case AMDGPU::V_SUBB_U32_e64_dpp:
2768  case AMDGPU::V_SUBBREV_U32_e64:
2769  case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2770  // Only check mask register overlaps.
2771  const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2772  assert(SSRCOp);
2773  return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2774  }
2775  default:
2776  return false;
2777  }
2778  };
2779 
2780  const MachineRegisterInfo &MRI = MF.getRegInfo();
2781  auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2782  // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2783  if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2784  !(I.getOperand(0).getImm() & 0x1))
2785  return true;
2786 
 2787  // VALU access to any SGPR or literal constant other than HazardReg
 2788  // mitigates the hazard. There is no need to check HazardReg here as this
 2789  // is only called when IsHazardFn did not match.
2790  if (!SIInstrInfo::isVALU(I))
2791  return false;
2792  for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2793  const MachineOperand &Op = I.getOperand(OpNo);
2794  if (Op.isReg()) {
2795  Register OpReg = Op.getReg();
2796  // Only consider uses
2797  if (!Op.isUse())
2798  continue;
2799  // Ignore EXEC
2800  if (OpReg == AMDGPU::EXEC ||
2801  OpReg == AMDGPU::EXEC_LO ||
2802  OpReg == AMDGPU::EXEC_HI)
2803  continue;
2804  // Ignore all implicit uses except VCC
2805  if (Op.isImplicit()) {
2806  if (OpReg == AMDGPU::VCC ||
2807  OpReg == AMDGPU::VCC_LO ||
2808  OpReg == AMDGPU::VCC_HI)
2809  return true;
2810  continue;
2811  }
2812  if (TRI.isSGPRReg(MRI, OpReg))
2813  return true;
2814  } else {
2815  const MCInstrDesc &InstDesc = I.getDesc();
2816  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
2817  if (!TII.isInlineConstant(Op, OpInfo))
2818  return true;
2819  }
2820  }
2821  return false;
2822  };
2823 
2824  // Check for hazard
2825  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
 2826  std::numeric_limits<int>::max())
 2827  return false;
2828 
2829  auto NextMI = std::next(MI->getIterator());
2830 
2831  // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
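  // The immediate 0xfffe is assumed to leave all other DEPCTR fields at their
  // "no wait" defaults while clearing bit 0 (sa_sdst), the same bit tested in
  // IsExpiredFn above.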
2832  BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2833  TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2834  .addImm(0xfffe);
2835 
2836  // SALU write may be s_getpc in a bundle.
2837  if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2838  // Update offsets of any references in the bundle.
2839  while (NextMI != MI->getParent()->end() &&
2840  NextMI->isBundledWithPred()) {
2841  for (auto &Operand : NextMI->operands()) {
2842  if (Operand.isGlobal())
2843  Operand.setOffset(Operand.getOffset() + 4);
2844  }
2845  NextMI++;
2846  }
2847  }
2848 
2849  return true;
2850 }
i
i
Definition: README.txt:29
llvm::GCNSubtarget::hasReadM0LdsDmaHazard
bool hasReadM0LdsDmaHazard() const
Definition: GCNSubtarget.h:984
llvm::GCNSubtarget::hasVDecCoExecHazard
bool hasVDecCoExecHazard() const
Definition: GCNSubtarget.h:1038
llvm::SIInstrInfo::insertNoops
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
Definition: SIInstrInfo.cpp:1811
llvm::TargetSchedModel::getWriteProcResBegin
ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const
Definition: TargetSchedule.h:133
ScheduleDAG.h
llvm::ScheduleHazardRecognizer::getMaxLookAhead
unsigned getMaxLookAhead() const
Definition: ScheduleHazardRecognizer.h:43
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:108
llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition: MachineInstrBuilder.h:131
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
llvm::AMDGPU::getMAIIsDGEMM
bool getMAIIsDGEMM(unsigned Opc)
Returns true if MAI operation is a double precision GEMM.
Definition: AMDGPUBaseInfo.cpp:410
llvm::tgtok::Def
@ Def
Definition: TGLexer.h:50
llvm::MachineRegisterInfo::isPhysRegUsed
bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Definition: MachineRegisterInfo.cpp:587
isStoreCountWaitZero
static bool isStoreCountWaitZero(const MachineInstr &I)
Definition: GCNHazardRecognizer.cpp:1334
llvm::GCNHazardRecognizer::getHazardType
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
Definition: GCNHazardRecognizer.cpp:181
llvm::GCNSubtarget::hasDstSelForwardingHazard
bool hasDstSelForwardingHazard() const
Definition: GCNSubtarget.h:1032
llvm::AMDGPU::getIsaVersion
IsaVersion getIsaVersion(StringRef GPU)
Definition: TargetParser.cpp:193
SIMachineFunctionInfo.h
llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition: MachineRegisterInfo.h:50
NoHazardFound
@ NoHazardFound
Definition: GCNHazardRecognizer.cpp:445
llvm::BitVector::set
BitVector & set()
Definition: BitVector.h:344
llvm::GCNSubtarget::hasVALUPartialForwardingHazard
bool hasVALUPartialForwardingHazard() const
Definition: GCNSubtarget.h:1060
llvm::BitVector::none
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:181
llvm::ScheduleHazardRecognizer::MaxLookAhead
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
Definition: ScheduleHazardRecognizer.h:31
llvm::SetVector< T, SmallVector< T, N >, SmallDenseSet< T, N > >::size
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
llvm::SIInstrInfo::isBufferSMRD
bool isBufferSMRD(const MachineInstr &MI) const
Definition: SIInstrInfo.cpp:7860
llvm::SIInstrInfo::getNumWaitStates
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
Definition: SIInstrInfo.cpp:1840
llvm::MachineOperand::setIsKill
void setIsKill(bool Val=true)
Definition: MachineOperand.h:509
llvm::RISCVFenceField::W
@ W
Definition: RISCVBaseInfo.h:266
llvm::GCNSubtarget::hasVMEMReadSGPRVALUDefHazard
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU In...
Definition: GCNSubtarget.h:446
llvm::GCNSubtarget::needsAlignedVGPRs
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
Definition: GCNSubtarget.h:1069
llvm::X86Disassembler::Reg
Reg
All possible values of the reg field in the ModR/M byte.
Definition: X86DisassemblerDecoder.h:462
llvm::GCNHazardRecognizer::GCNHazardRecognizer
GCNHazardRecognizer(const MachineFunction &MF)
Definition: GCNHazardRecognizer.cpp:53
llvm::SmallDenseMap
Definition: DenseMap.h:880
llvm::SIInstrInfo::isEXP
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:554
llvm::RegState::Define
@ Define
Register definition.
Definition: MachineInstrBuilder.h:44
isRWLane
static bool isRWLane(unsigned Opcode)
Definition: GCNHazardRecognizer.cpp:98
llvm::GCNSubtarget::getSetRegWaitStates
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
Definition: GCNSubtarget.h:455
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:140
llvm::MachineOperand::setImm
void setImm(int64_t immVal)
Definition: MachineOperand.h:664
llvm::GCNSubtarget::hasShift64HighRegBug
bool hasShift64HighRegBug() const
Definition: GCNSubtarget.h:1022
llvm::AMDGPU::Hwreg::ID_MASK_
@ ID_MASK_
Definition: SIDefines.h:413
llvm::ScheduleHazardRecognizer::Hazard
@ Hazard
Definition: ScheduleHazardRecognizer.h:39
isXDL
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI)
Definition: GCNHazardRecognizer.cpp:122
addRegUnits
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
Definition: GCNHazardRecognizer.cpp:589
llvm::SmallSet
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:136
llvm::SIInstrFlags::VALU
@ VALU
Definition: SIDefines.h:30
isSGetReg
static bool isSGetReg(unsigned Opcode)
Definition: GCNHazardRecognizer.cpp:83
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
llvm::max
Expected< ExpressionValue > max(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:337
TargetParser.h
llvm::AMDGPU::getNamedOperandIdx
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
llvm::detail::DenseSetImpl< ValueT, DenseMap< ValueT, detail::DenseSetEmpty, DenseMapInfo< ValueT >, detail::DenseSetPair< ValueT > >, DenseMapInfo< ValueT > >::insert
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1628
MFMAPaddingRatio
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
llvm::GCNSubtarget::getRegisterInfo
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:228
hasHazard
static bool hasHazard(StateT State, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, DenseSet< const MachineBasicBlock * > &Visited)
Definition: GCNHazardRecognizer.cpp:453
llvm::GCNSubtarget::hasLdsDirect
bool hasLdsDirect() const
Definition: GCNSubtarget.h:1058
llvm::GCNSubtarget::getInstrInfo
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:216
llvm::TargetSchedModel::init
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
Definition: TargetSchedule.cpp:47
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:187
llvm::SIInstrFlags::DPP
@ DPP
Definition: SIDefines.h:50
llvm::SIInstrInfo::isWMMA
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:677
HazardFound
@ HazardFound
Definition: GCNHazardRecognizer.cpp:445
llvm::MCOperandInfo
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:84
llvm::MachineInstrBuilder::addDef
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Definition: MachineInstrBuilder.h:116
llvm::AMDGPU::hasNamedOperand
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
Definition: AMDGPUBaseInfo.h:303
llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition: MachineFunction.h:667
llvm::SIInstrInfo::isMIMG
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:501
llvm::PPCISD::SC
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
Definition: PPCISelLowering.h:420
GCNSubtarget.h
getHWReg
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
Definition: GCNHazardRecognizer.cpp:174
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::MachineOperand::getImm
int64_t getImm() const
Definition: MachineOperand.h:546
llvm::MachineFunction::getInfo
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Definition: MachineFunction.h:755
int
Clang compiles this i1 i64 store i64 i64 store i64 i64 store i64 i64 store i64 align Which gets codegen d xmm0 movaps rbp movaps rbp movaps rbp movaps rbp rbp rbp rbp rbp It would be better to have movq s of instead of the movaps s LLVM produces ret int
Definition: README.txt:536
llvm::SIInstrInfo::isMAI
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:660
llvm::InlineAsm::MIOp_FirstOperand
@ MIOp_FirstOperand
Definition: InlineAsm.h:227
llvm::MCWriteProcResEntry::Cycles
uint16_t Cycles
Definition: MCSchedule.h:65
llvm::MCSchedClassDesc
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:109
shouldRunLdsBranchVmemWARHazardFixup
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
Definition: GCNHazardRecognizer.cpp:1313
llvm::GCNSubtarget::hasSMRDReadVALUDefHazard
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR was written by a VALU inst...
Definition: GCNSubtarget.h:440
isRFE
static bool isRFE(unsigned Opcode)
Definition: GCNHazardRecognizer.cpp:102
llvm::AMDGPU::getMAIIsGFX940XDL
bool getMAIIsGFX940XDL(unsigned Opc)
Definition: AMDGPUBaseInfo.cpp:415
false
Definition: StackSlotColoring.cpp:141
llvm::GCNSubtarget::hasGFX940Insts
bool hasGFX940Insts() const
Definition: GCNSubtarget.h:1095
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:125
llvm::GraphProgram::DOT
@ DOT
Definition: GraphWriter.h:51
llvm::MCInstrDesc
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:197
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:48
getWaitStatesSince
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
Definition: GCNHazardRecognizer.cpp:495
llvm::M0
unsigned M0(unsigned Val)
Definition: VE.h:465
llvm::SIInstrFlags::SMRD
@ SMRD
Definition: SIDefines.h:56
llvm::cl::parser
Definition: CommandLine.h:817
llvm::AMDGPU::decodeWaitcnt
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
Definition: AMDGPUBaseInfo.cpp:1223
llvm::AArch64PACKey::IA
@ IA
Definition: AArch64BaseInfo.h:819
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:30
llvm::TargetSchedModel::getWriteProcResEnd
ProcResIter getWriteProcResEnd(const MCSchedClassDesc *SC) const
Definition: TargetSchedule.h:137
llvm::BitVector
Definition: BitVector.h:75
llvm::MCOperandInfo::RegClass
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:90
llvm::GCNSubtarget::hasTransForwardingHazard
bool hasTransForwardingHazard() const
Definition: GCNSubtarget.h:1028
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:94
llvm::GCNSubtarget::hasVcmpxExecWARHazard
bool hasVcmpxExecWARHazard() const
Definition: GCNSubtarget.h:1012
llvm::GCNSubtarget::isWave64
bool isWave64() const
Definition: GCNSubtarget.h:1272
llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLFunctionalExtras.h:36
llvm::SISrcMods::DST_OP_SEL
@ DST_OP_SEL
Definition: SIDefines.h:225
llvm::TargetSchedModel::resolveSchedClass
const MCSchedClassDesc * resolveSchedClass(const MachineInstr *MI) const
Return the MCSchedClassDesc for this instruction.
Definition: TargetSchedule.cpp:117
isSendMsgTraceDataOrGDS
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
Definition: GCNHazardRecognizer.cpp:137
isPermlane
static bool isPermlane(const MachineInstr &MI)
Definition: GCNHazardRecognizer.cpp:163
llvm::DenseSet
Implements a dense probed hash-table based set.
Definition: DenseSet.h:268
llvm::cl::opt
Definition: CommandLine.h:1412
llvm::AMDGPU::getRegBitWidth
unsigned getRegBitWidth(unsigned RCID)
Get the size in bits of a register from the register class RC.
Definition: AMDGPUBaseInfo.cpp:2124
llvm::RISCVFenceField::O
@ O
Definition: RISCVBaseInfo.h:264
llvm::cl::Option
Definition: CommandLine.h:250
llvm::SIInstrInfo::isDPP
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
llvm::ScheduleHazardRecognizer::HazardType
HazardType
Definition: ScheduleHazardRecognizer.h:37
llvm::SISrcMods::NEG_HI
@ NEG_HI
Definition: SIDefines.h:222
AMDGPUMCTargetDesc.h
llvm::TargetRegisterInfo::regsOverlap
bool regsOverlap(Register RegA, Register RegB) const
Returns true if the two registers are equal or alias each other.
Definition: TargetRegisterInfo.h:422
llvm::GCNHazardRecognizer::PreEmitNoopsCommon
unsigned PreEmitNoopsCommon(MachineInstr *)
Definition: GCNHazardRecognizer.cpp:326
llvm::RegState::Undef
@ Undef
Value of the register doesn't matter.
Definition: MachineInstrBuilder.h:52
llvm::AMDGPU::Hwreg::Offset
Offset
Definition: SIDefines.h:416
llvm::MachineBasicBlock::instr_rend
reverse_instr_iterator instr_rend()
Definition: MachineBasicBlock.h:295
llvm::MachineOperand::isReg
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Definition: MachineOperand.h:320
llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:66
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
UseReg
static Register UseReg(const MachineOperand &MO)
Definition: HexagonCopyToCombine.cpp:252
llvm::GCNSubtarget::hasVMEMtoScalarWriteHazard
bool hasVMEMtoScalarWriteHazard() const
Definition: GCNSubtarget.h:996
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:53
llvm::SIInstrInfo::isDS
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:491
GetNumWaitStatesFn
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
Definition: GCNHazardRecognizer.cpp:448
llvm::ScheduleHazardRecognizer::EmitNoops
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
Definition: ScheduleHazardRecognizer.h:120
llvm::SIInstrInfo::isSMRD
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:481
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::SUnit::getInstr
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:447
isLdsDma
static bool isLdsDma(const MachineInstr &MI)
Definition: GCNHazardRecognizer.cpp:169
llvm::AMDGPU::SDWA::DWORD
@ DWORD
Definition: SIDefines.h:766
addRegsToSet
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &Set)
Definition: GCNHazardRecognizer.cpp:595
llvm::GCNHazardRecognizer::PreEmitNoops
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
Definition: GCNHazardRecognizer.cpp:317
llvm::SIInstrInfo::isMFMA
static bool isMFMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:668
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
HazardExpired
@ HazardExpired
Definition: GCNHazardRecognizer.cpp:445
llvm::SIInstrInfo::isFPAtomic
static bool isFPAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:771
llvm::GCNSubtarget::hasReadM0MovRelInterpHazard
bool hasReadM0MovRelInterpHazard() const
Definition: GCNSubtarget.h:975
llvm::MCInstrDesc::OpInfo
const MCOperandInfo * OpInfo
Definition: MCInstrDesc.h:208
llvm::GCNSubtarget::hasRFEHazards
bool hasRFEHazards() const
Definition: GCNSubtarget.h:450
HazardFnResult
HazardFnResult
Definition: GCNHazardRecognizer.cpp:445
llvm::GCNSubtarget::hasGFX90AInsts
bool hasGFX90AInsts() const
Definition: GCNSubtarget.h:1050
llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Definition: MachineInstrBuilder.h:97
llvm::AMDGPU::getMIMGInfo
const LLVM_READONLY MIMGInfo * getMIMGInfo(unsigned Opc)
llvm::GCNSubtarget::hasNoDataDepHazard
bool hasNoDataDepHazard() const
Definition: GCNSubtarget.h:778
llvm::GCNHazardRecognizer::EmitInstruction
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
Definition: GCNHazardRecognizer.cpp:71
llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition: MachineOperand.h:359
llvm::MachineBasicBlock::predecessors
iterator_range< pred_iterator > predecessors()
Definition: MachineBasicBlock.h:386
IsExpiredFn
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
Definition: GCNHazardRecognizer.cpp:447
llvm::GCNHazardRecognizer::EmitNoop
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
Definition: GCNHazardRecognizer.cpp:395
breaksVMEMSoftClause
static bool breaksVMEMSoftClause(MachineInstr *MI)
Definition: GCNHazardRecognizer.cpp:614
llvm::GCNSubtarget::hasLdsBranchVmemWARHazard
bool hasLdsBranchVmemWARHazard() const
Definition: GCNSubtarget.h:1016
llvm::SIMachineFunctionInfo::getOccupancy
unsigned getOccupancy() const
Definition: SIMachineFunctionInfo.h:927
llvm::SIInstrInfo::isSALU
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:353
llvm::MachineBasicBlock::instr_end
instr_iterator instr_end()
Definition: MachineBasicBlock.h:291
llvm::MachineFunction
Definition: MachineFunction.h:257
llvm::SetVector< T, SmallVector< T, N >, SmallDenseSet< T, N > >::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
llvm::GCNSubtarget::hasFPAtomicToDenormModeHazard
bool hasFPAtomicToDenormModeHazard() const
Definition: GCNSubtarget.h:1052
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
insertNoopsInBundle
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
Definition: GCNHazardRecognizer.cpp:259
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::SIInstrInfo::isSDWA
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:449
llvm::MachineOperand::setIsUndef
void setIsUndef(bool Val=true)
Definition: MachineOperand.h:520
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:82
llvm::GCNSubtarget::hasVALUTransUseHazard
bool hasVALUTransUseHazard() const
Definition: GCNSubtarget.h:1064
isSSetReg
static bool isSSetReg(unsigned Opcode)
Definition: GCNHazardRecognizer.cpp:87
llvm::SIInstrInfo::isMUBUF
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:465
llvm::MachineInstr::getParent
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:313
llvm::SIInstrInfo::isDOT
static bool isDOT(const MachineInstr &MI)
Definition: SIInstrInfo.h:673
llvm::GCNHazardRecognizer::Reset
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be scheduled.
Definition: GCNHazardRecognizer.cpp:67
isDGEMM
static bool isDGEMM(unsigned Opcode)
Definition: GCNHazardRecognizer.cpp:118
llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
llvm::ScheduleHazardRecognizer::NoopHazard
@ NoopHazard
Definition: ScheduleHazardRecognizer.h:40
llvm::GCNSubtarget::hasNSAtoVMEMBug
bool hasNSAtoVMEMBug() const
Definition: GCNSubtarget.h:1042
llvm::GCNHazardRecognizer::RecedeCycle
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot issue in the current cycle, either because of latency or resource conflicts.
Definition: GCNHazardRecognizer.cpp:437
llvm::ScheduleHazardRecognizer::NoHazard
@ NoHazard
Definition: ScheduleHazardRecognizer.h:38
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:348
llvm::SIInstrInfo::isSegmentSpecificFLAT
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:523
llvm::ilist_iterator
Iterator for intrusive lists based on ilist_node.
Definition: ilist_iterator.h:57
llvm::MachineInstr::mayStore
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
Definition: MachineInstr.h:1069
llvm::SIInstrInfo::isVMEM
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:369
llvm::RegState::Kill
@ Kill
The last use of a register.
Definition: MachineInstrBuilder.h:48
llvm::SIInstrInfo::isMTBUF
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:473
llvm::SmallSet::insert
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:178
llvm::MachineInstr::isBundle
bool isBundle() const
Definition: MachineInstr.h:1332
llvm::SIInstrInfo
Definition: SIInstrInfo.h:44
GCNHazardRecognizer.h
llvm::GCNHazardRecognizer::AdvanceCycle
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot issue in the current cycle, either because of latency or resource conflicts.
Definition: GCNHazardRecognizer.cpp:399
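A hedged sketch of how a scheduler might use the direction-sensitive cycle callbacks; the SchedulingTopDown flag is an assumption for illustration.
  // Sketch only: stall handling depends on scheduling direction.
  if (SchedulingTopDown)
    HazardRec.AdvanceCycle(); // top-down: move the modeled cycle forward
  else
    HazardRec.RecedeCycle();  // bottom-up: move the modeled cycle backward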
llvm::MCRegUnitIterator
Definition: MCRegisterInfo.h:680
llvm::GCNSubtarget::isXNACKEnabled
bool isXNACKEnabled() const
Definition: GCNSubtarget.h:551
llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition: MachineInstrBuilder.h:357
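A minimal BuildMI sketch, assuming an insertion point I in MBB with DL, TII, DstReg, and SrcReg in scope; the opcode and operands are illustrative only.
  // Sketch only: build a VALU move and mark the source as its last use.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), DstReg)
      .addReg(SrcReg, RegState::Kill);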
isDivFMas
static bool isDivFMas(unsigned Opcode)
Definition: GCNHazardRecognizer.cpp:79
llvm::SUnit::isInstr
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:362
llvm::SIInstrInfo::isVALU
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:361
llvm::MCRegisterInfo::DiffListIterator::isValid
bool isValid() const
isValid - returns true if this iterator is not yet at the end.
Definition: MCRegisterInfo.h:224
llvm::MachineOperand::setReg
void setReg(Register Reg)
Change the register this operand corresponds to.
Definition: MachineOperand.cpp:56
llvm::SIMachineFunctionInfo
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
Definition: SIMachineFunctionInfo.h:325
llvm::GCNHazardRecognizer::ShouldPreferAnother
bool ShouldPreferAnother(SUnit *SU) override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
Definition: GCNHazardRecognizer.cpp:2692
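A hedged sketch of the query order this description implies, using the base-class getHazardType query; the scheduler-side code shown is illustrative.
  // Sketch only: consult ShouldPreferAnother only when no hazard is reported.
  if (HazardRec.getHazardType(SU, 0) == ScheduleHazardRecognizer::NoHazard &&
      HazardRec.ShouldPreferAnother(SU)) {
    // try a different available SUnit first
  }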
llvm::iterator_range
A range adaptor for a pair of iterators.
Definition: iterator_range.h:30
llvm::GCNHazardRecognizer::IsHazardFn
function_ref< bool(const MachineInstr &)> IsHazardFn
Definition: GCNHazardRecognizer.h:34
llvm::GCNSubtarget::hasReadM0SendMsgHazard
bool hasReadM0SendMsgHazard() const
Definition: GCNSubtarget.h:979
llvm::BitVector::anyCommon
bool anyCommon(const BitVector &RHS) const
Test if any common bits are set.
Definition: BitVector.h:482
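A minimal sketch of the overlap test anyCommon enables, assuming two register-unit sets Defs and Uses (hypothetical names) populated by a helper such as addRegsToSet above.
  // Sketch only: a clause conflict exists iff def and use units intersect.
  llvm::BitVector Defs(TRI.getNumRegUnits());
  llvm::BitVector Uses(TRI.getNumRegUnits());
  // ... populate Defs and Uses from the clause's instructions (omitted) ...
  bool Conflict = Defs.anyCommon(Uses);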
llvm::TargetRegisterInfo::getSubReg
MCRegister getSubReg(MCRegister Reg, unsigned Idx) const
Returns the physical register number of sub-register "Index" for physical register RegNo.
Definition: TargetRegisterInfo.h:1142
llvm::SmallSetVector
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:307
llvm::SUnit
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
llvm::RegState::Dead
@ Dead
Unused definition.
Definition: MachineInstrBuilder.h:50
llvm::GCNSubtarget::hasSMEMtoVectorWriteHazard
bool hasSMEMtoVectorWriteHazard() const
Definition: GCNSubtarget.h:1000
llvm::AMDGPU::Hwreg::ID_TRAPSTS
@ ID_TRAPSTS
Definition: SIDefines.h:388
llvm::GCNSubtarget::hasVALUMaskWriteHazard
bool hasVALUMaskWriteHazard() const
Definition: GCNSubtarget.h:1066
llvm::SISrcMods::NEG
@ NEG
Definition: SIDefines.h:219
llvm::SIInstrFlags::TRANS
@ TRANS
Definition: SIDefines.h:51
llvm::SIInstrInfo::isTRANS
static bool isTRANS(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::SIInstrInfo::isFLAT
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:517
llvm::cl::desc
Definition: CommandLine.h:413
llvm::SIInstrInfo::isLDSDIR
static bool isLDSDIR(const MachineInstr &MI)
Definition: SIInstrInfo.h:689
breaksSMEMSoftClause
static bool breaksSMEMSoftClause(MachineInstr *MI)
Definition: GCNHazardRecognizer.cpp:610
llvm::GCNSubtarget::has12DWordStoreHazard
bool has12DWordStoreHazard() const
Definition: GCNSubtarget.h:966
MachineFunction.h
llvm::GCNSubtarget::hasVcmpxPermlaneHazard
bool hasVcmpxPermlaneHazard() const
Definition: GCNSubtarget.h:992
llvm::MachineInstrBundleIterator< MachineInstr >
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
isSMovRel
static bool isSMovRel(unsigned Opcode)
Definition: GCNHazardRecognizer.cpp:106
llvm::SIInstrInfo::isVINTRP
static bool isVINTRP(const MachineInstr &MI)
Definition: SIInstrInfo.h:652
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
llvm::GCNSubtarget::hasMAIInsts
bool hasMAIInsts() const
Definition: GCNSubtarget.h:730
llvm::MCRegister
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:24
llvm::GCNSubtarget::hasReadM0LdsDirectHazard
bool hasReadM0LdsDirectHazard() const
Definition: GCNSubtarget.h:988