1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
15#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16#include "SIMachineFunctionInfo.h"
17#include "llvm/CodeGen/MachineFunction.h"
18#include "llvm/CodeGen/ScheduleDAG.h"
19#include "llvm/TargetParser/TargetParser.h"
20
21
22using namespace llvm;
23
24namespace {
25
26struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
27 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
28
29 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
30 if (Arg.getAsInteger(0, Value))
31 return O.error("'" + Arg + "' value invalid for uint argument!");
32
33 if (Value > 100)
34 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
35
36 return false;
37 }
38};
39
40} // end anonymous namespace
41
42static cl::opt<unsigned, false, MFMAPaddingRatioParser>
43 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
44 cl::desc("Fill a percentage of the latency between "
45 "neighboring MFMA with s_nops."));
46
47//===----------------------------------------------------------------------===//
48// Hazard Recognizer Implementation
49//===----------------------------------------------------------------------===//
50
51static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
52 const GCNSubtarget &ST);
53
54GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
55 IsHazardRecognizerMode(false),
56 CurrCycleInstr(nullptr),
57 MF(MF),
58 ST(MF.getSubtarget<GCNSubtarget>()),
59 TII(*ST.getInstrInfo()),
60 TRI(TII.getRegisterInfo()),
61 ClauseUses(TRI.getNumRegUnits()),
62 ClauseDefs(TRI.getNumRegUnits()) {
63 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
64 TSchedModel.init(&ST);
65 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
66}
67
68void GCNHazardRecognizer::Reset() {
69 EmittedInstrs.clear();
70}
71
72void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
73 EmitInstruction(SU->getInstr());
74}
75
76void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
77 CurrCycleInstr = MI;
78}
79
80static bool isDivFMas(unsigned Opcode) {
81 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
82}
83
84static bool isSGetReg(unsigned Opcode) {
85 return Opcode == AMDGPU::S_GETREG_B32;
86}
87
88static bool isSSetReg(unsigned Opcode) {
89 switch (Opcode) {
90 case AMDGPU::S_SETREG_B32:
91 case AMDGPU::S_SETREG_B32_mode:
92 case AMDGPU::S_SETREG_IMM32_B32:
93 case AMDGPU::S_SETREG_IMM32_B32_mode:
94 return true;
95 }
96 return false;
97}
98
99static bool isRWLane(unsigned Opcode) {
100 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
101}
102
103static bool isRFE(unsigned Opcode) {
104 return Opcode == AMDGPU::S_RFE_B64;
105}
106
107static bool isSMovRel(unsigned Opcode) {
108 switch (Opcode) {
109 case AMDGPU::S_MOVRELS_B32:
110 case AMDGPU::S_MOVRELS_B64:
111 case AMDGPU::S_MOVRELD_B32:
112 case AMDGPU::S_MOVRELD_B64:
113 return true;
114 default:
115 return false;
116 }
117}
118
119static bool isDGEMM(unsigned Opcode) {
120 return AMDGPU::getMAIIsDGEMM(Opcode);
121}
122
123static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
124 unsigned Opcode = MI.getOpcode();
125
126 if (!SIInstrInfo::isMAI(MI) ||
127 isDGEMM(Opcode) ||
128 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
129 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
130 return false;
131
132 if (!ST.hasGFX940Insts())
133 return true;
134
135 return AMDGPU::getMAIIsGFX940XDL(Opcode);
136}
137
138static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
139 const MachineInstr &MI) {
140 if (TII.isAlwaysGDS(MI.getOpcode()))
141 return true;
142
143 switch (MI.getOpcode()) {
144 case AMDGPU::S_SENDMSG:
145 case AMDGPU::S_SENDMSGHALT:
146 case AMDGPU::S_TTRACEDATA:
147 return true;
148 // These DS opcodes don't support GDS.
149 case AMDGPU::DS_NOP:
150 case AMDGPU::DS_PERMUTE_B32:
151 case AMDGPU::DS_BPERMUTE_B32:
152 return false;
153 default:
154 if (TII.isDS(MI.getOpcode())) {
155 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
156 AMDGPU::OpName::gds);
157 if (MI.getOperand(GDS).getImm())
158 return true;
159 }
160 return false;
161 }
162}
163
164static bool isPermlane(const MachineInstr &MI) {
165 unsigned Opcode = MI.getOpcode();
166 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANE64_B32 ||
168 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
171}
172
173static bool isLdsDma(const MachineInstr &MI) {
174 return SIInstrInfo::isVALU(MI) &&
175 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
176}
177
178static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
179 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
180 AMDGPU::OpName::simm16);
181 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
182}
183
184ScheduleHazardRecognizer::HazardType
185GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
186 MachineInstr *MI = SU->getInstr();
187 // If we are not in "HazardRecognizerMode" and therefore not being run from
188 // the scheduler, track possible stalls from hazards but don't insert noops.
189 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
190
191 if (MI->isBundle())
192 return NoHazard;
193
194 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
195 return HazardType;
196
197 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
198 return HazardType;
199
200 if (checkFPAtomicToDenormModeHazard(MI) > 0)
201 return HazardType;
202
203 if (ST.hasNoDataDepHazard())
204 return NoHazard;
205
206 // FIXME: Should flat be considered vmem?
207 if ((SIInstrInfo::isVMEM(*MI) ||
208 SIInstrInfo::isFLAT(*MI))
209 && checkVMEMHazards(MI) > 0)
210 return HazardType;
211
212 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
213 return HazardType;
214
215 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
216 return HazardType;
217
218 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
219 return HazardType;
220
221 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
222 return HazardType;
223
224 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
225 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
226 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
227 return HazardType;
228
229 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
230 return HazardType;
231
232 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
233 return HazardType;
234
235 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
236 return HazardType;
237
238 if (((ST.hasReadM0MovRelInterpHazard() &&
239 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
240 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
241 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
242 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
243 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
244 (ST.hasReadM0LdsDirectHazard() &&
245 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
246 checkReadM0Hazards(MI) > 0)
247 return HazardType;
248
249 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
250 return HazardType;
251
252 if ((SIInstrInfo::isVMEM(*MI) ||
253 SIInstrInfo::isFLAT(*MI) ||
254 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
255 return HazardType;
256
257 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
258 return HazardType;
259
260 return NoHazard;
261}
262
263static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
264 unsigned Quantity) {
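 // Worked example (hypothetical count): a request for 10 wait states emits
 // s_nop 7 followed by s_nop 1, since a single s_nop encodes at most 8 wait
 // states and s_nop N corresponds to N + 1 of them.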
265 while (Quantity > 0) {
266 unsigned Arg = std::min(Quantity, 8u);
267 Quantity -= Arg;
268 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
269 .addImm(Arg - 1);
270 }
271}
272
273unsigned
274GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
275 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
276 assert(TSchedModel.getWriteProcResBegin(SC) !=
277 TSchedModel.getWriteProcResEnd(SC));
278 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
279}
280
281void GCNHazardRecognizer::processBundle() {
282 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
283 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
284 // Check bundled MachineInstr's for hazards.
285 for (; MI != E && MI->isInsideBundle(); ++MI) {
286 CurrCycleInstr = &*MI;
287 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
288
289 if (IsHazardRecognizerMode) {
290 fixHazards(CurrCycleInstr);
291
292 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
293 }
294
295 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
296 // include the bundled MI directly after, only add a maximum of
297 // (MaxLookAhead - 1) noops to EmittedInstrs.
298 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
299 EmittedInstrs.push_front(nullptr);
300
301 EmittedInstrs.push_front(CurrCycleInstr);
302 EmittedInstrs.resize(MaxLookAhead);
303 }
304 CurrCycleInstr = nullptr;
305}
306
307void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
308 assert(IsHazardRecognizerMode);
309
310 unsigned NumPreNoops = PreEmitNoops(MI);
311 EmitNoops(NumPreNoops);
312 if (MI->isInsideBundle())
313 insertNoopsInBundle(MI, TII, NumPreNoops);
314 else
315 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
316 NumPreNoops);
318 AdvanceCycle();
319}
320
321unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
322 IsHazardRecognizerMode = true;
323 CurrCycleInstr = MI;
324 unsigned W = PreEmitNoopsCommon(MI);
325 fixHazards(MI);
326 CurrCycleInstr = nullptr;
327 return W;
328}
329
330unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
331 if (MI->isBundle())
332 return 0;
333
334 int WaitStates = 0;
335
336 if (SIInstrInfo::isSMRD(*MI))
337 return std::max(WaitStates, checkSMRDHazards(MI));
338
339 if (ST.hasNSAtoVMEMBug())
340 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
341
342 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
343
344 if (ST.hasNoDataDepHazard())
345 return WaitStates;
346
347 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
348 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
349
350 if (SIInstrInfo::isVALU(*MI))
351 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
352
353 if (SIInstrInfo::isDPP(*MI))
354 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
355
356 if (isDivFMas(MI->getOpcode()))
357 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
358
359 if (isRWLane(MI->getOpcode()))
360 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
361
362 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
363 SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
364 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
365 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
366
367 if (MI->isInlineAsm())
368 return std::max(WaitStates, checkInlineAsmHazards(MI));
369
370 if (isSGetReg(MI->getOpcode()))
371 return std::max(WaitStates, checkGetRegHazards(MI));
372
373 if (isSSetReg(MI->getOpcode()))
374 return std::max(WaitStates, checkSetRegHazards(MI));
375
376 if (isRFE(MI->getOpcode()))
377 return std::max(WaitStates, checkRFEHazards(MI));
378
379 if ((ST.hasReadM0MovRelInterpHazard() &&
380 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
381 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
382 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
383 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
384 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
385 (ST.hasReadM0LdsDirectHazard() &&
386 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
387 return std::max(WaitStates, checkReadM0Hazards(MI));
388
389 if (SIInstrInfo::isMAI(*MI))
390 return std::max(WaitStates, checkMAIHazards(MI));
391
392 if (SIInstrInfo::isVMEM(*MI) ||
393 SIInstrInfo::isFLAT(*MI) ||
394 SIInstrInfo::isDS(*MI))
395 return std::max(WaitStates, checkMAILdStHazards(MI));
396
397 return WaitStates;
398}
399
400void GCNHazardRecognizer::EmitNoop() {
401 EmittedInstrs.push_front(nullptr);
402}
403
404void GCNHazardRecognizer::AdvanceCycle() {
405 // When the scheduler detects a stall, it will call AdvanceCycle() without
406 // emitting any instructions.
407 if (!CurrCycleInstr) {
408 EmittedInstrs.push_front(nullptr);
409 return;
410 }
411
412 if (CurrCycleInstr->isBundle()) {
413 processBundle();
414 return;
415 }
416
417 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
418 if (!NumWaitStates) {
419 CurrCycleInstr = nullptr;
420 return;
421 }
422
423 // Keep track of emitted instructions
424 EmittedInstrs.push_front(CurrCycleInstr);
425
426 // Add a nullptr for each additional wait state after the first. Make sure
427 // not to add more than getMaxLookAhead() items to the list, since we
428 // truncate the list to that size right after this loop.
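 // For example, assuming TII.getNumWaitStates() reports 3 for an S_NOP 2, the
 // instruction itself is pushed once and two nullptr entries follow it.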
429 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
430 i < e; ++i) {
431 EmittedInstrs.push_front(nullptr);
432 }
433
434 // getMaxLookahead() is the largest number of wait states we will ever need
435 // to insert, so there is no point in keeping track of more than that many
436 // wait states.
437 EmittedInstrs.resize(getMaxLookAhead());
438
439 CurrCycleInstr = nullptr;
440}
441
442void GCNHazardRecognizer::RecedeCycle() {
443 llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
444}
445
446//===----------------------------------------------------------------------===//
447// Helper Functions
448//===----------------------------------------------------------------------===//
449
450using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
451
452using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
453using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
454
455// Search for a hazard in a block and its predecessors.
456template <typename StateT>
457static bool
458hasHazard(StateT State,
459 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
460 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
461 const MachineBasicBlock *MBB,
462 MachineBasicBlock::const_reverse_instr_iterator I,
463 DenseSet<const MachineBasicBlock *> &Visited) {
464 for (auto E = MBB->instr_rend(); I != E; ++I) {
465 // No need to look at parent BUNDLE instructions.
466 if (I->isBundle())
467 continue;
468
469 switch (IsHazard(State, *I)) {
470 case HazardFound:
471 return true;
472 case HazardExpired:
473 return false;
474 default:
475 // Continue search
476 break;
477 }
478
479 if (I->isInlineAsm() || I->isMetaInstruction())
480 continue;
481
482 UpdateState(State, *I);
483 }
484
485 for (MachineBasicBlock *Pred : MBB->predecessors()) {
486 if (!Visited.insert(Pred).second)
487 continue;
488
489 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
490 Visited))
491 return true;
492 }
493
494 return false;
495}
496
497 // Returns the minimum number of wait states since \p I, walking all predecessors.
498// Only scans until \p IsExpired does not return true.
499// Can only be run in a hazard recognizer mode.
500static int getWaitStatesSince(
501 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
502 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
503 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
504 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
505 for (auto E = MBB->instr_rend(); I != E; ++I) {
506 // Don't add WaitStates for parent BUNDLE instructions.
507 if (I->isBundle())
508 continue;
509
510 if (IsHazard(*I))
511 return WaitStates;
512
513 if (I->isInlineAsm())
514 continue;
515
516 WaitStates += GetNumWaitStates(*I);
517
518 if (IsExpired(*I, WaitStates))
519 return std::numeric_limits<int>::max();
520 }
521
522 int MinWaitStates = std::numeric_limits<int>::max();
523 for (MachineBasicBlock *Pred : MBB->predecessors()) {
524 if (!Visited.insert(Pred).second)
525 continue;
526
527 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
528 IsExpired, Visited, GetNumWaitStates);
529
530 MinWaitStates = std::min(MinWaitStates, W);
531 }
532
533 return MinWaitStates;
534}
535
536static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
537 const MachineInstr *MI, IsExpiredFn IsExpired) {
538 DenseSet<const MachineBasicBlock *> Visited;
539 return getWaitStatesSince(IsHazard, MI->getParent(),
540 std::next(MI->getReverseIterator()),
541 0, IsExpired, Visited);
542}
543
544int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
545 if (IsHazardRecognizerMode) {
546 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
547 return WaitStates >= Limit;
548 };
549 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
550 }
551
552 int WaitStates = 0;
553 for (MachineInstr *MI : EmittedInstrs) {
554 if (MI) {
555 if (IsHazard(*MI))
556 return WaitStates;
557
558 if (MI->isInlineAsm())
559 continue;
560 }
561 ++WaitStates;
562
563 if (WaitStates >= Limit)
564 break;
565 }
566 return std::numeric_limits<int>::max();
567}
568
569int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
570 IsHazardFn IsHazardDef,
571 int Limit) {
572 const SIRegisterInfo *TRI = ST.getRegisterInfo();
573
574 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
575 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
576 };
577
578 return getWaitStatesSince(IsHazardFn, Limit);
579}
580
581int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
582 int Limit) {
583 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
584 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
585 };
586
587 return getWaitStatesSince(IsHazardFn, Limit);
588}
589
590//===----------------------------------------------------------------------===//
591// No-op Hazard Detection
592//===----------------------------------------------------------------------===//
593
594static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
595 MCRegister Reg) {
596 for (MCRegUnit Unit : TRI.regunits(Reg))
597 BV.set(Unit);
598}
599
600static void addRegsToSet(const SIRegisterInfo &TRI,
601 iterator_range<MachineInstr::const_mop_iterator> Ops,
602 BitVector &DefSet, BitVector &UseSet) {
603 for (const MachineOperand &Op : Ops) {
604 if (Op.isReg())
605 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
606 }
607}
608
609void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
610 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
611}
612
613static bool breaksSMEMSoftClause(MachineInstr *MI) {
614 return !SIInstrInfo::isSMRD(*MI);
615}
616
617static bool breaksVMEMSoftClause(MachineInstr *MI) {
618 return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
619}
620
621int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
622 // SMEM soft clauses are only present on VI+, and only matter if xnack is
623 // enabled.
624 if (!ST.isXNACKEnabled())
625 return 0;
626
627 bool IsSMRD = TII.isSMRD(*MEM);
628
629 resetClause();
630
631 // A soft-clause is any group of consecutive SMEM instructions. The
632 // instructions in this group may return out of order and/or may be
633 // replayed (i.e. the same instruction issued more than once).
634 //
635 // In order to handle these situations correctly we need to make sure that
636 // when a clause has more than one instruction, no instruction in the clause
637 // writes to a register that is read by another instruction in the clause
638 // (including itself). If we encounter this situation, we need to break the
639 // clause by inserting a non SMEM instruction.
640
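 // A minimal illustration (hypothetical registers):
 //   s_load_dword s0, s[0:1], 0x0  ; defines s0, overlapping the s[0:1] use
 //   s_load_dword s2, s[0:1], 0x4
 // The def of the first load intersects a use inside the clause, so a non-SMEM
 // instruction has to be inserted to break the clause.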
641 for (MachineInstr *MI : EmittedInstrs) {
642 // When we hit a non-SMEM instruction then we have passed the start of the
643 // clause and we can stop.
644 if (!MI)
645 break;
646
647 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
648 break;
649
650 addClauseInst(*MI);
651 }
652
653 if (ClauseDefs.none())
654 return 0;
655
656 // We need to make sure not to put loads and stores in the same clause if they
657 // use the same address. For now, just start a new clause whenever we see a
658 // store.
659 if (MEM->mayStore())
660 return 1;
661
662 addClauseInst(*MEM);
663
664 // If the set of defs and uses intersect then we cannot add this instruction
665 // to the clause, so we have a hazard.
666 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
667}
668
669int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
670 int WaitStatesNeeded = 0;
671
672 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
673
674 // This SMRD hazard only affects SI.
675 if (!ST.hasSMRDReadVALUDefHazard())
676 return WaitStatesNeeded;
677
678 // A read of an SGPR by SMRD instruction requires 4 wait states when the
679 // SGPR was written by a VALU instruction.
680 int SmrdSgprWaitStates = 4;
681 auto IsHazardDefFn = [this](const MachineInstr &MI) {
682 return TII.isVALU(MI);
683 };
684 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
685 return TII.isSALU(MI);
686 };
687
688 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
689
690 for (const MachineOperand &Use : SMRD->uses()) {
691 if (!Use.isReg())
692 continue;
693 int WaitStatesNeededForUse =
694 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
695 SmrdSgprWaitStates);
696 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
697
698 // This fixes what appears to be undocumented hardware behavior in SI where
699 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
700 // needs some number of nops in between. We don't know how many we need, but
701 // let's use 4. This wasn't discovered before probably because the only
702 // case when this happens is when we expand a 64-bit pointer into a full
703 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
704 // probably never encountered in the closed-source land.
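 // A sketch of the problematic pattern (hypothetical registers):
 //   s_mov_b32 s7, 0x27000             ; rewrites part of the descriptor
 //   s_buffer_load_dword s0, s[4:7], 0x0
 // On SI the load needs s_nops after the s_mov before it can issue safely.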
705 if (IsBufferSMRD) {
706 int WaitStatesNeededForUse =
707 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
708 IsBufferHazardDefFn,
709 SmrdSgprWaitStates);
710 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
711 }
712 }
713
714 return WaitStatesNeeded;
715}
716
717int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
718 if (!ST.hasVMEMReadSGPRVALUDefHazard())
719 return 0;
720
721 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
722
723 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
724 // SGPR was written by a VALU Instruction.
725 const int VmemSgprWaitStates = 5;
726 auto IsHazardDefFn = [this](const MachineInstr &MI) {
727 return TII.isVALU(MI);
728 };
729 for (const MachineOperand &Use : VMEM->uses()) {
730 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
731 continue;
732
733 int WaitStatesNeededForUse =
734 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
735 VmemSgprWaitStates);
736 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
737 }
738 return WaitStatesNeeded;
739}
740
741int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
742 const SIRegisterInfo *TRI = ST.getRegisterInfo();
743 const SIInstrInfo *TII = ST.getInstrInfo();
744
745 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
746 int DppVgprWaitStates = 2;
747 int DppExecWaitStates = 5;
748 int WaitStatesNeeded = 0;
749 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
750 return TII->isVALU(MI);
751 };
752
753 for (const MachineOperand &Use : DPP->uses()) {
754 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
755 continue;
756 int WaitStatesNeededForUse =
757 DppVgprWaitStates - getWaitStatesSinceDef(
758 Use.getReg(),
759 [](const MachineInstr &) { return true; },
760 DppVgprWaitStates);
761 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
762 }
763
764 WaitStatesNeeded = std::max(
765 WaitStatesNeeded,
766 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
767 DppExecWaitStates));
768
769 return WaitStatesNeeded;
770}
771
772int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
773 const SIInstrInfo *TII = ST.getInstrInfo();
774
775 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
776 // instruction.
777 const int DivFMasWaitStates = 4;
778 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
779 return TII->isVALU(MI);
780 };
781 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
782 DivFMasWaitStates);
783
784 return DivFMasWaitStates - WaitStatesNeeded;
785}
786
787int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
788 const SIInstrInfo *TII = ST.getInstrInfo();
789 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
790
791 const int GetRegWaitStates = 2;
792 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
793 return GetRegHWReg == getHWReg(TII, MI);
794 };
795 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
796
797 return GetRegWaitStates - WaitStatesNeeded;
798}
799
800int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
801 const SIInstrInfo *TII = ST.getInstrInfo();
802 unsigned HWReg = getHWReg(TII, *SetRegInstr);
803
804 const int SetRegWaitStates = ST.getSetRegWaitStates();
805 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
806 return HWReg == getHWReg(TII, MI);
807 };
808 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
809 return SetRegWaitStates - WaitStatesNeeded;
810}
811
812int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
813 if (!MI.mayStore())
814 return -1;
815
816 const SIInstrInfo *TII = ST.getInstrInfo();
817 unsigned Opcode = MI.getOpcode();
818 const MCInstrDesc &Desc = MI.getDesc();
819
820 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
821 int VDataRCID = -1;
822 if (VDataIdx != -1)
823 VDataRCID = Desc.operands()[VDataIdx].RegClass;
824
825 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
826 // There is no hazard if the instruction does not use vector regs
827 // (like wbinvl1)
828 if (VDataIdx == -1)
829 return -1;
830 // For MUBUF/MTBUF instructions this hazard only exists if the
831 // instruction is not using a register in the soffset field.
832 const MachineOperand *SOffset =
833 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
834 // If we have no soffset operand, then assume this field has been
835 // hardcoded to zero.
836 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
837 (!SOffset || !SOffset->isReg()))
838 return VDataIdx;
839 }
840
841 // MIMG instructions create a hazard if they don't use a 256-bit T# and
842 // the store size is greater than 8 bytes and they have more than two bits
843 // of their dmask set.
844 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
845 if (TII->isMIMG(MI)) {
846 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
847 assert(SRsrcIdx != -1 &&
848 AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
849 (void)SRsrcIdx;
850 }
851
852 if (TII->isFLAT(MI)) {
853 int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
854 if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
855 return DataIdx;
856 }
857
858 return -1;
859}
860
861int
862GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
863 const MachineRegisterInfo &MRI) {
864 // Helper to check for the hazard where VMEM instructions that store more than
865 // 8 bytes can have their store data overwritten by the next instruction.
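 // For instance (hypothetical sketch), buffer_store_dwordx4 v[0:3], ... writes
 // 16 bytes of store data; a VALU on the next cycle that redefines v3 can
 // corrupt the data still being read out, hence the wait state below.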
866 const SIRegisterInfo *TRI = ST.getRegisterInfo();
867
868 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
869 int WaitStatesNeeded = 0;
870
871 if (!TRI->isVectorRegister(MRI, Def.getReg()))
872 return WaitStatesNeeded;
873 Register Reg = Def.getReg();
874 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
875 int DataIdx = createsVALUHazard(MI);
876 return DataIdx >= 0 &&
877 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
878 };
879 int WaitStatesNeededForDef =
880 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
881 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
882
883 return WaitStatesNeeded;
884}
885
886int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
887 int WaitStatesNeeded = 0;
888
889 if (ST.hasTransForwardingHazard()) {
890 const int TransDefWaitstates = 1;
891
892 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
893 if (!SIInstrInfo::isTRANS(MI))
894 return false;
895 const SIRegisterInfo *TRI = ST.getRegisterInfo();
896 const SIInstrInfo *TII = ST.getInstrInfo();
897 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
898
899 for (const MachineOperand &Use : VALU->explicit_uses()) {
900 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
901 return true;
902 }
903
904 return false;
905 };
906
907 int WaitStatesNeededForDef =
908 TransDefWaitstates -
909 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
910 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
911 }
912
913 if (ST.hasDstSelForwardingHazard()) {
914 const int Shift16DefWaitstates = 1;
915
916 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
918 return false;
919 const SIInstrInfo *TII = ST.getInstrInfo();
920 if (SIInstrInfo::isSDWA(MI)) {
921 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
922 if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
923 return false;
924 } else {
925 if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
926 !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
927 ->getImm() &
928 SISrcMods::DST_OP_SEL))
929 return false;
930 }
931 const SIRegisterInfo *TRI = ST.getRegisterInfo();
932 if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
933 Register Def = Dst->getReg();
934
935 for (const MachineOperand &Use : VALU->explicit_uses()) {
936 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
937 return true;
938 }
939 }
940
941 return false;
942 };
943
944 int WaitStatesNeededForDef =
945 Shift16DefWaitstates -
946 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
947 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
948 }
949
950 if (ST.hasVDecCoExecHazard()) {
951 const int VALUWriteSGPRVALUReadWaitstates = 2;
952 const int VALUWriteEXECRWLane = 4;
953 const int VALUWriteVGPRReadlaneRead = 1;
954
955 const SIRegisterInfo *TRI = ST.getRegisterInfo();
956 const MachineRegisterInfo &MRI = MF.getRegInfo();
957 Register UseReg;
958 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
959 if (!SIInstrInfo::isVALU(MI))
960 return false;
961 return MI.modifiesRegister(UseReg, TRI);
962 };
963
964 for (const MachineOperand &Use : VALU->explicit_uses()) {
965 if (!Use.isReg())
966 continue;
967
968 UseReg = Use.getReg();
969 if (TRI->isSGPRReg(MRI, UseReg)) {
970 int WaitStatesNeededForDef =
971 VALUWriteSGPRVALUReadWaitstates -
972 getWaitStatesSince(IsVALUDefSGPRFn,
973 VALUWriteSGPRVALUReadWaitstates);
974 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
975 }
976 }
977
978 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
979 UseReg = AMDGPU::VCC;
980 int WaitStatesNeededForDef =
981 VALUWriteSGPRVALUReadWaitstates -
982 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
983 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
984 }
985
986 switch (VALU->getOpcode()) {
987 case AMDGPU::V_READLANE_B32:
988 case AMDGPU::V_READFIRSTLANE_B32: {
989 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
990 UseReg = Src->getReg();
991 int WaitStatesNeededForDef =
992 VALUWriteVGPRReadlaneRead -
993 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
994 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
995 }
996 [[fallthrough]];
997 case AMDGPU::V_WRITELANE_B32: {
998 UseReg = AMDGPU::EXEC;
999 int WaitStatesNeededForDef =
1000 VALUWriteEXECRWLane -
1001 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1002 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1003 break;
1004 }
1005 default:
1006 break;
1007 }
1008 }
1009
1010 // This checks for the hazard where VMEM instructions that store more than
1011 // 8 bytes can have their store data overwritten by the next instruction.
1012 if (!ST.has12DWordStoreHazard())
1013 return WaitStatesNeeded;
1014
1015 const MachineRegisterInfo &MRI = MF.getRegInfo();
1016
1017 for (const MachineOperand &Def : VALU->defs()) {
1018 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1019 }
1020
1021 return WaitStatesNeeded;
1022}
1023
1024int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1025 // This checks for hazards associated with inline asm statements.
1026 // Since inline asms can contain just about anything, we use this
1027 // to call/leverage other check*Hazard routines. Note that
1028 // this function doesn't attempt to address all possible inline asm
1029 // hazards (good luck), but is a collection of what has been
1030 // problematic thus far.
1031
1032 // see checkVALUHazards()
1033 if (!ST.has12DWordStoreHazard())
1034 return 0;
1035
1036 const MachineRegisterInfo &MRI = MF.getRegInfo();
1037 int WaitStatesNeeded = 0;
1038
1039 for (const MachineOperand &Op :
1041 if (Op.isReg() && Op.isDef()) {
1042 WaitStatesNeeded =
1043 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1044 }
1045 }
1046
1047 return WaitStatesNeeded;
1048}
1049
1050int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1051 const SIInstrInfo *TII = ST.getInstrInfo();
1052 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1053 const MachineRegisterInfo &MRI = MF.getRegInfo();
1054
1055 const MachineOperand *LaneSelectOp =
1056 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1057
1058 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1059 return 0;
1060
1061 Register LaneSelectReg = LaneSelectOp->getReg();
1062 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1063
1064 const int RWLaneWaitStates = 4;
1065 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1066 RWLaneWaitStates);
1067 return RWLaneWaitStates - WaitStatesSince;
1068}
1069
1070int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1071 if (!ST.hasRFEHazards())
1072 return 0;
1073
1074 const SIInstrInfo *TII = ST.getInstrInfo();
1075
1076 const int RFEWaitStates = 1;
1077
1078 auto IsHazardFn = [TII](const MachineInstr &MI) {
1079 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1080 };
1081 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1082 return RFEWaitStates - WaitStatesNeeded;
1083}
1084
1085int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1086 const SIInstrInfo *TII = ST.getInstrInfo();
1087 const int ReadM0WaitStates = 1;
1088 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1089 return ReadM0WaitStates -
1090 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1091}
1092
1093void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1094 fixVMEMtoScalarWriteHazards(MI);
1095 fixVcmpxPermlaneHazards(MI);
1096 fixSMEMtoVectorWriteHazards(MI);
1097 fixVcmpxExecWARHazard(MI);
1098 fixLdsBranchVmemWARHazard(MI);
1099 if (ST.hasLdsDirect()) {
1100 fixLdsDirectVALUHazard(MI);
1101 fixLdsDirectVMEMHazard(MI);
1102 }
1103 fixVALUPartialForwardingHazard(MI);
1104 fixVALUTransUseHazard(MI);
1105 fixWMMAHazards(MI);
1106 fixShift64HighRegBug(MI);
1107 fixVALUMaskWriteHazard(MI);
1108 fixRequiredExportPriority(MI);
1109}
1110
1111bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1112 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1113 return false;
1114
1115 const SIInstrInfo *TII = ST.getInstrInfo();
1116 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1117 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1118 return (TII->isVOPC(MI) ||
1119 ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1120 MI.modifiesRegister(AMDGPU::EXEC, TRI);
1121 };
1122
1123 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1124 unsigned Opc = MI.getOpcode();
1125 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1126 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1127 };
1128
1129 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1130 std::numeric_limits<int>::max())
1131 return false;
1132
1133 // V_NOP will be discarded by SQ.
1134 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1135 // which is always a VGPR and available.
1136 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1137 Register Reg = Src0->getReg();
1138 bool IsUndef = Src0->isUndef();
1139 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1140 TII->get(AMDGPU::V_MOV_B32_e32))
1141 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1142 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1143
1144 return true;
1145}
1146
1147bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1148 if (!ST.hasVMEMtoScalarWriteHazard())
1149 return false;
1151
1152 if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1153 return false;
1154
1155 if (MI->getNumDefs() == 0)
1156 return false;
1157
1158 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1159
1160 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1163 return false;
1164
1165 for (const MachineOperand &Def : MI->defs()) {
1166 const MachineOperand *Op =
1167 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1168 if (!Op)
1169 continue;
1170 return true;
1171 }
1172 return false;
1173 };
1174
1175 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1176 return SIInstrInfo::isVALU(MI) ||
1177 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1178 !MI.getOperand(0).getImm()) ||
1179 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1180 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1181 };
1182
1183 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1184 std::numeric_limits<int>::max())
1185 return false;
1186
1187 const SIInstrInfo *TII = ST.getInstrInfo();
1188 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1189 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1190 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1191 return true;
1192}
1193
1194bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1195 if (!ST.hasSMEMtoVectorWriteHazard())
1196 return false;
1198
1199 if (!SIInstrInfo::isVALU(*MI))
1200 return false;
1201
1202 unsigned SDSTName;
1203 switch (MI->getOpcode()) {
1204 case AMDGPU::V_READLANE_B32:
1205 case AMDGPU::V_READFIRSTLANE_B32:
1206 SDSTName = AMDGPU::OpName::vdst;
1207 break;
1208 default:
1209 SDSTName = AMDGPU::OpName::sdst;
1210 break;
1211 }
1212
1213 const SIInstrInfo *TII = ST.getInstrInfo();
1214 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1215 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1216 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1217 if (!SDST) {
1218 for (const auto &MO : MI->implicit_operands()) {
1219 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1220 SDST = &MO;
1221 break;
1222 }
1223 }
1224 }
1225
1226 if (!SDST)
1227 return false;
1228
1229 const Register SDSTReg = SDST->getReg();
1230 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1231 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1232 };
1233
1234 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1235 if (TII->isSALU(MI)) {
1236 switch (MI.getOpcode()) {
1237 case AMDGPU::S_SETVSKIP:
1238 case AMDGPU::S_VERSION:
1239 case AMDGPU::S_WAITCNT_VSCNT:
1240 case AMDGPU::S_WAITCNT_VMCNT:
1241 case AMDGPU::S_WAITCNT_EXPCNT:
1242 // These instructions cannot mitigate the hazard.
1243 return false;
1244 case AMDGPU::S_WAITCNT_LGKMCNT:
1245 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1246 return (MI.getOperand(1).getImm() == 0) &&
1247 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1248 case AMDGPU::S_WAITCNT: {
1249 const int64_t Imm = MI.getOperand(0).getImm();
1250 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1251 // DsCnt corresponds to LGKMCnt here.
1252 return (Decoded.DsCnt == 0);
1253 }
1254 default:
1255 // SOPP instructions cannot mitigate the hazard.
1256 if (TII->isSOPP(MI))
1257 return false;
1258 // At this point the SALU can be assumed to mitigate the hazard
1259 // because either:
1260 // (a) it is independent of the at risk SMEM (breaking chain),
1261 // or
1262 // (b) it is dependent on the SMEM, in which case an appropriate
1263 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1264 // SMEM instruction.
1265 return true;
1266 }
1267 }
1268 return false;
1269 };
1270
1271 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1272 std::numeric_limits<int>::max())
1273 return false;
1274
1275 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1276 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1277 .addImm(0);
1278 return true;
1279}
1280
1281bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1282 if (!ST.hasVcmpxExecWARHazard())
1283 return false;
1285
1286 if (!SIInstrInfo::isVALU(*MI))
1287 return false;
1288
1289 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1290 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1291 return false;
1292
1293 auto IsHazardFn = [TRI](const MachineInstr &I) {
1295 return false;
1296 return I.readsRegister(AMDGPU::EXEC, TRI);
1297 };
1298
1299 const SIInstrInfo *TII = ST.getInstrInfo();
1300 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1301 if (SIInstrInfo::isVALU(MI)) {
1302 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1303 return true;
1304 for (auto MO : MI.implicit_operands())
1305 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1306 return true;
1307 }
1308 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1309 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1310 return true;
1311 return false;
1312 };
1313
1314 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1315 std::numeric_limits<int>::max())
1316 return false;
1317
1318 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1319 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1320 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1321 return true;
1322}
1323
1324static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1325 const GCNSubtarget &ST) {
1326 if (!ST.hasLdsBranchVmemWARHazard())
1327 return false;
1328
1329 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1330 // instructions need to appear in the same function.
1331 bool HasLds = false;
1332 bool HasVmem = false;
1333 for (auto &MBB : MF) {
1334 for (auto &MI : MBB) {
1335 HasLds |= SIInstrInfo::isDS(MI);
1336 HasVmem |=
1337 SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI);
1338 if (HasLds && HasVmem)
1339 return true;
1340 }
1341 }
1342 return false;
1343}
1344
1345static bool isStoreCountWaitZero(const MachineInstr &I) {
1346 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1347 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1348 !I.getOperand(1).getImm();
1349}
1350
1351bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1352 if (!RunLdsBranchVmemWARHazardFixup)
1353 return false;
1354
1357
1358 auto IsHazardInst = [](const MachineInstr &MI) {
1359 if (SIInstrInfo::isDS(MI))
1360 return 1;
1361 if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI))
1362 return 2;
1363 return 0;
1364 };
1365
1366 auto InstType = IsHazardInst(*MI);
1367 if (!InstType)
1368 return false;
1369
1370 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1371 return IsHazardInst(I) || isStoreCountWaitZero(I);
1372 };
1373
1374 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1375 if (!I.isBranch())
1376 return false;
1377
1378 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1379 auto InstType2 = IsHazardInst(I);
1380 return InstType2 && InstType != InstType2;
1381 };
1382
1383 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1384 auto InstType2 = IsHazardInst(I);
1385 if (InstType == InstType2)
1386 return true;
1387
1388 return isStoreCountWaitZero(I);
1389 };
1390
1391 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1392 std::numeric_limits<int>::max();
1393 };
1394
1395 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1396 std::numeric_limits<int>::max())
1397 return false;
1398
1399 const SIInstrInfo *TII = ST.getInstrInfo();
1400 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1401 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1402 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1403 .addImm(0);
1404
1405 return true;
1406}
1407
1408bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1409 if (!SIInstrInfo::isLDSDIR(*MI))
1410 return false;
1411
1412 const int NoHazardWaitStates = 15;
1413 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1414 const Register VDSTReg = VDST->getReg();
1415
1416 bool VisitedTrans = false;
1417 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1418 if (!SIInstrInfo::isVALU(I))
1419 return false;
1420 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1421 // Cover both WAR and WAW
1422 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1423 };
1424 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1425 if (WaitStates >= NoHazardWaitStates)
1426 return true;
1427 // Instructions which cause va_vdst==0 expire hazard
1430 };
1431 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1432 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1433 };
1434
1435 DenseSet<const MachineBasicBlock *> Visited;
1436 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1437 std::next(MI->getReverseIterator()), 0,
1438 IsExpiredFn, Visited, GetWaitStatesFn);
1439
1440 // Transcendentals can execute in parallel to other VALUs.
1441 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1442 if (VisitedTrans)
1443 Count = 0;
1444
1445 MachineOperand *WaitVdstOp =
1446 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1447 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1448
1449 return true;
1450}
1451
1452bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1453 if (!SIInstrInfo::isLDSDIR(*MI))
1454 return false;
1455
1456 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1457 const Register VDSTReg = VDST->getReg();
1458
1459 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1462 return false;
1463 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1464 };
1465 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1466 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1467 // according to the type of VMEM instruction.
1468 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1470 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1471 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1472 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1473 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1474 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1475 };
1476
1477 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1478 std::numeric_limits<int>::max())
1479 return false;
1480
1481 if (LdsdirCanWait) {
1482 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1483 } else {
1484 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1485 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1487 }
1488
1489 return true;
1490}
1491
1492bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1493 if (!ST.hasVALUPartialForwardingHazard())
1494 return false;
1496
1497 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1498 return false;
1499
1501
1502 for (const MachineOperand &Use : MI->explicit_uses()) {
1503 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1504 SrcVGPRs.insert(Use.getReg());
1505 }
1506
1507 // Only applies with >= 2 unique VGPR sources
1508 if (SrcVGPRs.size() <= 1)
1509 return false;
1510
1511 // Look for the following pattern:
1512 // Va <- VALU [PreExecPos]
1513 // intv1
1514 // Exec <- SALU [ExecPos]
1515 // intv2
1516 // Vb <- VALU [PostExecPos]
1517 // intv3
1518 // MI Va, Vb (WaitState = 0)
1519 //
1520 // Where:
1521 // intv1 + intv2 <= 2 VALUs
1522 // intv3 <= 4 VALUs
1523 //
1524 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
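 // The fix emitted at the end of this function is "s_waitcnt_depctr 0x0fff",
 // an immediate whose va_vdst field is zero (the expiry checks in this file
 // treat it as va_vdst == 0), forcing outstanding VALU writes to complete
 // before MI issues.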
1525
1526 const int Intv1plus2MaxVALUs = 2;
1527 const int Intv3MaxVALUs = 4;
1528 const int IntvMaxVALUs = 6;
1529 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1530
1531 struct StateType {
1533 int ExecPos = std::numeric_limits<int>::max();
1534 int VALUs = 0;
1535 };
1536
1537 StateType State;
1538
1539 // This overloads expiry testing with all the hazard detection
1540 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1541 // Too many VALU states have passed
1542 if (State.VALUs > NoHazardVALUWaitStates)
1543 return HazardExpired;
1544
1545 // Instructions which cause va_vdst==0 expire hazard
1548 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1549 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1550 return HazardExpired;
1551
1552 // Track registers writes
1553 bool Changed = false;
1554 if (SIInstrInfo::isVALU(I)) {
1555 for (Register Src : SrcVGPRs) {
1556 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1557 State.DefPos[Src] = State.VALUs;
1558 Changed = true;
1559 }
1560 }
1561 } else if (SIInstrInfo::isSALU(I)) {
1562 if (State.ExecPos == std::numeric_limits<int>::max()) {
1563 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1564 State.ExecPos = State.VALUs;
1565 Changed = true;
1566 }
1567 }
1568 }
1569
1570 // Early expiration: too many VALUs in intv3
1571 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1572 return HazardExpired;
1573
1574 // Only evaluate state if something changed
1575 if (!Changed)
1576 return NoHazardFound;
1577
1578 // Determine positions of VALUs pre/post exec change
1579 if (State.ExecPos == std::numeric_limits<int>::max())
1580 return NoHazardFound;
1581
1582 int PreExecPos = std::numeric_limits<int>::max();
1583 int PostExecPos = std::numeric_limits<int>::max();
1584
1585 for (auto Entry : State.DefPos) {
1586 int DefVALUs = Entry.second;
1587 if (DefVALUs != std::numeric_limits<int>::max()) {
1588 if (DefVALUs >= State.ExecPos)
1589 PreExecPos = std::min(PreExecPos, DefVALUs);
1590 else
1591 PostExecPos = std::min(PostExecPos, DefVALUs);
1592 }
1593 }
1594
1595 // Need a VALUs post exec change
1596 if (PostExecPos == std::numeric_limits<int>::max())
1597 return NoHazardFound;
1598
1599 // Too many VALUs in intv3?
1600 int Intv3VALUs = PostExecPos;
1601 if (Intv3VALUs > Intv3MaxVALUs)
1602 return HazardExpired;
1603
1604 // Too many VALUs in intv2?
1605 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1606 if (Intv2VALUs > Intv1plus2MaxVALUs)
1607 return HazardExpired;
1608
1609 // Need a VALUs pre exec change
1610 if (PreExecPos == std::numeric_limits<int>::max())
1611 return NoHazardFound;
1612
1613 // Too many VALUs in intv1?
1614 int Intv1VALUs = PreExecPos - State.ExecPos;
1615 if (Intv1VALUs > Intv1plus2MaxVALUs)
1616 return HazardExpired;
1617
1618 // Too many VALUs in intv1 + intv2
1619 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1620 return HazardExpired;
1621
1622 return HazardFound;
1623 };
1624 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1625 if (SIInstrInfo::isVALU(MI))
1626 State.VALUs += 1;
1627 };
1628
1629 DenseSet<const MachineBasicBlock *> Visited;
1630 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1631 std::next(MI->getReverseIterator()), Visited))
1632 return false;
1633
1634 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1635 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1636 .addImm(0x0fff);
1637
1638 return true;
1639}
1640
1641bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1642 if (!ST.hasVALUTransUseHazard())
1643 return false;
1645
1646 if (!SIInstrInfo::isVALU(*MI))
1647 return false;
1648
1649 SmallSet<Register, 4> SrcVGPRs;
1650
1651 for (const MachineOperand &Use : MI->explicit_uses()) {
1652 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1653 SrcVGPRs.insert(Use.getReg());
1654 }
1655
1656 // Look for the following pattern:
1657 // Va <- TRANS VALU
1658 // intv
1659 // MI Va (WaitState = 0)
1660 //
1661 // Where:
1662 // intv <= 5 VALUs / 1 TRANS
1663 //
1664 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1665
1666 const int IntvMaxVALUs = 5;
1667 const int IntvMaxTRANS = 1;
1668
1669 struct StateType {
1670 int VALUs = 0;
1671 int TRANS = 0;
1672 };
1673
1674 StateType State;
1675
1676 // This overloads expiry testing with all the hazard detection
1677 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1678 // Too many VALU states have passed
1679 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1680 return HazardExpired;
1681
1682 // Instructions which cause va_vdst==0 expire hazard
1685 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1686 I.getOperand(0).getImm() == 0x0fff))
1687 return HazardExpired;
1688
1689 // Track registers writes
1690 if (SIInstrInfo::isTRANS(I)) {
1691 for (Register Src : SrcVGPRs) {
1692 if (I.modifiesRegister(Src, &TRI)) {
1693 return HazardFound;
1694 }
1695 }
1696 }
1697
1698 return NoHazardFound;
1699 };
1700 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1701 if (SIInstrInfo::isVALU(MI))
1702 State.VALUs += 1;
1703 if (SIInstrInfo::isTRANS(MI))
1704 State.TRANS += 1;
1705 };
1706
1707 DenseSet<const MachineBasicBlock *> Visited;
1708 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1709 std::next(MI->getReverseIterator()), Visited))
1710 return false;
1711
1712 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1713 // avoided.
1714 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1715 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1716 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1717
1718 return true;
1719}
1720
1721bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1722 if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1723 return false;
1724
1725 const SIInstrInfo *TII = ST.getInstrInfo();
1726 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1727
1728 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1729 if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1730 return false;
1731
1732 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1733 // with the dest(matrix D) of the previous wmma.
1734 const Register CurSrc0Reg =
1735 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1736 const Register CurSrc1Reg =
1737 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1738
1739 const Register PrevDstReg =
1740 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1741
1742 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1743 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1744 return true;
1745 }
1746
1747 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1748 // but Index can't overlap with PrevDstReg.
1749 if (AMDGPU::isGFX12Plus(ST)) {
1750 if (SIInstrInfo::isSWMMAC(*MI)) {
1751 const Register CurIndex =
1752 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1753 if (TRI->regsOverlap(PrevDstReg, CurIndex))
1754 return true;
1755 }
1756 return false;
1757 }
1758
1759 return false;
1760 };
1761
1762 auto IsExpiredFn = [](const MachineInstr &I, int) {
1763 return SIInstrInfo::isVALU(I);
1764 };
1765
1766 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1767 std::numeric_limits<int>::max())
1768 return false;
1769
1770 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1771
1772 return true;
1773}
1774
1775bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1776 if (!ST.hasShift64HighRegBug())
1777 return false;
1779
1780 switch (MI->getOpcode()) {
1781 default:
1782 return false;
1783 case AMDGPU::V_LSHLREV_B64_e64:
1784 case AMDGPU::V_LSHRREV_B64_e64:
1785 case AMDGPU::V_ASHRREV_I64_e64:
1786 break;
1787 }
1788
1789 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1790 if (!Amt->isReg())
1791 return false;
1792
1793 Register AmtReg = Amt->getReg();
1794 const MachineRegisterInfo &MRI = MF.getRegInfo();
1795 // Check if this is a last VGPR in the allocation block.
1796 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1797 return false;
1798
1799 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1800 return false;
1801
1802 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1803 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1804 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1805 bool Overlapped = OverlappedSrc || OverlappedDst;
1806
1807 assert(!OverlappedDst || !OverlappedSrc ||
1808 Src1->getReg() == MI->getOperand(0).getReg());
1810 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1811
1812 Register NewReg;
1813 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1814 : AMDGPU::VGPR_32RegClass) {
1815 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1816 NewReg = Reg;
1817 break;
1818 }
1819 }
1820
1821 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1822 : NewReg;
1823 Register NewAmtLo;
1824
1825 if (Overlapped)
1826 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1827
1828 DebugLoc DL = MI->getDebugLoc();
1829 MachineBasicBlock *MBB = MI->getParent();
1830 // Insert a full wait count because found register might be pending a wait.
1831 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1832 .addImm(0);
1833
1834 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1835 if (Overlapped)
1836 runOnInstruction(
1837 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1838 .addDef(AmtReg - 1)
1839 .addReg(AmtReg - 1, RegState::Undef)
1840 .addReg(NewAmtLo, RegState::Undef));
1841 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1842 .addDef(AmtReg)
1843 .addReg(AmtReg, RegState::Undef)
1844 .addReg(NewAmt, RegState::Undef));
1845
1846 // Instructions emitted after the current instruction will be processed by the
1847 // parent loop of the hazard recognizer in a natural way.
1848 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1849 AmtReg)
1850 .addDef(NewAmt)
1851 .addReg(NewAmt)
1852 .addReg(AmtReg);
1853 if (Overlapped)
1854 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1855 AmtReg - 1)
1856 .addDef(NewAmtLo)
1857 .addReg(NewAmtLo)
1858 .addReg(AmtReg - 1);
1859
1860 // Re-running the hazard recognizer on the modified instruction is not
1861 // necessary: the inserted V_SWAP_B32 already both reads and writes the new
1862 // registers, so hazards related to these registers have already been handled.
1863 Amt->setReg(NewAmt);
1864 Amt->setIsKill(false);
1865 // We do not update liveness, so verifier may see it as undef.
1866 Amt->setIsUndef();
1867 if (OverlappedDst)
1868 MI->getOperand(0).setReg(NewReg);
1869 if (OverlappedSrc) {
1870 Src1->setReg(NewReg);
1871 Src1->setIsKill(false);
1872 Src1->setIsUndef();
1873 }
1874
1875 return true;
1876}
1877
1878int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1879 int NSAtoVMEMWaitStates = 1;
1880
1881 if (!ST.hasNSAtoVMEMBug())
1882 return 0;
1883
1884 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1885 return 0;
1886
1887 const SIInstrInfo *TII = ST.getInstrInfo();
1888 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1889 if (!Offset || (Offset->getImm() & 6) == 0)
1890 return 0;
1891
1892 auto IsHazardFn = [TII](const MachineInstr &I) {
1893 if (!SIInstrInfo::isMIMG(I))
1894 return false;
1895 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1896 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1897 TII->getInstSizeInBytes(I) >= 16;
1898 };
1899
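// The hazard exists when this buffer instruction immediately follows an
// NSA-encoded MIMG instruction (encoded size >= 16 bytes); a single
// intervening wait state is sufficient to break it.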
1900 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1901}
1902
1903int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1904 int FPAtomicToDenormModeWaitStates = 3;
1905
1906 if (!ST.hasFPAtomicToDenormModeHazard())
1907 return 0;
1908 assert(!ST.hasExtendedWaitCounts());
1909
1910 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1911 return 0;
1912
1913 auto IsHazardFn = [](const MachineInstr &I) {
1914 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
1915 return false;
1916 return SIInstrInfo::isFPAtomic(I);
1917 };
1918
1919 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1920 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1921 return true;
1922
1923 switch (MI.getOpcode()) {
1924 case AMDGPU::S_WAITCNT:
1925 case AMDGPU::S_WAITCNT_VSCNT:
1926 case AMDGPU::S_WAITCNT_VMCNT:
1927 case AMDGPU::S_WAITCNT_EXPCNT:
1928 case AMDGPU::S_WAITCNT_LGKMCNT:
1929 case AMDGPU::S_WAIT_IDLE:
1930 return true;
1931 default:
1932 break;
1933 }
1934
1935 return false;
1936 };
1937
1938 return FPAtomicToDenormModeWaitStates -
1939 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1940}
1941
1942int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1943 assert(SIInstrInfo::isMAI(*MI));
1944
1945 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1946}
1947
1948int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1949 // Early exit if no padding is requested.
1950 if (MFMAPaddingRatio == 0)
1951 return 0;
1952
1953 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1954 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
1955 return 0;
1956
1957 int NeighborMFMALatency = 0;
1958 auto IsNeighboringMFMA = [&NeighborMFMALatency,
1959 this](const MachineInstr &MI) {
1960 if (!SIInstrInfo::isMFMA(MI))
1961 return false;
1962
1963 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1964 return true;
1965 };
1966
1967 const int MaxMFMAPipelineWaitStates = 16;
1968 int WaitStatesSinceNeighborMFMA =
1969 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
1970
1971 int NeighborMFMAPaddingNeeded =
1972 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1973 WaitStatesSinceNeighborMFMA;
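// For example, with amdgpu-mfma-padding-ratio=50 and a neighboring 16-pass
// MFMA (NeighborMFMALatency = 16), up to 16 * 50 / 100 = 8 wait states are
// requested, less however many have already elapsed since that MFMA.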
1974
1975 return std::max(0, NeighborMFMAPaddingNeeded);
1976}
1977
1978int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1979 int WaitStatesNeeded = 0;
1980 unsigned Opc = MI->getOpcode();
1981
1982 auto IsVALUFn = [](const MachineInstr &MI) {
1983 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
1984 };
1985
1986 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1987 const int LegacyVALUWritesVGPRWaitStates = 2;
1988 const int VALUWritesExecWaitStates = 4;
1989 const int MaxWaitStates = 4;
1990
1991 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1992 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1993 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1994
1995 if (WaitStatesNeeded < MaxWaitStates) {
1996 for (const MachineOperand &Use : MI->explicit_uses()) {
1997 const int MaxWaitStates = 2;
1998
1999 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2000 continue;
2001
2002 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2003 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2004 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2005
2006 if (WaitStatesNeeded == MaxWaitStates)
2007 break;
2008 }
2009 }
2010 }
2011
2012 for (const MachineOperand &Op : MI->explicit_operands()) {
2013 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2014 continue;
2015
2016 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2017 continue;
2018
2019 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2020 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2021 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2022 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2023 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2024 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2025 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2026 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2027 const int MaxWaitStates = 18;
2028 Register Reg = Op.getReg();
2029 unsigned HazardDefLatency = 0;
2030
2031 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2032 this](const MachineInstr &MI) {
2033 if (!SIInstrInfo::isMFMA(MI))
2034 return false;
2035 Register DstReg = MI.getOperand(0).getReg();
2036 if (DstReg == Reg)
2037 return false;
2038 HazardDefLatency =
2039 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2040 return TRI.regsOverlap(DstReg, Reg);
2041 };
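// The producer's latency identifies its shape: 2 passes for 4x4, 8 for 16x16
// and 16 for 32x32 MFMAs, which selects among the constants below.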
2042
2043 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2044 MaxWaitStates);
2045 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2046 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2047 int OpNo = Op.getOperandNo();
2048 if (OpNo == SrcCIdx) {
2049 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2050 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2051 switch (HazardDefLatency) {
2052 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2053 break;
2054 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2055 break;
2056 case 16: [[fallthrough]];
2057 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2058 break;
2059 }
2060 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2061 switch (HazardDefLatency) {
2062 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2063 break;
2064 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2065 break;
2066 case 16: [[fallthrough]];
2067 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2068 break;
2069 }
2070 }
2071
2072 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2073 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2074
2075 if (WaitStatesNeeded == MaxWaitStates)
2076 return WaitStatesNeeded; // Early exit.
2077
2078 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2079 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2080 return false;
2081 Register DstReg = MI.getOperand(0).getReg();
2082 return TRI.regsOverlap(Reg, DstReg);
2083 };
2084
2085 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2086 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2087 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2088 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2089 if (OpNo == SrcCIdx)
2090 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2091 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2092 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2093
2094 WaitStatesNeededForUse = NeedWaitStates -
2095 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2096 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2097
2098 if (WaitStatesNeeded == MaxWaitStates)
2099 return WaitStatesNeeded; // Early exit.
2100 }
2101
2102 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2103 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2104 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2105 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2106 const int MaxWaitStates = 13;
2107 Register DstReg = MI->getOperand(0).getReg();
2108 unsigned HazardDefLatency = 0;
2109
2110 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2111 this](const MachineInstr &MI) {
2112 if (!SIInstrInfo::isMFMA(MI))
2113 return false;
2114 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2115 HazardDefLatency =
2116 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2117 return TRI.regsOverlap(Reg, DstReg);
2118 };
2119
2120 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2121 int NeedWaitStates;
2122 switch (HazardDefLatency) {
2123 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2124 break;
2125 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2126 break;
2127 case 16: [[fallthrough]];
2128 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2129 break;
2130 }
2131
2132 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2133 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2134 }
2135
2136 // Pad neighboring MFMA with noops for better inter-wave performance.
2137 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2138
2139 return WaitStatesNeeded;
2140}
2141
2142static int
2143 GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2144 // 2 pass -> 3
2145 // 4 pass -> 5
2146 // 8 pass -> 9
2147 // 16 pass -> 17
2148 return NumPasses + 1;
2149}
2150
2151static int
2152 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2153 // 2 pass -> 2
2154 // 4 pass -> 4
2155 // 8 pass -> 8
2156 // 16 pass -> 16
2157 return NumPasses;
2158}
2159
2160static int
2161 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2162 // 2 pass -> 4
2163 // 4 pass -> 6
2164 // 8 pass -> 10
2165 // 16 pass -> 18
2166 return NumPasses + 2;
2167}
2168
2169 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2170 // 2 pass -> 5
2171 // 4 pass -> 7
2172 // 8 pass -> 11
2173 // 16 pass -> 19
2174 return NumPasses + 3;
2175}
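// Taken together: the pass count refers to the producing MFMA, and an SMFMA
// producer requires NumPasses (overlapped src C read) or NumPasses + 2
// (src A/B read) wait states; an XDL producer requires one more in each case.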
2176
2177int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2178 int WaitStatesNeeded = 0;
2179 unsigned Opc = MI->getOpcode();
2180
2181 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2182 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2183 };
2184
2185 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2186 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2187 !SIInstrInfo::isDOT(MI);
2188 };
2189
2190 if (!SIInstrInfo::isMFMA(*MI))
2191 return WaitStatesNeeded;
2192
2193 const int VALUWritesExecWaitStates = 4;
2194 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2195 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2196 VALUWritesExecWaitStates);
2197 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2198
2199 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2200
2201 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2202 for (const MachineOperand &Use : MI->explicit_uses()) {
2203 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2204 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2205 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2206 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2207 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2208 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2209 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2210 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2211 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2212 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2213 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2214 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2215 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2216 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2217 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2218 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2219 const int MaxWaitStates = 19;
2220
2221 if (!Use.isReg())
2222 continue;
2223 Register Reg = Use.getReg();
2224 bool FullReg;
2225 const MachineInstr *MI1;
2226
2227 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2228 this](const MachineInstr &MI) {
2229 if (!SIInstrInfo::isMFMA(MI))
2230 return false;
2231 Register DstReg = MI.getOperand(0).getReg();
2232 FullReg = (DstReg == Reg);
2233 MI1 = &MI;
2234 return TRI.regsOverlap(DstReg, Reg);
2235 };
2236
2237 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2238 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2239 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2240
2241 int NumWaitStates =
2242 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2243 if (NumWaitStates == std::numeric_limits<int>::max())
2244 continue;
2245
2246 int OpNo = Use.getOperandNo();
2247 unsigned Opc1 = MI1->getOpcode();
2248 int NeedWaitStates = 0;
2249 if (OpNo == SrcCIdx) {
2250 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2251 NeedWaitStates = 0;
2252 } else if (FullReg) {
2253 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2254 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2255 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2256 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2257 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2258 else if (ST.hasGFX940Insts() &&
2259 TSchedModel.computeInstrLatency(MI1) == 2)
2260 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2261 } else {
2262 switch (Opc1) {
2263 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2264 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2265 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2266 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2267 if (!isXDL(ST, *MI))
2268 NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2269 break;
2270 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2271 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2272 if (!isXDL(ST, *MI))
2273 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2274 break;
2275 default:
2276 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2277 if (ST.hasGFX940Insts()) {
2278 if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2279 break;
2280
2281 NeedWaitStates =
2282 isXDL(ST, *MI1)
2283 ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2284 NumPasses)
2285 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2286 NumPasses);
2287 break;
2288 }
2289
2290 switch (NumPasses) {
2291 case 2:
2292 NeedWaitStates =
2293 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2294 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2295 break;
2296 case 8:
2297 NeedWaitStates =
2298 isDGEMM(Opc)
2299 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2300 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2301 break;
2302 case 16:
2303 NeedWaitStates =
2304 isDGEMM(Opc)
2305 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2306 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2307 break;
2308 default:
2309 llvm_unreachable("unexpected number of passes");
2310 }
2311 }
2312 }
2313 } else {
2314 switch (Opc1) {
2315 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2316 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2317 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2318 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2319 NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2320 break;
2321 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2322 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2323 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2324 break;
2325 default:
2326 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2327
2328 if (ST.hasGFX940Insts()) {
2329 NeedWaitStates =
2330 isXDL(ST, *MI1)
2331 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2332 NumPasses)
2333 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2334 NumPasses);
2335 break;
2336 }
2337
2338 switch (NumPasses) {
2339 case 2:
2340 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2341 break;
2342 case 4:
2343 llvm_unreachable("unexpected number of passes for mfma");
2344 case 8:
2345 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2346 break;
2347 case 16:
2348 default:
2349 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2350 }
2351 }
2352 }
2353 if (WaitStatesNeeded >= NeedWaitStates)
2354 continue;
2355
2356 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2357 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2358
2359 if (WaitStatesNeeded == MaxWaitStates)
2360 break;
2361 }
2362
2363 // Pad neighboring MFMA with noops for better inter-wave performance.
2364 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2365
2366 return WaitStatesNeeded;
2367}
2368
2369int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2370 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2371 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2372 return 0;
2373
2374 int WaitStatesNeeded = 0;
2375
2376 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2377 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2378 };
2379
2380 for (const MachineOperand &Op : MI->explicit_uses()) {
2381 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2382 continue;
2383
2384 Register Reg = Op.getReg();
2385
2386 const int AccVgprReadLdStWaitStates = 2;
2387 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2388 const int MaxWaitStates = 2;
2389
2390 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2391 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2392 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2393
2394 if (WaitStatesNeeded == MaxWaitStates)
2395 return WaitStatesNeeded; // Early exit.
2396
2397 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2398 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2399 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2400 return false;
2401 auto IsVALUFn = [](const MachineInstr &MI) {
2402 return SIInstrInfo::isVALU(MI);
2403 };
2404 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2405 std::numeric_limits<int>::max();
2406 };
2407
2408 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2409 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2410 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2411 }
2412
2413 return WaitStatesNeeded;
2414}
2415
2416 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2417 // 2 pass -> 4
2418 // 4 pass -> 6
2419 // 8 pass -> 10
2420 // 16 pass -> 18
2421 return NumPasses + 2;
2422}
2423
2424 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2425 // 2 pass -> 5
2426 // 4 pass -> 7
2427 // 8 pass -> 11
2428 // 16 pass -> 19
2429 return NumPasses + 3;
2430}
2431
2432 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2433 // 2 pass -> 5
2434 // 4 pass -> 7
2435 // 8 pass -> 11
2436 // 16 pass -> 19
2437 return NumPasses + 3;
2438}
2439
2440 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2441 // 2 pass -> 4
2442 // 4 pass -> 6
2443 // 8 pass -> 10
2444 // 16 pass -> 18
2445 return NumPasses + 2;
2446}
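// Same pattern for VALU consumers: relative to an N-pass producer, an SMFMA
// result needs N + 2 wait states and an XDL result N + 3 before a dependent
// VALU, memory or export read, or before a VALU write of the same VGPR.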
2447
2448int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2449 if (!ST.hasGFX90AInsts())
2450 return 0;
2451
2452 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2453 return isDGEMM(MI.getOpcode());
2454 };
2455
2456 // This is checked in checkMAIHazards90A()
2457 if (SIInstrInfo::isMFMA(*MI))
2458 return 0;
2459
2460 const MachineRegisterInfo &MRI = MF.getRegInfo();
2461
2462 int WaitStatesNeeded = 0;
2463
2464 bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2465 SIInstrInfo::isFLAT(*MI) ||
2466 SIInstrInfo::isDS(*MI);
2467 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2468 bool IsVALU = SIInstrInfo::isVALU(*MI);
2469
2470 const MachineInstr *MFMA = nullptr;
2471 unsigned Reg;
2472 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2473 if (!SIInstrInfo::isMFMA(MI) ||
2474 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2475 return false;
2476 MFMA = &MI;
2477 return true;
2478 };
2479
2480 const MachineInstr *DOT = nullptr;
2481 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2482 if (!SIInstrInfo::isDOT(MI) ||
2483 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2484 return false;
2485 DOT = &MI;
2486 return true;
2487 };
2488
2489 bool DGEMMAfterVALUWrite = false;
2490 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2491 // Found DGEMM on reverse traversal to def.
2492 if (isDGEMM(MI.getOpcode()))
2493 DGEMMAfterVALUWrite = true;
2494
2495 // Only hazard if the register is defined by a VALU and a DGEMM is found
2496 // after the def.
2497 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2498 return false;
2499
2500 return true;
2501 };
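// Note: the search walks backwards from the VMEM instruction, so seeing a
// DGEMM before reaching the VALU def corresponds to the program order
// VALU write -> DGEMM -> VMEM read, which is the problematic pattern.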
2502
2503 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2504 AMDGPU::OpName::src2);
2505
2506 if (IsMemOrExport || IsVALU) {
2507 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2508 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2509 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2510 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2511 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2512 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2513 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2514 const int DotWriteSameDotReadSrcAB = 3;
2515 const int DotWriteDifferentVALURead = 3;
2516 const int DMFMABetweenVALUWriteVMEMRead = 2;
2517 const int MaxWaitStates = 19;
2518
2519 for (const MachineOperand &Use : MI->explicit_uses()) {
2520 if (!Use.isReg())
2521 continue;
2522 Reg = Use.getReg();
2523
2524 DOT = nullptr;
2525 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2526 MaxWaitStates);
2527 if (DOT) {
2528 int NeedWaitStates = 0;
2529 if (DOT->getOpcode() == MI->getOpcode()) {
2530 if (&Use - &MI->getOperand(0) != SrcCIdx)
2531 NeedWaitStates = DotWriteSameDotReadSrcAB;
2532 } else {
2533 NeedWaitStates = DotWriteDifferentVALURead;
2534 }
2535
2536 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2537 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2538 }
2539
2540 // Workaround for HW data hazard bug observed only in GFX90A. When there
2541 // is a DGEMM instruction in-between a VALU and a VMEM instruction, it
2542 // causes the SQ to incorrectly omit the two wait states between the two
2543 // instructions that are needed to avoid the data hazard.
2544 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2545 DGEMMAfterVALUWrite = false;
2546 if (TRI.isVectorRegister(MRI, Reg)) {
2547 int WaitStatesNeededForUse =
2548 DMFMABetweenVALUWriteVMEMRead -
2549 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2550 DMFMABetweenVALUWriteVMEMRead);
2551
2552 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2553 }
2554 }
2555
2556 MFMA = nullptr;
2557 WaitStatesSinceDef =
2558 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2559 if (!MFMA)
2560 continue;
2561
2562 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2563 int NumPasses = HazardDefLatency;
2564 int NeedWaitStates = MaxWaitStates;
2565
2566 if (isDGEMM(MFMA->getOpcode())) {
2567 switch (HazardDefLatency) {
2568 case 4:
2569 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2570 : DMFMA4x4WriteVgprVALUReadWaitStates;
2571 break;
2572 case 8:
2573 case 16:
2574 NeedWaitStates = IsMemOrExport
2575 ? DMFMA16x16WriteVgprMemExpReadWaitStates
2576 : DMFMA16x16WriteVgprVALUReadWaitStates;
2577 break;
2578 default:
2579 llvm_unreachable("unexpected dgemm");
2580 }
2581 } else if (ST.hasGFX940Insts()) {
2582 NeedWaitStates =
2583 isXDL(ST, *MFMA)
2584 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2585 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2586 NumPasses);
2587 } else {
2588 switch (HazardDefLatency) {
2589 case 2:
2590 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2591 break;
2592 case 8:
2593 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2594 break;
2595 case 16:
2596 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2597 break;
2598 default:
2599 llvm_unreachable("unexpected number of passes for mfma");
2600 }
2601 }
2602
2603 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2604 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2605
2606 if (WaitStatesNeeded == MaxWaitStates)
2607 break;
2608 }
2609 }
2610
2611 unsigned Opc = MI->getOpcode();
2612 const int DMFMAToFMA64WaitStates = 2;
2613 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2614 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2615 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2616 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2617 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2618 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2619 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2620 }
2621
2622 if (!IsVALU && !IsMemOrExport)
2623 return WaitStatesNeeded;
2624
2625 for (const MachineOperand &Def : MI->defs()) {
2626 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2627 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2628 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2629 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2630 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2631 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2632 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2633 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2634 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2635 const int DotWriteDifferentVALUWrite = 3;
2636 const int MaxWaitStates = 19;
2637 const int MaxWarWaitStates = 15;
2638
2639 Reg = Def.getReg();
2640
2641 DOT = nullptr;
2642 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2643 MaxWaitStates);
2644 if (DOT && DOT->getOpcode() != MI->getOpcode())
2645 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2646 WaitStatesSinceDef);
2647
2648 MFMA = nullptr;
2649 WaitStatesSinceDef =
2650 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2651 if (MFMA) {
2652 int NeedWaitStates = MaxWaitStates;
2653 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2654
2655 if (isDGEMM(MFMA->getOpcode())) {
2656 switch (NumPasses) {
2657 case 4:
2658 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2659 break;
2660 case 8:
2661 case 16:
2662 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2663 break;
2664 default:
2665 llvm_unreachable("unexpected number of cycles for dgemm");
2666 }
2667 } else if (ST.hasGFX940Insts()) {
2668 NeedWaitStates =
2669 isXDL(ST, *MFMA)
2670 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2671 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2672 } else {
2673 switch (NumPasses) {
2674 case 2:
2675 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2676 break;
2677 case 8:
2678 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2679 break;
2680 case 16:
2681 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2682 break;
2683 default:
2684 llvm_unreachable("Unexpected number of passes for mfma");
2685 }
2686 }
2687
2688 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2689 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2690
2691 if (WaitStatesNeeded == MaxWaitStates)
2692 break;
2693 }
2694
2695 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2696 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2697 !MI.readsRegister(Reg, &TRI))
2698 return false;
2699
2700 if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2701 return false;
2702
2703 const MachineOperand *SrcC =
2704 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2705 assert(SrcC);
2706 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2707 return false;
2708
2709 MFMA = &MI;
2710 return true;
2711 };
2712
2713 MFMA = nullptr;
2714 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2715 MaxWarWaitStates);
2716 if (!MFMA)
2717 continue;
2718
2719 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2720 int NeedWaitStates = MaxWaitStates;
2721 switch (HazardDefLatency) {
2722 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2723 break;
2724 case 4: assert(ST.hasGFX940Insts());
2725 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2726 break;
2727 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2728 break;
2729 case 16: [[fallthrough]];
2730 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2731 break;
2732 }
2733
2734 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2735 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2736 }
2737
2738 return WaitStatesNeeded;
2739}
2740
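// Prefer another scheduling candidate when this one is an MFMA that would
// issue while a previously emitted MFMA is still in the pipeline (i.e. fewer
// wait states have elapsed than that MFMA's latency).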
2741 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2742 if (!SU->isInstr())
2743 return false;
2744
2745 const MachineInstr *MAI = nullptr;
2746
2747 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2748 MAI = nullptr;
2749 if (SIInstrInfo::isMFMA(MI))
2750 MAI = &MI;
2751 return MAI != nullptr;
2752 };
2753
2754 MachineInstr *MI = SU->getInstr();
2755 if (IsMFMAFn(*MI)) {
2756 int W = getWaitStatesSince(IsMFMAFn, 16);
2757 if (MAI)
2758 return W < (int)TSchedModel.computeInstrLatency(MAI);
2759 }
2760
2761 return false;
2762}
2763
2764bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2765 if (!ST.hasVALUMaskWriteHazard())
2766 return false;
2767 assert(!ST.hasExtendedWaitCounts());
2768
2769 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2770 return false;
2771
2772 // The hazard sequence is three instructions:
2773 // 1. VALU reads SGPR as mask
2774 // 2. SALU writes SGPR
2775 // 3. SALU reads SGPR
2776 // The hazard can expire if the distance between 2 and 3 is sufficient.
2777 // In practice this happens <10% of the time, hence this always assumes
2778 // the hazard exists if 1 and 2 are present to avoid searching.
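// A schematic example (the SGPR choice is illustrative):
//   v_cndmask_b32_e64 v0, v1, v2, s[2:3]   ; (1) VALU reads s[2:3] as mask
//   s_mov_b64         s[2:3], exec         ; (2) SALU writes s[2:3]
//   s_cmp_lg_u64      s[2:3], 0            ; (3) SALU reads s[2:3]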
2779
2780 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2781 if (!SDSTOp || !SDSTOp->isReg())
2782 return false;
2783
2784 const Register HazardReg = SDSTOp->getReg();
2785 if (HazardReg == AMDGPU::EXEC ||
2786 HazardReg == AMDGPU::EXEC_LO ||
2787 HazardReg == AMDGPU::EXEC_HI ||
2788 HazardReg == AMDGPU::M0)
2789 return false;
2790
2791 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2792 switch (I.getOpcode()) {
2793 case AMDGPU::V_ADDC_U32_e32:
2794 case AMDGPU::V_ADDC_U32_dpp:
2795 case AMDGPU::V_CNDMASK_B16_e32:
2796 case AMDGPU::V_CNDMASK_B16_dpp:
2797 case AMDGPU::V_CNDMASK_B32_e32:
2798 case AMDGPU::V_CNDMASK_B32_dpp:
2799 case AMDGPU::V_DIV_FMAS_F32_e64:
2800 case AMDGPU::V_DIV_FMAS_F64_e64:
2801 case AMDGPU::V_SUBB_U32_e32:
2802 case AMDGPU::V_SUBB_U32_dpp:
2803 case AMDGPU::V_SUBBREV_U32_e32:
2804 case AMDGPU::V_SUBBREV_U32_dpp:
2805 // These implicitly read VCC as mask source.
2806 return HazardReg == AMDGPU::VCC ||
2807 HazardReg == AMDGPU::VCC_LO ||
2808 HazardReg == AMDGPU::VCC_HI;
2809 case AMDGPU::V_ADDC_U32_e64:
2810 case AMDGPU::V_ADDC_U32_e64_dpp:
2811 case AMDGPU::V_CNDMASK_B16_e64:
2812 case AMDGPU::V_CNDMASK_B16_e64_dpp:
2813 case AMDGPU::V_CNDMASK_B32_e64:
2814 case AMDGPU::V_CNDMASK_B32_e64_dpp:
2815 case AMDGPU::V_SUBB_U32_e64:
2816 case AMDGPU::V_SUBB_U32_e64_dpp:
2817 case AMDGPU::V_SUBBREV_U32_e64:
2818 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2819 // Only check mask register overlaps.
2820 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2821 assert(SSRCOp);
2822 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2823 }
2824 default:
2825 return false;
2826 }
2827 };
2828
2829 const MachineRegisterInfo &MRI = MF.getRegInfo();
2830 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2831 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2832 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2833 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2834 return true;
2835
2836 // VALU access to any SGPR or literal constant other than HazardReg
2837 // mitigates hazard. No need to check HazardReg here as this will
2838 // only be called when !IsHazardFn.
2839 if (!SIInstrInfo::isVALU(I))
2840 return false;
2841 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2842 const MachineOperand &Op = I.getOperand(OpNo);
2843 if (Op.isReg()) {
2844 Register OpReg = Op.getReg();
2845 // Only consider uses
2846 if (!Op.isUse())
2847 continue;
2848 // Ignore EXEC
2849 if (OpReg == AMDGPU::EXEC ||
2850 OpReg == AMDGPU::EXEC_LO ||
2851 OpReg == AMDGPU::EXEC_HI)
2852 continue;
2853 // Ignore all implicit uses except VCC
2854 if (Op.isImplicit()) {
2855 if (OpReg == AMDGPU::VCC ||
2856 OpReg == AMDGPU::VCC_LO ||
2857 OpReg == AMDGPU::VCC_HI)
2858 return true;
2859 continue;
2860 }
2861 if (TRI.isSGPRReg(MRI, OpReg))
2862 return true;
2863 } else {
2864 const MCInstrDesc &InstDesc = I.getDesc();
2865 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2866 if (!TII.isInlineConstant(Op, OpInfo))
2867 return true;
2868 }
2869 }
2870 return false;
2871 };
2872
2873 // Check for hazard
2874 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2875 std::numeric_limits<int>::max())
2876 return false;
2877
2878 auto NextMI = std::next(MI->getIterator());
2879
2880 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2881 BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2882 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2883 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2884
2885 // SALU write may be s_getpc in a bundle.
2886 if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2887 // Update offsets of any references in the bundle.
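// The inserted s_waitcnt_depctr is 4 bytes long, so PC-relative global
// references computed from the preceding s_getpc_b64 must be advanced by 4.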
2888 while (NextMI != MI->getParent()->end() &&
2889 NextMI->isBundledWithPred()) {
2890 for (auto &Operand : NextMI->operands()) {
2891 if (Operand.isGlobal())
2892 Operand.setOffset(Operand.getOffset() + 4);
2893 }
2894 NextMI++;
2895 }
2896 }
2897
2898 return true;
2899}
2900
2901static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
2902 const SIInstrInfo &TII) {
2903 MachineBasicBlock &EntryMBB = MF->front();
2904 if (EntryMBB.begin() != EntryMBB.end()) {
2905 auto &EntryMI = *EntryMBB.begin();
2906 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
2907 EntryMI.getOperand(0).getImm() >= Priority)
2908 return false;
2909 }
2910
2911 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
2912 .addImm(Priority);
2913 return true;
2914}
2915
2916bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
2917 if (!ST.hasRequiredExportPriority())
2918 return false;
2919
2920 // Assume the following shader types will never have exports,
2921 // and avoid adding or adjusting S_SETPRIO.
2922 MachineBasicBlock *MBB = MI->getParent();
2923 MachineFunction *MF = MBB->getParent();
2924 auto CC = MF->getFunction().getCallingConv();
2925 switch (CC) {
2926 case CallingConv::AMDGPU_CS:
2927 case CallingConv::AMDGPU_CS_Chain:
2928 case CallingConv::AMDGPU_CS_ChainPreserve:
2929 case CallingConv::AMDGPU_KERNEL:
2930 return false;
2931 default:
2932 break;
2933 }
2934
2935 const int MaxPriority = 3;
2936 const int NormalPriority = 2;
2937 const int PostExportPriority = 0;
2938
2939 auto It = MI->getIterator();
2940 switch (MI->getOpcode()) {
2941 case AMDGPU::S_ENDPGM:
2942 case AMDGPU::S_ENDPGM_SAVED:
2943 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
2944 case AMDGPU::SI_RETURN_TO_EPILOG:
2945 // Ensure a shader with calls raises priority at entry.
2946 // This ensures correct priority if exports exist in a callee.
2947 if (MF->getFrameInfo().hasCalls())
2948 return ensureEntrySetPrio(MF, NormalPriority, TII);
2949 return false;
2950 case AMDGPU::S_SETPRIO: {
2951 // Raise minimum priority unless in workaround.
2952 auto &PrioOp = MI->getOperand(0);
2953 int Prio = PrioOp.getImm();
2954 bool InWA = (Prio == PostExportPriority) &&
2955 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
2956 if (InWA || Prio >= NormalPriority)
2957 return false;
2958 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
2959 return true;
2960 }
2961 default:
2962 if (!TII.isEXP(*MI))
2963 return false;
2964 break;
2965 }
2966
2967 // Check entry priority at each export (as there will only be a few).
2968 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
2969 bool Changed = false;
2970 if (CC != CallingConv::AMDGPU_Gfx)
2971 Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
2972
2973 auto NextMI = std::next(It);
2974 bool EndOfShader = false;
2975 if (NextMI != MBB->end()) {
2976 // Only need WA at end of sequence of exports.
2977 if (TII.isEXP(*NextMI))
2978 return Changed;
2979 // Assume appropriate S_SETPRIO after export means WA already applied.
2980 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
2981 NextMI->getOperand(0).getImm() == PostExportPriority)
2982 return Changed;
2983 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
2984 }
2985
2986 const DebugLoc &DL = MI->getDebugLoc();
2987
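// The workaround sequence emitted after the last export is, schematically:
//   s_setprio 0
//   s_waitcnt_expcnt null, 0x0   ; skipped at the end of the shader
//   s_nop 0
//   s_nop 0
//   s_setprio 2                  ; skipped at the end of the shader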
2988 // Lower priority.
2989 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
2990 .addImm(PostExportPriority);
2991
2992 if (!EndOfShader) {
2993 // Wait for exports to complete.
2994 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
2995 .addReg(AMDGPU::SGPR_NULL)
2996 .addImm(0);
2997 }
2998
2999 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3000 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3001
3002 if (!EndOfShader) {
3003 // Return to normal (higher) priority.
3004 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3005 .addImm(NormalPriority);
3006 }
3007
3008 return true;
3009}