LLVM 19.0.0git
GCNHazardRecognizer.cpp
Go to the documentation of this file.
1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
20
21using namespace llvm;
22
23namespace {
24
// Command-line parser for the MFMA padding ratio: accepts an unsigned
// integer percentage and rejects values outside [0, 100].
struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  // Returns false on success (cl::parser convention); reports an option
  // error and returns true otherwise.
  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    // Reject anything that is not a valid unsigned integer.
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    // The value is a percentage, so clamp the accepted range to [0, 100].
    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};
38
39} // end anonymous namespace
40
42 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
43 cl::desc("Fill a percentage of the latency between "
44 "neighboring MFMA with s_nops."));
45
46//===----------------------------------------------------------------------===//
47// Hazard Recognizer Implementation
48//===----------------------------------------------------------------------===//
49
51 const GCNSubtarget &ST);
52
54 IsHazardRecognizerMode(false),
55 CurrCycleInstr(nullptr),
56 MF(MF),
57 ST(MF.getSubtarget<GCNSubtarget>()),
58 TII(*ST.getInstrInfo()),
59 TRI(TII.getRegisterInfo()),
60 ClauseUses(TRI.getNumRegUnits()),
61 ClauseDefs(TRI.getNumRegUnits()) {
62 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
63 TSchedModel.init(&ST);
64 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
65}
66
68 EmittedInstrs.clear();
69}
70
73}
74
76 CurrCycleInstr = MI;
77}
78
79static bool isDivFMas(unsigned Opcode) {
80 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
81}
82
83static bool isSGetReg(unsigned Opcode) {
84 return Opcode == AMDGPU::S_GETREG_B32;
85}
86
87static bool isSSetReg(unsigned Opcode) {
88 switch (Opcode) {
89 case AMDGPU::S_SETREG_B32:
90 case AMDGPU::S_SETREG_B32_mode:
91 case AMDGPU::S_SETREG_IMM32_B32:
92 case AMDGPU::S_SETREG_IMM32_B32_mode:
93 return true;
94 }
95 return false;
96}
97
98static bool isRWLane(unsigned Opcode) {
99 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
100}
101
102static bool isRFE(unsigned Opcode) {
103 return Opcode == AMDGPU::S_RFE_B64;
104}
105
106static bool isSMovRel(unsigned Opcode) {
107 switch (Opcode) {
108 case AMDGPU::S_MOVRELS_B32:
109 case AMDGPU::S_MOVRELS_B64:
110 case AMDGPU::S_MOVRELD_B32:
111 case AMDGPU::S_MOVRELD_B64:
112 return true;
113 default:
114 return false;
115 }
116}
117
// True if \p Opcode is classified as DGEMM by the generated MAI instruction
// table.
static bool isDGEMM(unsigned Opcode) {
  return AMDGPU::getMAIIsDGEMM(Opcode);
}
121
122static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
123 unsigned Opcode = MI.getOpcode();
124
125 if (!SIInstrInfo::isMAI(MI) ||
126 isDGEMM(Opcode) ||
127 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
128 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
129 return false;
130
131 if (!ST.hasGFX940Insts())
132 return true;
133
134 return AMDGPU::getMAIIsGFX940XDL(Opcode);
135}
136
138 const MachineInstr &MI) {
139 if (TII.isAlwaysGDS(MI.getOpcode()))
140 return true;
141
142 switch (MI.getOpcode()) {
143 case AMDGPU::S_SENDMSG:
144 case AMDGPU::S_SENDMSGHALT:
145 case AMDGPU::S_TTRACEDATA:
146 return true;
147 // These DS opcodes don't support GDS.
148 case AMDGPU::DS_NOP:
149 case AMDGPU::DS_PERMUTE_B32:
150 case AMDGPU::DS_BPERMUTE_B32:
151 return false;
152 default:
153 if (TII.isDS(MI.getOpcode())) {
154 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
155 AMDGPU::OpName::gds);
156 if (MI.getOperand(GDS).getImm())
157 return true;
158 }
159 return false;
160 }
161}
162
163static bool isPermlane(const MachineInstr &MI) {
164 unsigned Opcode = MI.getOpcode();
165 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE64_B32 ||
167 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
170}
171
172static bool isLdsDma(const MachineInstr &MI) {
173 return SIInstrInfo::isVALU(MI) &&
175}
176
177static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
178 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
179 AMDGPU::OpName::simm16);
180 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
181}
182
185 MachineInstr *MI = SU->getInstr();
186 // If we are not in "HazardRecognizerMode" and therefore not being run from
187 // the scheduler, track possible stalls from hazards but don't insert noops.
188 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
189
190 if (MI->isBundle())
191 return NoHazard;
192
193 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
194 return HazardType;
195
196 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
197 return HazardType;
198
199 if (checkFPAtomicToDenormModeHazard(MI) > 0)
200 return HazardType;
201
202 if (ST.hasNoDataDepHazard())
203 return NoHazard;
204
205 // FIXME: Should flat be considered vmem?
206 if ((SIInstrInfo::isVMEM(*MI) ||
208 && checkVMEMHazards(MI) > 0)
209 return HazardType;
210
211 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
212 return HazardType;
213
214 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
215 return HazardType;
216
217 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
218 return HazardType;
219
220 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
221 return HazardType;
222
225 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
226 return HazardType;
227
228 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
229 return HazardType;
230
231 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
232 return HazardType;
233
234 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
235 return HazardType;
236
237 if (((ST.hasReadM0MovRelInterpHazard() &&
238 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
239 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
240 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
242 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
244 MI->readsRegister(AMDGPU::LDS_DIRECT))) &&
245 checkReadM0Hazards(MI) > 0)
246 return HazardType;
247
248 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
249 return HazardType;
250
251 if ((SIInstrInfo::isVMEM(*MI) ||
253 SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
254 return HazardType;
255
256 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
257 return HazardType;
258
259 return NoHazard;
260}
261
263 unsigned Quantity) {
264 while (Quantity > 0) {
265 unsigned Arg = std::min(Quantity, 8u);
266 Quantity -= Arg;
267 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
268 .addImm(Arg - 1);
269 }
270}
271
272unsigned
273GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
274 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
275 assert(TSchedModel.getWriteProcResBegin(SC) !=
276 TSchedModel.getWriteProcResEnd(SC));
277 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
278}
279
280void GCNHazardRecognizer::processBundle() {
281 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
283 // Check bundled MachineInstr's for hazards.
284 for (; MI != E && MI->isInsideBundle(); ++MI) {
285 CurrCycleInstr = &*MI;
286 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
287
288 if (IsHazardRecognizerMode) {
289 fixHazards(CurrCycleInstr);
290
291 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
292 }
293
294 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
295 // include the bundled MI directly after, only add a maximum of
296 // (MaxLookAhead - 1) noops to EmittedInstrs.
297 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
298 EmittedInstrs.push_front(nullptr);
299
300 EmittedInstrs.push_front(CurrCycleInstr);
301 EmittedInstrs.resize(MaxLookAhead);
302 }
303 CurrCycleInstr = nullptr;
304}
305
306void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
307 assert(IsHazardRecognizerMode);
308
309 unsigned NumPreNoops = PreEmitNoops(MI);
310 EmitNoops(NumPreNoops);
311 if (MI->isInsideBundle())
312 insertNoopsInBundle(MI, TII, NumPreNoops);
313 else
314 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
315 NumPreNoops);
317 AdvanceCycle();
318}
319
321 IsHazardRecognizerMode = true;
322 CurrCycleInstr = MI;
323 unsigned W = PreEmitNoopsCommon(MI);
324 fixHazards(MI);
325 CurrCycleInstr = nullptr;
326 return W;
327}
328
330 if (MI->isBundle())
331 return 0;
332
333 int WaitStates = 0;
334
336 return std::max(WaitStates, checkSMRDHazards(MI));
337
338 if (ST.hasNSAtoVMEMBug())
339 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
340
341 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
342
343 if (ST.hasNoDataDepHazard())
344 return WaitStates;
345
347 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
348
350 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
351
353 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
354
355 if (isDivFMas(MI->getOpcode()))
356 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
357
358 if (isRWLane(MI->getOpcode()))
359 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
360
363 SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
364 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
365
366 if (MI->isInlineAsm())
367 return std::max(WaitStates, checkInlineAsmHazards(MI));
368
369 if (isSGetReg(MI->getOpcode()))
370 return std::max(WaitStates, checkGetRegHazards(MI));
371
372 if (isSSetReg(MI->getOpcode()))
373 return std::max(WaitStates, checkSetRegHazards(MI));
374
375 if (isRFE(MI->getOpcode()))
376 return std::max(WaitStates, checkRFEHazards(MI));
377
378 if ((ST.hasReadM0MovRelInterpHazard() &&
379 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
380 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
381 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
383 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
384 (ST.hasReadM0LdsDirectHazard() && MI->readsRegister(AMDGPU::LDS_DIRECT)))
385 return std::max(WaitStates, checkReadM0Hazards(MI));
386
388 return std::max(WaitStates, checkMAIHazards(MI));
389
390 if (SIInstrInfo::isVMEM(*MI) ||
393 return std::max(WaitStates, checkMAILdStHazards(MI));
394
395 return WaitStates;
396}
397
399 EmittedInstrs.push_front(nullptr);
400}
401
403 // When the scheduler detects a stall, it will call AdvanceCycle() without
404 // emitting any instructions.
405 if (!CurrCycleInstr) {
406 EmittedInstrs.push_front(nullptr);
407 return;
408 }
409
410 if (CurrCycleInstr->isBundle()) {
411 processBundle();
412 return;
413 }
414
415 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
416 if (!NumWaitStates) {
417 CurrCycleInstr = nullptr;
418 return;
419 }
420
421 // Keep track of emitted instructions
422 EmittedInstrs.push_front(CurrCycleInstr);
423
424 // Add a nullptr for each additional wait state after the first. Make sure
425 // not to add more than getMaxLookAhead() items to the list, since we
426 // truncate the list to that size right after this loop.
427 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
428 i < e; ++i) {
429 EmittedInstrs.push_front(nullptr);
430 }
431
432 // getMaxLookahead() is the largest number of wait states we will ever need
433 // to insert, so there is no point in keeping track of more than that many
434 // wait states.
435 EmittedInstrs.resize(getMaxLookAhead());
436
437 CurrCycleInstr = nullptr;
438}
439
441 llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
442}
443
444//===----------------------------------------------------------------------===//
445// Helper Functions
446//===----------------------------------------------------------------------===//
447
449
450typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
451typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
452
453// Search for a hazard in a block and its predecessors.
454template <typename StateT>
455static bool
456hasHazard(StateT State,
457 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
458 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
459 const MachineBasicBlock *MBB,
462 for (auto E = MBB->instr_rend(); I != E; ++I) {
463 // No need to look at parent BUNDLE instructions.
464 if (I->isBundle())
465 continue;
466
467 switch (IsHazard(State, *I)) {
468 case HazardFound:
469 return true;
470 case HazardExpired:
471 return false;
472 default:
473 // Continue search
474 break;
475 }
476
477 if (I->isInlineAsm() || I->isMetaInstruction())
478 continue;
479
480 UpdateState(State, *I);
481 }
482
483 for (MachineBasicBlock *Pred : MBB->predecessors()) {
484 if (!Visited.insert(Pred).second)
485 continue;
486
487 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
488 Visited))
489 return true;
490 }
491
492 return false;
493}
494
495// Returns a minimum wait states since \p I walking all predecessors.
496// Only scans until \p IsExpired does not return true.
497// Can only be run in a hazard recognizer mode.
503 for (auto E = MBB->instr_rend(); I != E; ++I) {
504 // Don't add WaitStates for parent BUNDLE instructions.
505 if (I->isBundle())
506 continue;
507
508 if (IsHazard(*I))
509 return WaitStates;
510
511 if (I->isInlineAsm())
512 continue;
513
514 WaitStates += GetNumWaitStates(*I);
515
516 if (IsExpired(*I, WaitStates))
517 return std::numeric_limits<int>::max();
518 }
519
520 int MinWaitStates = std::numeric_limits<int>::max();
521 for (MachineBasicBlock *Pred : MBB->predecessors()) {
522 if (!Visited.insert(Pred).second)
523 continue;
524
525 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
526 IsExpired, Visited, GetNumWaitStates);
527
528 MinWaitStates = std::min(MinWaitStates, W);
529 }
530
531 return MinWaitStates;
532}
533
535 const MachineInstr *MI, IsExpiredFn IsExpired) {
537 return getWaitStatesSince(IsHazard, MI->getParent(),
538 std::next(MI->getReverseIterator()),
539 0, IsExpired, Visited);
540}
541
// Count wait states between the current instruction and the most recent
// instruction matching \p IsHazard, giving up (returning INT_MAX) once
// \p Limit wait states have been scanned without a match.
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  // In hazard-recognizer mode, walk the machine IR backwards from
  // CurrCycleInstr (across block boundaries) via the free-standing helper.
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  // Otherwise scan the tracked EmittedInstrs list. Each entry costs one wait
  // state (nullptr entries are stall cycles), except inline asm, which
  // contributes none.
  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  // No matching instruction within Limit wait states.
  return std::numeric_limits<int>::max();
}
566
567int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
568 IsHazardFn IsHazardDef,
569 int Limit) {
570 const SIRegisterInfo *TRI = ST.getRegisterInfo();
571
572 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
573 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
574 };
575
576 return getWaitStatesSince(IsHazardFn, Limit);
577}
578
579int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
580 int Limit) {
581 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
582 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
583 };
584
585 return getWaitStatesSince(IsHazardFn, Limit);
586}
587
588//===----------------------------------------------------------------------===//
589// No-op Hazard Detection
590//===----------------------------------------------------------------------===//
591
// Set the bit for every register unit of \p Reg in the bit vector \p BV.
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(Unit);
}
597
598static void addRegsToSet(const SIRegisterInfo &TRI,
600 BitVector &DefSet, BitVector &UseSet) {
601 for (const MachineOperand &Op : Ops) {
602 if (Op.isReg())
603 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
604 }
605}
606
// Record \p MI's register operands in the current clause's def/use sets.
void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}
610
612 return !SIInstrInfo::isSMRD(*MI);
613}
614
617}
618
619int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
620 // SMEM soft clause are only present on VI+, and only matter if xnack is
621 // enabled.
622 if (!ST.isXNACKEnabled())
623 return 0;
624
625 bool IsSMRD = TII.isSMRD(*MEM);
626
627 resetClause();
628
629 // A soft-clause is any group of consecutive SMEM instructions. The
630 // instructions in this group may return out of order and/or may be
631 // replayed (i.e. the same instruction issued more than once).
632 //
633 // In order to handle these situations correctly we need to make sure that
634 // when a clause has more than one instruction, no instruction in the clause
635 // writes to a register that is read by another instruction in the clause
636 // (including itself). If we encounter this situation, we need to break the
637 // clause by inserting a non SMEM instruction.
638
639 for (MachineInstr *MI : EmittedInstrs) {
640 // When we hit a non-SMEM instruction then we have passed the start of the
641 // clause and we can stop.
642 if (!MI)
643 break;
644
646 break;
647
648 addClauseInst(*MI);
649 }
650
651 if (ClauseDefs.none())
652 return 0;
653
654 // We need to make sure not to put loads and stores in the same clause if they
655 // use the same address. For now, just start a new clause whenever we see a
656 // store.
657 if (MEM->mayStore())
658 return 1;
659
660 addClauseInst(*MEM);
661
662 // If the set of defs and uses intersect then we cannot add this instruction
663 // to the clause, so we have a hazard.
664 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
665}
666
667int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
668 int WaitStatesNeeded = 0;
669
670 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
671
672 // This SMRD hazard only affects SI.
673 if (!ST.hasSMRDReadVALUDefHazard())
674 return WaitStatesNeeded;
675
676 // A read of an SGPR by SMRD instruction requires 4 wait states when the
677 // SGPR was written by a VALU instruction.
678 int SmrdSgprWaitStates = 4;
679 auto IsHazardDefFn = [this](const MachineInstr &MI) {
680 return TII.isVALU(MI);
681 };
682 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
683 return TII.isSALU(MI);
684 };
685
686 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
687
688 for (const MachineOperand &Use : SMRD->uses()) {
689 if (!Use.isReg())
690 continue;
691 int WaitStatesNeededForUse =
692 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
693 SmrdSgprWaitStates);
694 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
695
696 // This fixes what appears to be undocumented hardware behavior in SI where
697 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
698 // needs some number of nops in between. We don't know how many we need, but
699 // let's use 4. This wasn't discovered before probably because the only
700 // case when this happens is when we expand a 64-bit pointer into a full
701 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
702 // probably never encountered in the closed-source land.
703 if (IsBufferSMRD) {
704 int WaitStatesNeededForUse =
705 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
706 IsBufferHazardDefFn,
707 SmrdSgprWaitStates);
708 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
709 }
710 }
711
712 return WaitStatesNeeded;
713}
714
715int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
717 return 0;
718
719 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
720
721 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
722 // SGPR was written by a VALU Instruction.
723 const int VmemSgprWaitStates = 5;
724 auto IsHazardDefFn = [this](const MachineInstr &MI) {
725 return TII.isVALU(MI);
726 };
727 for (const MachineOperand &Use : VMEM->uses()) {
728 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
729 continue;
730
731 int WaitStatesNeededForUse =
732 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
733 VmemSgprWaitStates);
734 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
735 }
736 return WaitStatesNeeded;
737}
738
739int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
740 const SIRegisterInfo *TRI = ST.getRegisterInfo();
741 const SIInstrInfo *TII = ST.getInstrInfo();
742
743 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
744 int DppVgprWaitStates = 2;
745 int DppExecWaitStates = 5;
746 int WaitStatesNeeded = 0;
747 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
748 return TII->isVALU(MI);
749 };
750
751 for (const MachineOperand &Use : DPP->uses()) {
752 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
753 continue;
754 int WaitStatesNeededForUse =
755 DppVgprWaitStates - getWaitStatesSinceDef(
756 Use.getReg(),
757 [](const MachineInstr &) { return true; },
758 DppVgprWaitStates);
759 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
760 }
761
762 WaitStatesNeeded = std::max(
763 WaitStatesNeeded,
764 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
765 DppExecWaitStates));
766
767 return WaitStatesNeeded;
768}
769
770int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
771 const SIInstrInfo *TII = ST.getInstrInfo();
772
773 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
774 // instruction.
775 const int DivFMasWaitStates = 4;
776 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
777 return TII->isVALU(MI);
778 };
779 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
780 DivFMasWaitStates);
781
782 return DivFMasWaitStates - WaitStatesNeeded;
783}
784
785int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
786 const SIInstrInfo *TII = ST.getInstrInfo();
787 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
788
789 const int GetRegWaitStates = 2;
790 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
791 return GetRegHWReg == getHWReg(TII, MI);
792 };
793 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
794
795 return GetRegWaitStates - WaitStatesNeeded;
796}
797
798int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
799 const SIInstrInfo *TII = ST.getInstrInfo();
800 unsigned HWReg = getHWReg(TII, *SetRegInstr);
801
802 const int SetRegWaitStates = ST.getSetRegWaitStates();
803 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
804 return HWReg == getHWReg(TII, MI);
805 };
806 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
807 return SetRegWaitStates - WaitStatesNeeded;
808}
809
// Returns the index of the store-data operand through which \p MI can create
// a VALU hazard (a store whose data could be overwritten by the following
// instruction), or -1 if it cannot create one.
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  // Only stores can create this hazard.
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  // FLAT stores hazard via their data operand when it is wider than 64 bits.
  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}
858
859int
860GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
861 const MachineRegisterInfo &MRI) {
862 // Helper to check for the hazard where VMEM instructions that store more than
863 // 8 bytes can have there store data over written by the next instruction.
864 const SIRegisterInfo *TRI = ST.getRegisterInfo();
865
866 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
867 int WaitStatesNeeded = 0;
868
869 if (!TRI->isVectorRegister(MRI, Def.getReg()))
870 return WaitStatesNeeded;
871 Register Reg = Def.getReg();
872 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
873 int DataIdx = createsVALUHazard(MI);
874 return DataIdx >= 0 &&
875 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
876 };
877 int WaitStatesNeededForDef =
878 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
879 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
880
881 return WaitStatesNeeded;
882}
883
884int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
885 int WaitStatesNeeded = 0;
886
888 const int TransDefWaitstates = 1;
889
890 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
892 return false;
893 const SIRegisterInfo *TRI = ST.getRegisterInfo();
894 const SIInstrInfo *TII = ST.getInstrInfo();
895 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
896
897 for (const MachineOperand &Use : VALU->explicit_uses()) {
898 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
899 return true;
900 }
901
902 return false;
903 };
904
905 int WaitStatesNeededForDef =
906 TransDefWaitstates -
907 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
908 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
909 }
910
911 if (ST.hasDstSelForwardingHazard()) {
912 const int Shift16DefWaitstates = 1;
913
914 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
916 return false;
917 const SIInstrInfo *TII = ST.getInstrInfo();
918 if (SIInstrInfo::isSDWA(MI)) {
919 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
920 if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
921 return false;
922 } else {
923 if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
924 !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
925 ->getImm() &
927 return false;
928 }
929 const SIRegisterInfo *TRI = ST.getRegisterInfo();
930 if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
931 Register Def = Dst->getReg();
932
933 for (const MachineOperand &Use : VALU->explicit_uses()) {
934 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
935 return true;
936 }
937 }
938
939 return false;
940 };
941
942 int WaitStatesNeededForDef =
943 Shift16DefWaitstates -
944 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
945 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
946 }
947
948 if (ST.hasVDecCoExecHazard()) {
949 const int VALUWriteSGPRVALUReadWaitstates = 2;
950 const int VALUWriteEXECRWLane = 4;
951 const int VALUWriteVGPRReadlaneRead = 1;
952
953 const SIRegisterInfo *TRI = ST.getRegisterInfo();
954 const MachineRegisterInfo &MRI = MF.getRegInfo();
956 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
958 return false;
959 return MI.modifiesRegister(UseReg, TRI);
960 };
961
962 for (const MachineOperand &Use : VALU->explicit_uses()) {
963 if (!Use.isReg())
964 continue;
965
966 UseReg = Use.getReg();
967 if (TRI->isSGPRReg(MRI, UseReg)) {
968 int WaitStatesNeededForDef =
969 VALUWriteSGPRVALUReadWaitstates -
970 getWaitStatesSince(IsVALUDefSGPRFn,
971 VALUWriteSGPRVALUReadWaitstates);
972 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
973 }
974 }
975
976 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
977 UseReg = AMDGPU::VCC;
978 int WaitStatesNeededForDef =
979 VALUWriteSGPRVALUReadWaitstates -
980 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
981 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
982 }
983
984 switch (VALU->getOpcode()) {
985 case AMDGPU::V_READLANE_B32:
986 case AMDGPU::V_READFIRSTLANE_B32: {
987 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
988 UseReg = Src->getReg();
989 int WaitStatesNeededForDef =
990 VALUWriteVGPRReadlaneRead -
991 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
992 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
993 }
994 [[fallthrough]];
995 case AMDGPU::V_WRITELANE_B32: {
996 UseReg = AMDGPU::EXEC;
997 int WaitStatesNeededForDef =
998 VALUWriteEXECRWLane -
999 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1000 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1001 break;
1002 }
1003 default:
1004 break;
1005 }
1006 }
1007
1008 // This checks for the hazard where VMEM instructions that store more than
1009 // 8 bytes can have there store data over written by the next instruction.
1010 if (!ST.has12DWordStoreHazard())
1011 return WaitStatesNeeded;
1012
1013 const MachineRegisterInfo &MRI = MF.getRegInfo();
1014
1015 for (const MachineOperand &Def : VALU->defs()) {
1016 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1017 }
1018
1019 return WaitStatesNeeded;
1020}
1021
1022int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1023 // This checks for hazards associated with inline asm statements.
1024 // Since inline asms can contain just about anything, we use this
1025 // to call/leverage other check*Hazard routines. Note that
1026 // this function doesn't attempt to address all possible inline asm
1027 // hazards (good luck), but is a collection of what has been
1028 // problematic thus far.
1029
1030 // see checkVALUHazards()
1031 if (!ST.has12DWordStoreHazard())
1032 return 0;
1033
1034 const MachineRegisterInfo &MRI = MF.getRegInfo();
1035 int WaitStatesNeeded = 0;
1036
1037 for (const MachineOperand &Op :
1039 if (Op.isReg() && Op.isDef()) {
1040 WaitStatesNeeded =
1041 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1042 }
1043 }
1044
1045 return WaitStatesNeeded;
1046}
1047
1048int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1049 const SIInstrInfo *TII = ST.getInstrInfo();
1050 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1051 const MachineRegisterInfo &MRI = MF.getRegInfo();
1052
1053 const MachineOperand *LaneSelectOp =
1054 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1055
1056 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1057 return 0;
1058
1059 Register LaneSelectReg = LaneSelectOp->getReg();
1060 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1061
1062 const int RWLaneWaitStates = 4;
1063 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1064 RWLaneWaitStates);
1065 return RWLaneWaitStates - WaitStatesSince;
1066}
1067
1068int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1069 if (!ST.hasRFEHazards())
1070 return 0;
1071
1072 const SIInstrInfo *TII = ST.getInstrInfo();
1073
1074 const int RFEWaitStates = 1;
1075
1076 auto IsHazardFn = [TII](const MachineInstr &MI) {
1077 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1078 };
1079 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1080 return RFEWaitStates - WaitStatesNeeded;
1081}
1082
1083int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1084 const SIInstrInfo *TII = ST.getInstrInfo();
1085 const int ReadM0WaitStates = 1;
1086 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1087 return ReadM0WaitStates -
1088 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1089}
1090
// Driver for hazard fixups that rewrite the instruction stream (insert
// waits/nops or mutate operands), as opposed to the check* routines which
// only report required wait states. Each fix* routine checks its own
// subtarget predicate and returns early when it does not apply, so the
// calls are unconditional here except where noted.
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  // LDSDIR-related hazards only exist on subtargets with LDS-direct loads.
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixWMMAHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
}
1107
1108bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1109 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1110 return false;
1111
1112 const SIInstrInfo *TII = ST.getInstrInfo();
1113 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1114 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1115 return (TII->isVOPC(MI) ||
1116 ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1117 MI.modifiesRegister(AMDGPU::EXEC, TRI);
1118 };
1119
1120 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1121 unsigned Opc = MI.getOpcode();
1122 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1123 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1124 };
1125
1126 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1127 std::numeric_limits<int>::max())
1128 return false;
1129
1130 // V_NOP will be discarded by SQ.
1131 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1132 // which is always a VGPR and available.
1133 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1134 Register Reg = Src0->getReg();
1135 bool IsUndef = Src0->isUndef();
1136 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1137 TII->get(AMDGPU::V_MOV_B32_e32))
1138 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1139 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1140
1141 return true;
1142}
1143
// Fix the VMEM-to-scalar-write hazard: a scalar write whose destination is
// still in use by an in-flight VMEM access must be preceded by a wait.
// Inserts an s_waitcnt_depctr before MI when a hazard is found; returns
// true if a fix was emitted.
// NOTE(review): the leading subtarget/instruction-class guard lines are not
// visible in this view — confirm against the full source.
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
    return false;

    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Hazard: an earlier instruction that still uses any register MI defines.
  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  // The hazard expires at any VALU, a zero s_waitcnt, or an
  // s_waitcnt_depctr whose vm_vsrc field is zero.
  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // Insert the mitigating wait before MI.
  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  return true;
}
1190
// Fix the SMEM-to-vector-write hazard: a VALU that writes an SGPR (sdst,
// or vdst for v_readlane/v_readfirstlane) which a preceding SMRD still
// reads. When detected, insert "s_mov_b32 null, 0" before MI to break the
// dependency chain. Returns true if a fix was emitted.
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  // Pick the operand that carries the scalar destination of this VALU.
  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    // No named scalar destination: fall back to the first implicit SGPR def.
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  // Hazard: an SMRD that still reads the scalar destination register.
  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        // DsCnt corresponds to LGKMCnt here.
        return (Decoded.DsCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // Insert a write to null as a chain-breaking SALU.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}
1277
// Fix the vcmpx EXEC write-after-read hazard: a VALU writing EXEC while an
// earlier instruction still reads EXEC. Mitigated by inserting an
// s_waitcnt_depctr before MI. Returns true if a fix was emitted.
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  // Only EXEC-writing VALUs are affected.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  // Hazard: a preceding reader of EXEC.
  auto IsHazardFn = [TRI](const MachineInstr &I) {
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  // The hazard expires at a VALU that writes any SGPR (named sdst or an
  // implicit SGPR def), or an s_waitcnt_depctr with sa_sdst==0.
  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  return true;
}
1320
                                                 const GCNSubtarget &ST) {
  // Only run the (whole-function) fixup when the subtarget has the hazard.
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and VMEM
  // instructions need to appear in the same function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |=
      if (HasLds && HasVmem)
        return true;
    }
  }
  // One of the two instruction kinds never appears: hazard is impossible.
  return false;
}
1341
  // True for "s_waitcnt_vscnt null, 0": a wait that drains the store
  // counter completely and therefore clears the VMEM-store side of the
  // LDS/branch/VMEM WAR hazard.
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}
1347
// Fix the LDS/branch/VMEM WAR hazard: an LDS (DS) access and a VMEM access
// separated by a branch, in either order, requires an
// "s_waitcnt_vscnt null, 0" between them. The search looks backwards for a
// branch, then behind the branch for an instruction of the *other* kind.
// Returns true if a wait was inserted.
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;


  // Classify an instruction: 1 = DS (LDS), 2 = the opposing kind, 0 = neither.
  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  // Outer search stops at another hazard-class instruction or a full
  // store-count wait.
  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    // Inner search (behind the branch): look for the opposite instruction
    // kind before the window is closed.
    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // Drain the store counter to break the WAR dependence.
  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
    .addImm(0);

  return true;
}
1404
// Compute and encode the waitvdst count on an LDSDIR instruction so that it
// waits for any VALU that reads or writes its vdst register. Always returns
// true once the guard passes, since the waitvdst operand is updated in place
// rather than a new instruction being inserted.
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
    return false;

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  // Track whether any transcendental VALU was seen in the window.
  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(I))
      return false;
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
    // Cover both WAR and WAW
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
    // Instructions which cause va_vdst==0 expire hazard
  };
  // Only VALUs contribute to the va_vdst count.
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) ? 1 : 0;
  };

  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,
                                    IsExpiredFn, Visited, GetWaitStatesFn);

  // Transcendentals can execute in parallel to other VALUs.
  // This makes va_vdst count unusable with a mixture of VALU and TRANS.
  if (VisitedTrans)
    Count = 0;

  // Encode the resulting count (clamped) into the waitvdst operand.
  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

  return true;
}
1448
// Fix the LDSDIR-vs-VMEM hazard on the vdst register of an LDSDIR
// instruction. On subtargets whose LDSDIR can wait on vmsrc, the fix sets
// the instruction's own waitvsrc field to 0; otherwise an s_waitcnt_depctr
// is inserted before MI. Returns true if a fix was applied.
bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
    return false;

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  // Hazard: an earlier instruction that reads or writes the LDSDIR's vdst.
  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
      return false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
  // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
  // according to the type of VMEM instruction.
  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
    (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
           (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  if (LdsdirCanWait) {
    // Let the LDSDIR itself wait on vmsrc.
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
  } else {
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
  }

  return true;
}
1488
// Fix the VALU partial-forwarding hazard (wave64 only): a VALU that
// consumes two VGPR sources produced on opposite sides of an SALU write of
// EXEC, within tight VALU-count windows, needs an s_waitcnt_depctr before
// it. The backwards state machine below detects the pattern described in
// the comment block inside the function.
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
    return false;

  if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
    return false;


  // Collect the unique VGPR sources of MI.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Only applies with >= 2 unique VGPR sources
  if (SrcVGPRs.size() <= 1)
    return false;

  // Look for the following pattern:
  //   Va <- VALU [PreExecPos]
  //   intv1
  //   Exec <- SALU [ExecPos]
  //   intv2
  //   Vb <- VALU [PostExecPos]
  //   intv3
  //   MI Va, Vb (WaitState = 0)
  //
  // Where:
  //   intv1 + intv2 <= 2 VALUs
  //   intv3 <= 4 VALUs
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  // Positions are measured in VALU counts from MI going backwards.
  struct StateType {
    int ExecPos = std::numeric_limits<int>::max();
    int VALUs = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track registers writes
    bool Changed = false;
    if (SIInstrInfo::isVALU(I)) {
      for (Register Src : SrcVGPRs) {
        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
          State.DefPos[Src] = State.VALUs;
          Changed = true;
        }
      }
    } else if (SIInstrInfo::isSALU(I)) {
      // Record only the first (closest) EXEC write after a source def.
      if (State.ExecPos == std::numeric_limits<int>::max()) {
        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
          State.ExecPos = State.VALUs;
          Changed = true;
        }
      }
    }

    // Early expiration: too many VALUs in intv3
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

    // Only evaluate state if something changed
    if (!Changed)
      return NoHazardFound;

    // Determine positions of VALUs pre/post exec change
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else
          PostExecPos = std::min(PostExecPos, DefVALUs);
      }
    }

    // Need a VALU def post exec change
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv3?
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv2?
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Need a VALU def pre exec change
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv1?
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv1 + intv2
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    return HazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
      State.VALUs += 1;
  };

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  // Mitigate by waiting on all depctr fields.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0x0fff);

  return true;
}
1637
// Fix the VALU-trans-use hazard: a VALU consuming a VGPR written by a
// transcendental (TRANS) VALU within a short window (<= 5 VALUs / 1 TRANS)
// needs an s_waitcnt_depctr on va_vdst before it. Returns true if a wait
// was inserted.
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  SmallSet<Register, 4> SrcVGPRs;

  // Collect the unique VGPR sources of MI.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Look for the following pattern:
  //   Va <- TRANS VALU
  //   intv
  //   MI Va (WaitState = 0)
  //
  // Where:
  //   intv <= 5 VALUs / 1 TRANS
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  // Counts of VALU/TRANS instructions seen while scanning backwards.
  struct StateType {
    int VALUs = 0;
    int TRANS = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         I.getOperand(0).getImm() == 0x0fff))
      return HazardExpired;

    // Track registers writes
    if (SIInstrInfo::isTRANS(I)) {
      for (Register Src : SrcVGPRs) {
        if (I.modifiesRegister(Src, &TRI)) {
          return HazardFound;
        }
      }
    }

    return NoHazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
      State.VALUs += 1;
      State.TRANS += 1;
  };

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
  // avoided.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))

  return true;
}
1717
// Fix WMMA-to-WMMA hazards: back-to-back WMMA/SWMMAC instructions where the
// previous instruction's destination (matrix D) overlaps this instruction's
// A/B sources (or, on GFX12+, a SWMMAC index operand) require an
// intervening VALU; insert a V_NOP when none is present. Returns true if a
// nop was inserted.
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
      return false;

    // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
    // with the dest(matrix D) of the previous wmma.
    const Register CurSrc0Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    const Register CurSrc1Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    const Register PrevDstReg =
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
      return true;
    }

    // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
    // but Index can't overlap with PrevDstReg.
    if (AMDGPU::isGFX12Plus(ST)) {
      if (SIInstrInfo::isSWMMAC(*MI)) {
        const Register CurIndex =
            TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
        if (TRI->regsOverlap(PrevDstReg, CurIndex))
          return true;
      }
      return false;
    }

    return false;
  };

  // Any VALU between the two WMMAs clears the hazard.
  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

  return true;
}
1771
// Work around the 64-bit shift high-register hardware bug: when the shift
// amount of a 64-bit shift lives in the last VGPR of an allocation block
// (vN where (N & 7) == 7) and the next register is unused, the amount is
// temporarily swapped into a different VGPR around the shift using
// V_SWAP_B32, and the instruction's operands are rewritten to use the
// temporary. Returns true if the instruction was rewritten.
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;

  // Only the VOP3 64-bit shifts are affected.
  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if this is a last VGPR in the allocation block.
  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  // The workaround needs the next VGPR to be free (it is swapped through).
  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
    return false;

  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
  bool Overlapped = OverlappedSrc || OverlappedDst;

  assert(!OverlappedDst || !OverlappedSrc ||
         Src1->getReg() == MI->getOperand(0).getReg());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  // Pick a scratch register (a 64-bit aligned pair when the amount overlaps
  // src1/dst) that the instruction itself neither reads nor writes.
  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  DebugLoc DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();
  // Insert a full wait count because found register might be pending a wait.
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
      .addImm(0);

  // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
  if (Overlapped)
    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
            .addDef(AmtReg - 1)
            .addReg(AmtReg - 1, RegState::Undef)
            .addReg(NewAmtLo, RegState::Undef));
  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
                       .addDef(AmtReg)
                       .addReg(AmtReg, RegState::Undef)
                       .addReg(NewAmt, RegState::Undef));

  // Instructions emitted after the current instruction will be processed by the
  // parent loop of the hazard recognizer in a natural way.
  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
          AmtReg)
      .addDef(NewAmt)
      .addReg(NewAmt)
      .addReg(AmtReg);
  if (Overlapped)
    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
            AmtReg - 1)
        .addDef(NewAmtLo)
        .addReg(NewAmtLo)
        .addReg(AmtReg - 1);

  // Re-running hazard recognizer on the modified instruction is not necessary,
  // inserted V_SWAP_B32 has already both read and write new registers so
  // hazards related to these register has already been handled.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  // We do not update liveness, so verifier may see it as undef.
  Amt->setIsUndef();
  if (OverlappedDst)
    MI->getOperand(0).setReg(NewReg);
  if (OverlappedSrc) {
    Src1->setReg(NewReg);
    Src1->setIsKill(false);
    Src1->setIsUndef();
  }

  return true;
}
1874
// Compute wait states needed between a large (>= 16 byte) GFX10 NSA-encoded
// MIMG instruction and a following VMEM access whose offset has bits 1-2
// set. Returns <= 0 when no wait is required.
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  // Only offsets with bits 1 or 2 set trigger the bug.
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  // Hazard: a preceding NSA-encoded MIMG of at least 16 bytes.
  auto IsHazardFn = [TII](const MachineInstr &I) {
    if (!SIInstrInfo::isMIMG(I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}
1899
// Compute wait states needed between an FP atomic and a following
// s_denorm_mode. The hazard expires after 3 wait states, any VALU, or one
// of the listed wait instructions. Returns <= 0 when no wait is required.
int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

    return 0;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  // Hazard: a preceding FP atomic memory operation.
  auto IsHazardFn = [](const MachineInstr &I) {
      return false;
    return SIInstrInfo::isFPAtomic(I);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
      return true;

    // Any of these wait instructions also clears the hazard.
    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}
1938
// Dispatch MAI (matrix/MFMA) hazard checking to the handler matching the
// subtarget generation: GFX90A-style or GFX908-style.
int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {

  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
}
1944
// Compute s_nop padding between neighboring MFMA instructions, controlled
// by the amdgpu-mfma-padding-ratio option: fill the requested percentage of
// the previous MFMA's latency with wait states. Returns 0 when padding is
// disabled, MI is not an MFMA, or occupancy is too low to benefit.
int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
  // Early exit if no padding is requested.
  if (MFMAPaddingRatio == 0)
    return 0;

  if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
    return 0;

  // Record the pipeline latency of the closest preceding MFMA while searching.
  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
                            this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI))
      return false;

    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
    return true;
  };

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  // Pad up to the requested fraction of the neighbor's latency, less the
  // distance already separating the two MFMAs.
  int NeighborMFMAPaddingNeeded =
      (NeighborMFMALatency * MFMAPaddingRatio / 100) -
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
}
1974
// GFX908 MAI hazard checking. Computes the wait states required before an
// MFMA, v_accvgpr_write, or v_accvgpr_read based on: recent VALU writes of
// EXEC or VGPR sources, overlapping AGPR writes by earlier MFMAs (latency
// dependent), earlier v_accvgpr_writes, and (for v_accvgpr_write) earlier
// MFMAs reading the destination as src C. Also folds in optional MFMA
// padding from checkMFMAPadding().
int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  // Inline asm is conservatively treated as a VALU.
  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    // VALU write of EXEC shortly before an MFMA/accvgpr_write.
    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      // VALU write of any VGPR source used by MI.
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  // AGPR operand hazards against earlier MFMAs and v_accvgpr_writes.
  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    // Required distances, keyed by the producing MFMA's pipeline latency
    // (2 = 4x4, 8 = 16x16, 16 = 32x32) and by how the AGPR is consumed.
    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    // Hazard: an earlier MFMA whose destination overlaps (but is not equal
    // to) this AGPR. Records the producer's latency as a side effect.
    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    // Hazard: an earlier v_accvgpr_write whose destination overlaps this AGPR.
    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    // Hazard: an earlier MFMA reading the accvgpr_write destination as src C.
    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
2138
2139int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2140 int WaitStatesNeeded = 0;
2141 unsigned Opc = MI->getOpcode();
2142
2143 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2145 };
2146
2147 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2150 };
2151
2152 if (!SIInstrInfo::isMFMA(*MI))
2153 return WaitStatesNeeded;
2154
2155 const int VALUWritesExecWaitStates = 4;
2156 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2157 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2158 VALUWritesExecWaitStates);
2159 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2160
2161 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2162
2163 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2164 for (const MachineOperand &Use : MI->explicit_uses()) {
2165 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2166 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2167 const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3;
2168 const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5;
2169 const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4;
2170 const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9;
2171 const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8;
2172 const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17;
2173 const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16;
2174 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2175 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2176 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2177 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2178 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2179 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2180 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2181 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2182 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2183 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2184 const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4;
2185 const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6;
2186 const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10;
2187 const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18;
2188 const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5;
2189 const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7;
2190 const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11;
2191 const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19;
2192 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2193 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2194 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2195 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2196 const int MaxWaitStates = 19;
2197
2198 if (!Use.isReg())
2199 continue;
2200 Register Reg = Use.getReg();
2201 bool FullReg;
2202 const MachineInstr *MI1;
2203
2204 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2205 this](const MachineInstr &MI) {
2206 if (!SIInstrInfo::isMFMA(MI))
2207 return false;
2208 Register DstReg = MI.getOperand(0).getReg();
2209 FullReg = (DstReg == Reg);
2210 MI1 = &MI;
2211 return TRI.regsOverlap(DstReg, Reg);
2212 };
2213
2214 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2215 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2216 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2217
2218 int NumWaitStates =
2219 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2220 if (NumWaitStates == std::numeric_limits<int>::max())
2221 continue;
2222
2223 int OpNo = Use.getOperandNo();
2224 unsigned Opc1 = MI1->getOpcode();
2225 int NeedWaitStates = 0;
2226 if (OpNo == SrcCIdx) {
2227 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2228 NeedWaitStates = 0;
2229 } else if (FullReg) {
2230 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2231 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2232 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2233 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2234 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2235 else if (ST.hasGFX940Insts() &&
2236 TSchedModel.computeInstrLatency(MI1) == 2)
2237 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2238 } else {
2239 switch (Opc1) {
2240 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2241 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2242 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2243 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2244 if (!isXDL(ST, *MI))
2245 NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2246 break;
2247 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2248 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2249 if (!isXDL(ST, *MI))
2250 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2251 break;
2252 default:
2253 if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1))
2254 break;
2255 switch (TSchedModel.computeInstrLatency(MI1)) {
2256 case 2:
2257 NeedWaitStates = ST.hasGFX940Insts()
2258 ? isXDL(ST, *MI1)
2259 ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates
2260 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates
2261 : isDGEMM(Opc)
2262 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2263 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2264 break;
2265 case 4:
2266 assert(ST.hasGFX940Insts());
2267 NeedWaitStates = isXDL(ST, *MI1)
2268 ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates
2269 : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates;
2270 break;
2271 case 8:
2272 NeedWaitStates = ST.hasGFX940Insts()
2273 ? isXDL(ST, *MI1)
2274 ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates
2275 : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates
2276 : isDGEMM(Opc)
2277 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2278 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2279 break;
2280 case 16: [[fallthrough]];
2281 default:
2282 NeedWaitStates = ST.hasGFX940Insts()
2283 ? isXDL(ST, *MI1)
2284 ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates
2285 : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates
2286 : isDGEMM(Opc)
2287 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2288 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2289 }
2290 }
2291 }
2292 } else {
2293 switch (Opc1) {
2294 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2295 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2296 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2297 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2298 NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2299 break;
2300 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2301 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2302 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2303 break;
2304 default:
2305 switch (TSchedModel.computeInstrLatency(MI1)) {
2306 case 2:
2307 NeedWaitStates = ST.hasGFX940Insts()
2308 ? isXDL(ST, *MI1)
2309 ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates
2310 : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates
2311 : SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2312 break;
2313 case 4:
2314 assert(ST.hasGFX940Insts());
2315 NeedWaitStates = isXDL(ST, *MI1)
2316 ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates
2317 : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates;
2318 break;
2319 case 8:
2320 NeedWaitStates = ST.hasGFX940Insts()
2321 ? isXDL(ST, *MI1)
2322 ? GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates
2323 : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates
2324 : SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2325 break;
2326 case 16: [[fallthrough]];
2327 default:
2328 NeedWaitStates = ST.hasGFX940Insts()
2329 ? isXDL(ST, *MI1)
2330 ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates
2331 : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates
2332 : SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2333 }
2334 }
2335 }
2336 if (WaitStatesNeeded >= NeedWaitStates)
2337 continue;
2338
2339 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2340 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2341
2342 if (WaitStatesNeeded == MaxWaitStates)
2343 break;
2344 }
2345
2346 return WaitStatesNeeded;
2347}
2348
2349int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2350 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2351 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2352 return 0;
2353
2354 int WaitStatesNeeded = 0;
2355
2356 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2357 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2358 };
2359
2360 for (const MachineOperand &Op : MI->explicit_uses()) {
2361 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2362 continue;
2363
2364 Register Reg = Op.getReg();
2365
2366 const int AccVgprReadLdStWaitStates = 2;
2367 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2368 const int MaxWaitStates = 2;
2369
2370 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2371 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2372 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2373
2374 if (WaitStatesNeeded == MaxWaitStates)
2375 return WaitStatesNeeded; // Early exit.
2376
2377 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2378 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2379 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2380 return false;
2381 auto IsVALUFn = [](const MachineInstr &MI) {
2383 };
2384 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2385 std::numeric_limits<int>::max();
2386 };
2387
2388 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2389 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2390 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2391 }
2392
2393 return WaitStatesNeeded;
2394}
2395
2396int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2397 if (!ST.hasGFX90AInsts())
2398 return 0;
2399
2400 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2401 return isDGEMM(MI.getOpcode());
2402 };
2403
2404 // This is checked in checkMAIHazards90A()
2405 if (SIInstrInfo::isMFMA(*MI))
2406 return 0;
2407
2408 const MachineRegisterInfo &MRI = MF.getRegInfo();
2409
2410 int WaitStatesNeeded = 0;
2411
2412 bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2415 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2416 bool IsVALU = SIInstrInfo::isVALU(*MI);
2417
2418 const MachineInstr *MFMA = nullptr;
2419 unsigned Reg;
2420 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2421 if (!SIInstrInfo::isMFMA(MI) ||
2422 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2423 return false;
2424 MFMA = &MI;
2425 return true;
2426 };
2427
2428 const MachineInstr *DOT = nullptr;
2429 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2430 if (!SIInstrInfo::isDOT(MI) ||
2431 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2432 return false;
2433 DOT = &MI;
2434 return true;
2435 };
2436
2437 bool DGEMMAfterVALUWrite = false;
2438 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2439 // Found DGEMM on reverse traversal to def.
2440 if (isDGEMM(MI.getOpcode()))
2441 DGEMMAfterVALUWrite = true;
2442
2443 // Only hazard if register is defined by a VALU and a DGEMM is found after
2444 // after the def.
2445 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2446 return false;
2447
2448 return true;
2449 };
2450
2451 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2452 AMDGPU::OpName::src2);
2453
2454 if (IsMemOrExport || IsVALU) {
2455 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2456 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2457 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2458 const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4;
2459 const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6;
2460 const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10;
2461 const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18;
2462 const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5;
2463 const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7;
2464 const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11;
2465 const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19;
2466 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2467 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2468 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2469 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2470 const int DotWriteSameDotReadSrcAB = 3;
2471 const int DotWriteDifferentVALURead = 3;
2472 const int DMFMABetweenVALUWriteVMEMRead = 2;
2473 const int MaxWaitStates = 19;
2474
2475 for (const MachineOperand &Use : MI->explicit_uses()) {
2476 if (!Use.isReg())
2477 continue;
2478 Reg = Use.getReg();
2479
2480 DOT = nullptr;
2481 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2482 MaxWaitStates);
2483 if (DOT) {
2484 int NeedWaitStates = 0;
2485 if (DOT->getOpcode() == MI->getOpcode()) {
2486 if (&Use - &MI->getOperand(0) != SrcCIdx)
2487 NeedWaitStates = DotWriteSameDotReadSrcAB;
2488 } else {
2489 NeedWaitStates = DotWriteDifferentVALURead;
2490 }
2491
2492 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2493 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2494 }
2495
2496 // Workaround for HW data hazard bug observed only in GFX90A. When there
2497 // is a DGEMM instruction in-between a VALU and a VMEM instruction it
2498 // causes the SQ to incorrectly not insert two wait states between the two
2499 // instructions needed to avoid data hazard.
2500 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2501 DGEMMAfterVALUWrite = false;
2502 if (TRI.isVectorRegister(MRI, Reg)) {
2503 int WaitStatesNeededForUse =
2504 DMFMABetweenVALUWriteVMEMRead -
2505 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2506 DMFMABetweenVALUWriteVMEMRead);
2507
2508 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2509 }
2510 }
2511
2512 MFMA = nullptr;
2513 WaitStatesSinceDef =
2514 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2515 if (!MFMA)
2516 continue;
2517
2518 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2519 int NeedWaitStates = MaxWaitStates;
2520 switch (HazardDefLatency) {
2521 case 2:
2522 NeedWaitStates =
2523 ST.hasGFX940Insts()
2524 ? isXDL(ST, *MFMA)
2525 ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates
2526 : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates
2527 : SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2528 break;
2529 case 4:
2530 assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2531 NeedWaitStates =
2532 isDGEMM(MFMA->getOpcode())
2533 ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2534 : DMFMA4x4WriteVgprVALUReadWaitStates
2535 : isXDL(ST, *MFMA)
2536 ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates
2537 : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates;
2538 break;
2539 case 8:
2540 NeedWaitStates =
2541 ST.hasGFX940Insts()
2542 ? isXDL(ST, *MFMA)
2543 ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
2544 : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
2545 : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2546 break;
2547 case 16: [[fallthrough]];
2548 default:
2549 NeedWaitStates =
2550 isDGEMM(MFMA->getOpcode())
2551 ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
2552 : DMFMA16x16WriteVgprVALUReadWaitStates
2553 : ST.hasGFX940Insts()
2554 ? isXDL(ST, *MFMA)
2555 ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
2556 : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
2557 : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2558 break;
2559 }
2560
2561 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2562 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2563
2564 if (WaitStatesNeeded == MaxWaitStates)
2565 break;
2566 }
2567 }
2568
2569 unsigned Opc = MI->getOpcode();
2570 const int DMFMAToFMA64WaitStates = 2;
2571 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2572 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2573 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2574 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2575 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2576 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2577 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2578 }
2579
2580 if (!IsVALU && !IsMemOrExport)
2581 return WaitStatesNeeded;
2582
2583 for (const MachineOperand &Def : MI->defs()) {
2584 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2585 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2586 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2587 const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4;
2588 const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6;
2589 const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10;
2590 const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18;
2591 const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5;
2592 const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7;
2593 const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11;
2594 const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19;
2595 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2596 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2597 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2598 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2599 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2600 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2601 const int DotWriteDifferentVALUWrite = 3;
2602 const int MaxWaitStates = 19;
2603 const int MaxWarWaitStates = 15;
2604
2605 Reg = Def.getReg();
2606
2607 DOT = nullptr;
2608 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2609 MaxWaitStates);
2610 if (DOT && DOT->getOpcode() != MI->getOpcode())
2611 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2612 WaitStatesSinceDef);
2613
2614 MFMA = nullptr;
2615 WaitStatesSinceDef =
2616 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2617 if (MFMA) {
2618 int NeedWaitStates = MaxWaitStates;
2619 switch (TSchedModel.computeInstrLatency(MFMA)) {
2620 case 2:
2621 NeedWaitStates = ST.hasGFX940Insts()
2622 ? isXDL(ST, *MFMA)
2623 ? GFX940_XDL2PassWriteVgprVALUWawWaitStates
2624 : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates
2625 : SMFMA4x4WriteVgprVALUWawWaitStates;
2626 break;
2627 case 4:
2628 assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2629 NeedWaitStates = isDGEMM(MFMA->getOpcode())
2630 ? DMFMA4x4WriteVgprVALUWriteWaitStates
2631 : isXDL(ST, *MFMA)
2632 ? GFX940_XDL4PassWriteVgprVALUWawWaitStates
2633 : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
2634 break;
2635 case 8:
2636 NeedWaitStates = ST.hasGFX940Insts()
2637 ? isXDL(ST, *MFMA)
2638 ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
2639 : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
2640 : SMFMA16x16WriteVgprVALUWawWaitStates;
2641 break;
2642 case 16: [[fallthrough]];
2643 default:
2644 NeedWaitStates = isDGEMM(MFMA->getOpcode())
2645 ? DMFMA16x16WriteVgprVALUWriteWaitStates
2646 : ST.hasGFX940Insts()
2647 ? isXDL(ST, *MFMA)
2648 ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
2649 : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
2650 : SMFMA32x32WriteVgprVALUWawWaitStates;
2651 break;
2652 }
2653
2654 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2655 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2656
2657 if (WaitStatesNeeded == MaxWaitStates)
2658 break;
2659 }
2660
2661 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2662 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2663 !MI.readsRegister(Reg, &TRI))
2664 return false;
2665
2666 if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2667 return false;
2668
2669 const MachineOperand *SrcC =
2670 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2671 assert(SrcC);
2672 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2673 return false;
2674
2675 MFMA = &MI;
2676 return true;
2677 };
2678
2679 MFMA = nullptr;
2680 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2681 MaxWarWaitStates);
2682 if (!MFMA)
2683 continue;
2684
2685 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2686 int NeedWaitStates = MaxWaitStates;
2687 switch (HazardDefLatency) {
2688 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2689 break;
2690 case 4: assert(ST.hasGFX940Insts());
2691 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2692 break;
2693 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2694 break;
2695 case 16: [[fallthrough]];
2696 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2697 break;
2698 }
2699
2700 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2701 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2702 }
2703
2704 return WaitStatesNeeded;
2705}
2706
2708 if (!SU->isInstr())
2709 return false;
2710
2711 const MachineInstr *MAI = nullptr;
2712
2713 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2714 MAI = nullptr;
2716 MAI = &MI;
2717 return MAI != nullptr;
2718 };
2719
2720 MachineInstr *MI = SU->getInstr();
2721 if (IsMFMAFn(*MI)) {
2722 int W = getWaitStatesSince(IsMFMAFn, 16);
2723 if (MAI)
2724 return W < (int)TSchedModel.computeInstrLatency(MAI);
2725 }
2726
2727 return false;
2728}
2729
2730bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2731 if (!ST.hasVALUMaskWriteHazard())
2732 return false;
2734
2735 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2736 return false;
2737
2738 // The hazard sequence is three instructions:
2739 // 1. VALU reads SGPR as mask
2740 // 2. SALU writes SGPR
2741 // 3. SALU reads SGPR
2742 // The hazard can expire if the distance between 2 and 3 is sufficient.
2743 // In practice this happens <10% of the time, hence this always assumes
2744 // the hazard exists if 1 and 2 are present to avoid searching.
2745
2746 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2747 if (!SDSTOp || !SDSTOp->isReg())
2748 return false;
2749
2750 const Register HazardReg = SDSTOp->getReg();
2751 if (HazardReg == AMDGPU::EXEC ||
2752 HazardReg == AMDGPU::EXEC_LO ||
2753 HazardReg == AMDGPU::EXEC_HI ||
2754 HazardReg == AMDGPU::M0)
2755 return false;
2756
2757 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2758 switch (I.getOpcode()) {
2759 case AMDGPU::V_ADDC_U32_e32:
2760 case AMDGPU::V_ADDC_U32_dpp:
2761 case AMDGPU::V_CNDMASK_B16_e32:
2762 case AMDGPU::V_CNDMASK_B16_dpp:
2763 case AMDGPU::V_CNDMASK_B32_e32:
2764 case AMDGPU::V_CNDMASK_B32_dpp:
2765 case AMDGPU::V_DIV_FMAS_F32_e64:
2766 case AMDGPU::V_DIV_FMAS_F64_e64:
2767 case AMDGPU::V_SUBB_U32_e32:
2768 case AMDGPU::V_SUBB_U32_dpp:
2769 case AMDGPU::V_SUBBREV_U32_e32:
2770 case AMDGPU::V_SUBBREV_U32_dpp:
2771 // These implicitly read VCC as mask source.
2772 return HazardReg == AMDGPU::VCC ||
2773 HazardReg == AMDGPU::VCC_LO ||
2774 HazardReg == AMDGPU::VCC_HI;
2775 case AMDGPU::V_ADDC_U32_e64:
2776 case AMDGPU::V_ADDC_U32_e64_dpp:
2777 case AMDGPU::V_CNDMASK_B16_e64:
2778 case AMDGPU::V_CNDMASK_B16_e64_dpp:
2779 case AMDGPU::V_CNDMASK_B32_e64:
2780 case AMDGPU::V_CNDMASK_B32_e64_dpp:
2781 case AMDGPU::V_SUBB_U32_e64:
2782 case AMDGPU::V_SUBB_U32_e64_dpp:
2783 case AMDGPU::V_SUBBREV_U32_e64:
2784 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2785 // Only check mask register overlaps.
2786 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2787 assert(SSRCOp);
2788 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2789 }
2790 default:
2791 return false;
2792 }
2793 };
2794
2795 const MachineRegisterInfo &MRI = MF.getRegInfo();
2796 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2797 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2798 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2799 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2800 return true;
2801
2802 // VALU access to any SGPR or literal constant other than HazardReg
2803 // mitigates hazard. No need to check HazardReg here as this will
2804 // only be called when !IsHazardFn.
2805 if (!SIInstrInfo::isVALU(I))
2806 return false;
2807 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2808 const MachineOperand &Op = I.getOperand(OpNo);
2809 if (Op.isReg()) {
2810 Register OpReg = Op.getReg();
2811 // Only consider uses
2812 if (!Op.isUse())
2813 continue;
2814 // Ignore EXEC
2815 if (OpReg == AMDGPU::EXEC ||
2816 OpReg == AMDGPU::EXEC_LO ||
2817 OpReg == AMDGPU::EXEC_HI)
2818 continue;
2819 // Ignore all implicit uses except VCC
2820 if (Op.isImplicit()) {
2821 if (OpReg == AMDGPU::VCC ||
2822 OpReg == AMDGPU::VCC_LO ||
2823 OpReg == AMDGPU::VCC_HI)
2824 return true;
2825 continue;
2826 }
2827 if (TRI.isSGPRReg(MRI, OpReg))
2828 return true;
2829 } else {
2830 const MCInstrDesc &InstDesc = I.getDesc();
2831 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2832 if (!TII.isInlineConstant(Op, OpInfo))
2833 return true;
2834 }
2835 }
2836 return false;
2837 };
2838
2839 // Check for hazard
2840 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2841 std::numeric_limits<int>::max())
2842 return false;
2843
2844 auto NextMI = std::next(MI->getIterator());
2845
2846 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2847 BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2848 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2850
2851 // SALU write may be s_getpc in a bundle.
2852 if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2853 // Update offsets of any references in the bundle.
2854 while (NextMI != MI->getParent()->end() &&
2855 NextMI->isBundledWithPred()) {
2856 for (auto &Operand : NextMI->operands()) {
2857 if (Operand.isGlobal())
2858 Operand.setOffset(Operand.getOffset() + 4);
2859 }
2860 NextMI++;
2861 }
2862 }
2863
2864 return true;
2865}
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
bool End
Definition: ELF_riscv.cpp:480
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI)
static bool isLdsDma(const MachineInstr &MI)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isSSetReg(unsigned Opcode)
static bool hasHazard(StateT State, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, DenseSet< const MachineBasicBlock * > &Visited)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
@ HazardExpired
@ NoHazardFound
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
static bool isDGEMM(unsigned Opcode)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
static bool isPermlane(const MachineInstr &MI)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
if(VerifyEach)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static const uint32_t IV[8]
Definition: blake3_impl.h:78
bool anyCommon(const BitVector &RHS) const
Test if any common bits are set.
Definition: BitVector.h:489
BitVector & set()
Definition: BitVector.h:351
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be schedule.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
unsigned PreEmitNoopsCommon(MachineInstr *)
bool ShouldPreferAnother(SUnit *SU) override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
GCNHazardRecognizer(const MachineFunction &MF)
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
bool hasShift64HighRegBug() const
bool hasFPAtomicToDenormModeHazard() const
bool hasLdsBranchVmemWARHazard() const
bool hasGFX90AInsts() const
bool hasDstSelForwardingHazard() const
bool hasMAIInsts() const
Definition: GCNSubtarget.h:788
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:250
bool hasVALUMaskWriteHazard() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasVcmpxExecWARHazard() const
bool hasReadM0MovRelInterpHazard() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:262
bool hasLdsWaitVMSRC() const
bool hasExtendedWaitCounts() const
bool hasVcmpxPermlaneHazard() const
bool has12DWordStoreHazard() const
bool hasVALUPartialForwardingHazard() const
bool hasNoDataDepHazard() const
Definition: GCNSubtarget.h:862
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
Definition: GCNSubtarget.h:493
bool hasTransForwardingHazard() const
bool hasGFX940Insts() const
bool hasReadM0LdsDmaHazard() const
bool hasSMEMtoVectorWriteHazard() const
bool hasVMEMtoScalarWriteHazard() const
bool hasNSAtoVMEMBug() const
bool hasVDecCoExecHazard() const
bool hasReadM0SendMsgHazard() const
bool hasReadM0LdsDirectHazard() const
bool isXNACKEnabled() const
Definition: GCNSubtarget.h:589
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR was written by a VALU inst...
Definition: GCNSubtarget.h:478
bool hasRFEHazards() const
Definition: GCNSubtarget.h:488
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU In...
Definition: GCNSubtarget.h:484
bool isWave64() const
bool hasVALUTransUseHazard() const
bool hasLdsDirect() const
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Instructions::const_reverse_iterator const_reverse_instr_iterator
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
instr_iterator instr_end()
iterator_range< pred_iterator > predecessors()
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:544
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:327
bool isBundle() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:792
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:554
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:432
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:536
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:649
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:408
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:512
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isVINTRP(const MachineInstr &MI)
Definition: SIInstrInfo.h:784
static bool isDOT(const MachineInstr &MI)
Definition: SIInstrInfo.h:805
static bool isSWMMAC(const MachineInstr &MI)
Definition: SIInstrInfo.h:821
static bool isLDSDIR(const MachineInstr &MI)
Definition: SIInstrInfo.h:833
bool isBufferSMRD(const MachineInstr &MI) const
static bool isTRANS(const MachineInstr &MI)
Definition: SIInstrInfo.h:768
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:760
static bool isMFMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:800
static bool isFPAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:916
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:580
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:809
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:416
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:362
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:466
ProcResIter getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCSchedClassDesc * resolveSchedClass(const MachineInstr *MI) const
Return the MCSchedClassDesc for this instruction.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:109
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isGFX12Plus(const MCSubtargetInfo &STI)
IsaVersion getIsaVersion(StringRef GPU)
bool getMAIIsGFX940XDL(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool getMAIIsDGEMM(unsigned Opc)
Returns true if MAI operation is a double precision GEMM.
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double e
Definition: MathExtras.h:31
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:456
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
Instruction set architecture version.
Definition: TargetParser.h:125
Represents the counter values to wait for in an s_waitcnt instruction.
Description of the encoding of one expression Op.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:118
uint16_t ReleaseAtCycle
Cycle at which the resource will be released by an instruction, relatively to the cycle in which the ...
Definition: MCSchedule.h:68
Definition: regcomp.c:192