//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

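// A sketch of the padding arithmetic implemented in checkMFMAPadding() below:
// with -amdgpu-mfma-padding-ratio=50 and a neighboring MFMA whose pipeline
// latency is 8 wait states, s_nops are inserted until at least
// 8 * 50 / 100 = 4 wait states separate the two MFMAs.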
static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  TSchedModel.init(&ST);
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isDGEMM(unsigned Opcode) {
  return AMDGPU::getMAIIsDGEMM(Opcode);
}

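// Note: "XDL" refers to the MAI matrix pipeline; isXDL() below excludes
// DGEMM and the plain accumulator-VGPR moves, and on gfx940+ defers to the
// per-opcode XDL flag in the instruction tables.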
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

  if (!SIInstrInfo::isMAI(MI) ||
      isDGEMM(Opcode) ||
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  if (!ST.hasGFX940Insts())
    return true;

  return AMDGPU::getMAIIsGFX940XDL(Opcode);
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
}

static bool isLdsDma(const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) &&
         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

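// S_NOP's immediate encodes (Imm + 1) wait states and a single S_NOP covers
// at most 8, so larger quantities are emitted as a chain of S_NOPs.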
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

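// runOnInstruction() feeds instructions that this fixer itself inserts
// (e.g. the V_SWAP_B32s in fixShift64HighRegBug()) back through noop
// insertion and cycle advancement, so new code gets the same hazard
// treatment as the original instruction stream.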
void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  CurrCycleInstr = MI;
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };

typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;

// Search for a hazard in a block and its predecessors.
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // No need to look at parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    switch (IsHazard(State, *I)) {
    case HazardFound:
      return true;
    case HazardExpired:
      return false;
    default:
      // Continue search
      break;
    }

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }

  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
                  Visited))
      return true;
  }

  return false;
}

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Scanning stops once \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(
    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

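// The member overload below operates in two modes: in hazard recognizer mode
// it walks the MIR and CFG via the static helpers above; in scheduler mode it
// replays the EmittedInstrs history, where each nullptr entry represents one
// wait state.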
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(Unit);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &DefSet, BitVector &UseSet) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (ST.hasNoDataDepHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU Instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard()) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      const SIInstrInfo *TII = ST.getInstrInfo();
      if (SIInstrInfo::isSDWA(MI)) {
        if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
          if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
            return false;
      } else {
        if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
            !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
                  ->getImm() &
              SISrcMods::DST_OP_SEL))
          return false;
      }
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
        Register Def = Dst->getReg();

        for (const MachineOperand &Use : VALU->explicit_uses()) {
          if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded =
          std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}

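// The fix*() routines below run only in hazard recognizer mode (reached via
// PreEmitNoops()); each one detects a single hardware workaround case and
// mutates the MIR in place, e.g. by inserting waits, nops or register swaps.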
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixWMMAHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return (TII->isVOPC(MI) ||
            ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
           MI.modifiesRegister(AMDGPU::EXEC, TRI);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
        return (Decoded.DsCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  return true;
}

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and VMEM
  // instructions need to appear in the same function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |=
          SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

static bool isStoreCountWaitZero(const MachineInstr &I) {
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}

bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(I))
      return false;
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
    // Cover both WAR and WAW
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
    // Instructions which cause va_vdst==0 expire hazard
    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
           SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
  };
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) ? 1 : 0;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,
                                    IsExpiredFn, Visited, GetWaitStatesFn);

  // Transcendentals can execute in parallel to other VALUs.
  // This makes va_vdst count unusable with a mixture of VALU and TRANS.
  if (VisitedTrans)
    Count = 0;

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

  return true;
}

bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
        !SIInstrInfo::isDS(I))
      return false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
  // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
  // according to the type of VMEM instruction.
  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
           (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
  } else {
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
        .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  }

  return true;
}

bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.hasVALUPartialForwardingHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
    return false;

  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Only applies with >= 2 unique VGPR sources
  if (SrcVGPRs.size() <= 1)
    return false;

  // Look for the following pattern:
  //   Va <- VALU [PreExecPos]
  //   intv1
  //   Exec <- SALU [ExecPos]
  //   intv2
  //   Vb <- VALU [PostExecPos]
  //   intv3
  //   MI Va, Vb (WaitState = 0)
  //
  // Where:
  //   intv1 + intv2 <= 2 VALUs
  //   intv3 <= 4 VALUs
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  struct StateType {
    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();
    int VALUs = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track registers writes
    bool Changed = false;
    if (SIInstrInfo::isVALU(I)) {
      for (Register Src : SrcVGPRs) {
        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
          State.DefPos[Src] = State.VALUs;
          Changed = true;
        }
      }
    } else if (SIInstrInfo::isSALU(I)) {
      if (State.ExecPos == std::numeric_limits<int>::max()) {
        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
          State.ExecPos = State.VALUs;
          Changed = true;
        }
      }
    }

    // Early expiration: too many VALUs in intv3
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

    // Only evaluate state if something changed
    if (!Changed)
      return NoHazardFound;

    // Determine positions of VALUs pre/post exec change
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else
          PostExecPos = std::min(PostExecPos, DefVALUs);
      }
    }

    // Need a VALUs post exec change
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv3?
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv2?
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Need a VALUs pre exec change
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv1?
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv1 + intv2
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    return HazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0x0fff);

  return true;
}

bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Look for the following pattern:
  //   Va <- TRANS VALU
  //   intv
  //   MI Va (WaitState = 0)
  //
  // Where:
  //   intv <= 5 VALUs / 1 TRANS
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  struct StateType {
    int VALUs = 0;
    int TRANS = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         I.getOperand(0).getImm() == 0x0fff))
      return HazardExpired;

    // Track registers writes
    if (SIInstrInfo::isTRANS(I)) {
      for (Register Src : SrcVGPRs) {
        if (I.modifiesRegister(Src, &TRI)) {
          return HazardFound;
        }
      }
    }

    return NoHazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
    if (SIInstrInfo::isTRANS(MI))
      State.TRANS += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
  // avoided.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));

  return true;
}

bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
  if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
    if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
      return false;

    // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
    // with the dest(matrix D) of the previous wmma.
    const Register CurSrc0Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    const Register CurSrc1Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    const Register PrevDstReg =
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
      return true;
    }

    // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
    // but Index can't overlap with PrevDstReg.
    if (AMDGPU::isGFX12Plus(ST)) {
      if (SIInstrInfo::isSWMMAC(*MI)) {
        const Register CurIndex =
            TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
        if (TRI->regsOverlap(PrevDstReg, CurIndex))
          return true;
      }
      return false;
    }

    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

  return true;
}

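// Workaround for a 64-bit shift hardware bug: when the shift amount register
// is the last VGPR of an 8-VGPR allocation block (see the check below), swap
// the amount into a free VGPR with V_SWAP_B32 before the shift and back
// afterwards.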
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if this is a last VGPR in the allocation block.
  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
    return false;

  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
  bool Overlapped = OverlappedSrc || OverlappedDst;

  assert(!OverlappedDst || !OverlappedSrc ||
         Src1->getReg() == MI->getOperand(0).getReg());
  assert(ST.needsAlignedVGPRs());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  DebugLoc DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();
  // Insert a full wait count because found register might be pending a wait.
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
      .addImm(0);

  // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
  if (Overlapped)
    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
            .addDef(AmtReg - 1)
            .addReg(AmtReg - 1, RegState::Undef)
            .addReg(NewAmtLo, RegState::Undef));
  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
                       .addDef(AmtReg)
                       .addReg(AmtReg, RegState::Undef)
                       .addReg(NewAmt, RegState::Undef));

  // Instructions emitted after the current instruction will be processed by the
  // parent loop of the hazard recognizer in a natural way.
  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
          AmtReg)
      .addDef(NewAmt)
      .addReg(NewAmt)
      .addReg(AmtReg);
  if (Overlapped)
    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
            AmtReg - 1)
        .addDef(NewAmtLo)
        .addReg(NewAmtLo)
        .addReg(AmtReg - 1);

  // Re-running the hazard recognizer on the modified instruction is not
  // necessary: the inserted V_SWAP_B32s have already both read and written the
  // new registers, so hazards related to these registers have already been
  // handled.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  // We do not update liveness, so verifier may see it as undef.
  Amt->setIsUndef();
  if (OverlappedDst)
    MI->getOperand(0).setReg(NewReg);
  if (OverlappedSrc) {
    Src1->setReg(NewReg);
    Src1->setIsKill(false);
    Src1->setIsUndef();
  }

  return true;
}

1876int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1877 int NSAtoVMEMWaitStates = 1;
1878
1879 if (!ST.hasNSAtoVMEMBug())
1880 return 0;
1881
1883 return 0;
1884
1885 const SIInstrInfo *TII = ST.getInstrInfo();
1886 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1887 if (!Offset || (Offset->getImm() & 6) == 0)
1888 return 0;
1889
1890 auto IsHazardFn = [TII](const MachineInstr &I) {
1891 if (!SIInstrInfo::isMIMG(I))
1892 return false;
1893 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1894 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1895 TII->getInstSizeInBytes(I) >= 16;
1896 };
1897
1898 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1899}
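// Worked example of the offset test above (values hypothetical): the check
// (Offset->getImm() & 6) looks only at bits 1-2 of the buffer offset. An
// offset of 0x10 gives (0x10 & 6) == 0, so the bug cannot trigger and no wait
// is needed; an offset of 0x2 gives 2, so one wait state is required since
// the last gfx10 NSA MIMG instruction of 16+ bytes.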
1900
1901int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1902 int FPAtomicToDenormModeWaitStates = 3;
1903
1904 if (!ST.hasFPAtomicToDenormModeHazard())
1905 return 0;
1906 assert(!ST.hasExtendedWaitCounts());
1907
1908 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1909 return 0;
1910
1911 auto IsHazardFn = [](const MachineInstr &I) {
1912 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
1913 return false;
1914 return SIInstrInfo::isFPAtomic(I);
1915 };
1916
1917 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1918 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1919 return true;
1920
1921 switch (MI.getOpcode()) {
1922 case AMDGPU::S_WAITCNT:
1923 case AMDGPU::S_WAITCNT_VSCNT:
1924 case AMDGPU::S_WAITCNT_VMCNT:
1925 case AMDGPU::S_WAITCNT_EXPCNT:
1926 case AMDGPU::S_WAITCNT_LGKMCNT:
1927 case AMDGPU::S_WAIT_IDLE:
1928 return true;
1929 default:
1930 break;
1931 }
1932
1933 return false;
1934 };
1935
1936 return FPAtomicToDenormModeWaitStates -
1937 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1938}
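// Illustrative shape of the hazard above (instruction choice hypothetical):
//
//   flat_atomic_add_f32 v0, v[2:3], v4 glc   ; FP atomic via VMEM/FLAT
//   s_denorm_mode 12                         ; within 3 wait states -> hazard
//
// Any intervening VALU instruction or one of the s_waitcnt variants listed in
// IsExpiredFn resolves the hazard; otherwise up to 3 wait states are needed.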
1939
1940int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1941 assert(SIInstrInfo::isMFMA(*MI));
1942
1943 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1944}
1945
1946int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1947 // Early exit if no padding is requested.
1948 if (MFMAPaddingRatio == 0)
1949 return 0;
1950
1951 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1952 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
1953 return 0;
1954
1955 int NeighborMFMALatency = 0;
1956 auto IsNeighboringMFMA = [&NeighborMFMALatency,
1957 this](const MachineInstr &MI) {
1958 if (!SIInstrInfo::isMFMA(MI))
1959 return false;
1960
1961 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1962 return true;
1963 };
1964
1965 const int MaxMFMAPipelineWaitStates = 16;
1966 int WaitStatesSinceNeighborMFMA =
1967 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
1968
1969 int NeighborMFMAPaddingNeeded =
1970 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1971 WaitStatesSinceNeighborMFMA;
1972
1973 return std::max(0, NeighborMFMAPaddingNeeded);
1974}
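// Worked example of the padding formula above (numbers hypothetical): with
// -amdgpu-mfma-padding-ratio=50 and a neighboring MFMA whose pipeline latency
// is 16 passes, the target gap is 16 * 50 / 100 = 8 wait states. If 3 wait
// states have already elapsed since that MFMA, 5 more are requested;
// std::max clamps the result at 0 once enough cycles have passed.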
1975
1976int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1977 int WaitStatesNeeded = 0;
1978 unsigned Opc = MI->getOpcode();
1979
1980 auto IsVALUFn = [](const MachineInstr &MI) {
1981 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
1982 };
1983
1984 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1985 const int LegacyVALUWritesVGPRWaitStates = 2;
1986 const int VALUWritesExecWaitStates = 4;
1987 const int MaxWaitStates = 4;
1988
1989 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1990 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1991 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1992
1993 if (WaitStatesNeeded < MaxWaitStates) {
1994 for (const MachineOperand &Use : MI->explicit_uses()) {
1995 const int MaxWaitStates = 2;
1996
1997 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1998 continue;
1999
2000 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2001 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2002 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2003
2004 if (WaitStatesNeeded == MaxWaitStates)
2005 break;
2006 }
2007 }
2008 }
2009
2010 for (const MachineOperand &Op : MI->explicit_operands()) {
2011 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2012 continue;
2013
2014 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2015 continue;
2016
2017 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2018 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2019 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2020 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2021 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2022 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2023 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2024 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2025 const int MaxWaitStates = 18;
2026 Register Reg = Op.getReg();
2027 unsigned HazardDefLatency = 0;
2028
2029 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2030 this](const MachineInstr &MI) {
2031 if (!SIInstrInfo::isMFMA(MI))
2032 return false;
2033 Register DstReg = MI.getOperand(0).getReg();
2034 if (DstReg == Reg)
2035 return false;
2036 HazardDefLatency =
2037 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2038 return TRI.regsOverlap(DstReg, Reg);
2039 };
2040
2041 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2042 MaxWaitStates);
2043 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2044 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2045 int OpNo = Op.getOperandNo();
2046 if (OpNo == SrcCIdx) {
2047 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2048 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2049 switch (HazardDefLatency) {
2050 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2051 break;
2052 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2053 break;
2054 case 16: [[fallthrough]];
2055 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2056 break;
2057 }
2058 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2059 switch (HazardDefLatency) {
2060 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2061 break;
2062 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2063 break;
2064 case 16: [[fallthrough]];
2065 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2066 break;
2067 }
2068 }
2069
2070 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2071 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2072
2073 if (WaitStatesNeeded == MaxWaitStates)
2074 return WaitStatesNeeded; // Early exit.
2075
2076 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2077 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2078 return false;
2079 Register DstReg = MI.getOperand(0).getReg();
2080 return TRI.regsOverlap(Reg, DstReg);
2081 };
2082
2083 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2084 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2085 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2086 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2087 if (OpNo == SrcCIdx)
2088 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2089 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2090 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2091
2092 WaitStatesNeededForUse = NeedWaitStates -
2093 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2094 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2095
2096 if (WaitStatesNeeded == MaxWaitStates)
2097 return WaitStatesNeeded; // Early exit.
2098 }
2099
2100 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2101 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2102 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2103 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2104 const int MaxWaitStates = 13;
2105 Register DstReg = MI->getOperand(0).getReg();
2106 unsigned HazardDefLatency = 0;
2107
2108 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2109 this](const MachineInstr &MI) {
2110 if (!SIInstrInfo::isMFMA(MI))
2111 return false;
2112 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2113 HazardDefLatency =
2114 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2115 return TRI.regsOverlap(Reg, DstReg);
2116 };
2117
2118 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2119 int NeedWaitStates;
2120 switch (HazardDefLatency) {
2121 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2122 break;
2123 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2124 break;
2125 case 16: [[fallthrough]];
2126 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2127 break;
2128 }
2129
2130 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2131 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2132 }
2133
2134 // Pad neighboring MFMA with noops for better inter-wave performance.
2135 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2136
2137 return WaitStatesNeeded;
2138}
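// Worked example for the gfx908 tables above (producer choice hypothetical):
// a 16-pass (32x32) MFMA writing an AGPR that a following v_accvgpr_read
// overlaps requires MFMA32x32WritesAGPRAccVgprReadWaitStates = 18 wait
// states, while a 2-pass (4x4) producer needs only 4.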
2139
2140static int
2141 GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2142 // 2 pass -> 3
2143 // 4 pass -> 5
2144 // 8 pass -> 9
2145 // 16 pass -> 17
2146 return NumPasses + 1;
2147}
2148
2149static int
2150 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2151 // 2 pass -> 2
2152 // 4 pass -> 4
2153 // 8 pass -> 8
2154 // 16 pass -> 16
2155 return NumPasses;
2156}
2157
2158static int
2159 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2160 // 2 pass -> 4
2161 // 4 pass -> 6
2162 // 8 pass -> 10
2163 // 16 pass -> 18
2164 return NumPasses + 2;
2165}
2166
2167 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2168 // 2 pass -> 5
2169 // 4 pass -> 7
2170 // 8 pass -> 11
2171 // 16 pass -> 19
2172 return NumPasses + 3;
2173}
2174
2175int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2176 int WaitStatesNeeded = 0;
2177 unsigned Opc = MI->getOpcode();
2178
2179 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2180 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2181 };
2182
2183 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2184 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2185 !SIInstrInfo::isDOT(MI);
2186 };
2187
2188 if (!SIInstrInfo::isMFMA(*MI))
2189 return WaitStatesNeeded;
2190
2191 const int VALUWritesExecWaitStates = 4;
2192 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2193 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2194 VALUWritesExecWaitStates);
2195 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2196
2197 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2198
2199 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2200 for (const MachineOperand &Use : MI->explicit_uses()) {
2201 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2202 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2203 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2204 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2205 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2206 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2207 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2208 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2209 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2210 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2211 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2212 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2213 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2214 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2215 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2216 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2217 const int MaxWaitStates = 19;
2218
2219 if (!Use.isReg())
2220 continue;
2221 Register Reg = Use.getReg();
2222 bool FullReg;
2223 const MachineInstr *MI1;
2224
2225 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2226 this](const MachineInstr &MI) {
2227 if (!SIInstrInfo::isMFMA(MI))
2228 return false;
2229 Register DstReg = MI.getOperand(0).getReg();
2230 FullReg = (DstReg == Reg);
2231 MI1 = &MI;
2232 return TRI.regsOverlap(DstReg, Reg);
2233 };
2234
2235 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2236 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2237 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2238
2239 int NumWaitStates =
2240 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2241 if (NumWaitStates == std::numeric_limits<int>::max())
2242 continue;
2243
2244 int OpNo = Use.getOperandNo();
2245 unsigned Opc1 = MI1->getOpcode();
2246 int NeedWaitStates = 0;
2247 if (OpNo == SrcCIdx) {
2248 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2249 NeedWaitStates = 0;
2250 } else if (FullReg) {
2251 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2252 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2253 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2254 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2255 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2256 else if (ST.hasGFX940Insts() &&
2257 TSchedModel.computeInstrLatency(MI1) == 2)
2258 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2259 } else {
2260 switch (Opc1) {
2261 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2262 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2263 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2264 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2265 if (!isXDL(ST, *MI))
2266 NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2267 break;
2268 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2269 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2270 if (!isXDL(ST, *MI))
2271 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2272 break;
2273 default:
2274 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2275 if (ST.hasGFX940Insts()) {
2276 if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
2277 break;
2278
2279 NeedWaitStates =
2280 isXDL(ST, *MI1)
2281 ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2282 NumPasses)
2283 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2284 NumPasses);
2285 break;
2286 }
2287
2288 switch (NumPasses) {
2289 case 2:
2290 NeedWaitStates =
2291 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2292 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2293 break;
2294 case 8:
2295 NeedWaitStates =
2296 isDGEMM(Opc)
2297 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2298 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2299 break;
2300 case 16:
2301 NeedWaitStates =
2302 isDGEMM(Opc)
2303 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2304 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2305 break;
2306 default:
2307 llvm_unreachable("unexpected number of passes");
2308 }
2309 }
2310 }
2311 } else {
2312 switch (Opc1) {
2313 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2314 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2315 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2316 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2317 NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2318 break;
2319 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2320 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2321 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2322 break;
2323 default:
2324 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2325
2326 if (ST.hasGFX940Insts()) {
2327 NeedWaitStates =
2328 isXDL(ST, *MI1)
2329 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2330 NumPasses)
2331 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2332 NumPasses);
2333 break;
2334 }
2335
2336 switch (NumPasses) {
2337 case 2:
2338 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2339 break;
2340 case 4:
2341 llvm_unreachable("unexpected number of passes for mfma");
2342 case 8:
2343 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2344 break;
2345 case 16:
2346 default:
2347 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2348 }
2349 }
2350 }
2351 if (WaitStatesNeeded >= NeedWaitStates)
2352 continue;
2353
2354 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2355 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2356
2357 if (WaitStatesNeeded == MaxWaitStates)
2358 break;
2359 }
2360
2361 // Pad neighboring MFMA with noops for better inter-wave performance.
2362 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2363
2364 return WaitStatesNeeded;
2365}
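// Worked example for the gfx940 path above (producer choice hypothetical):
// an 8-pass XDL MFMA writing a VGPR that a later MFMA reads as src A/B needs
// GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(8) = 11 wait states;
// a non-XDL SMFMA producer of the same width needs only 8 + 2 = 10.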
2366
2367int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2368 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2369 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2370 return 0;
2371
2372 int WaitStatesNeeded = 0;
2373
2374 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2375 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2376 };
2377
2378 for (const MachineOperand &Op : MI->explicit_uses()) {
2379 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2380 continue;
2381
2382 Register Reg = Op.getReg();
2383
2384 const int AccVgprReadLdStWaitStates = 2;
2385 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2386 const int MaxWaitStates = 2;
2387
2388 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2389 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2390 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2391
2392 if (WaitStatesNeeded == MaxWaitStates)
2393 return WaitStatesNeeded; // Early exit.
2394
2395 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2396 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2397 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2398 return false;
2399 auto IsVALUFn = [](const MachineInstr &MI) {
2400 return SIInstrInfo::isVALU(MI);
2401 };
2402 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2403 std::numeric_limits<int>::max();
2404 };
2405
2406 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2407 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2408 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2409 }
2410
2411 return WaitStatesNeeded;
2412}
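// Illustrative gfx908 sequence for the check above (registers hypothetical):
//
//   v_accvgpr_read_b32 v0, a0   ; AGPR -> VGPR copy
//   ds_write_b32 v1, v0         ; load/store consumes v0
//
// AccVgprReadLdStWaitStates (2) minus the observed distance between the two
// instructions gives the number of wait states still to be inserted.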
2413
2414 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2415 // 2 pass -> 4
2416 // 4 pass -> 6
2417 // 8 pass -> 10
2418 // 16 pass -> 18
2419 return NumPasses + 2;
2420}
2421
2422 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2423 // 2 pass -> 5
2424 // 4 pass -> 7
2425 // 8 pass -> 11
2426 // 16 pass -> 19
2427 return NumPasses + 3;
2428}
2429
2430 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2431 // 2 pass -> 5
2432 // 4 pass -> 7
2433 // 8 pass -> 11
2434 // 16 pass -> 19
2435 return NumPasses + 3;
2436}
2437
2438 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2439 // 2 pass -> 4
2440 // 4 pass -> 6
2441 // 8 pass -> 10
2442 // 16 pass -> 18
2443 return NumPasses + 2;
2444}
2445
2446int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2447 if (!ST.hasGFX90AInsts())
2448 return 0;
2449
2450 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2451 return isDGEMM(MI.getOpcode());
2452 };
2453
2454 // This is checked in checkMAIHazards90A()
2455 if (SIInstrInfo::isMFMA(*MI))
2456 return 0;
2457
2458 const MachineRegisterInfo &MRI = MF.getRegInfo();
2459
2460 int WaitStatesNeeded = 0;
2461
2462 bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2463 SIInstrInfo::isFLAT(*MI) ||
2464 SIInstrInfo::isDS(*MI);
2465 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2466 bool IsVALU = SIInstrInfo::isVALU(*MI);
2467
2468 const MachineInstr *MFMA = nullptr;
2469 unsigned Reg;
2470 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2471 if (!SIInstrInfo::isMFMA(MI) ||
2472 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2473 return false;
2474 MFMA = &MI;
2475 return true;
2476 };
2477
2478 const MachineInstr *DOT = nullptr;
2479 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2480 if (!SIInstrInfo::isDOT(MI) ||
2481 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2482 return false;
2483 DOT = &MI;
2484 return true;
2485 };
2486
2487 bool DGEMMAfterVALUWrite = false;
2488 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2489 // Found DGEMM on reverse traversal to def.
2490 if (isDGEMM(MI.getOpcode()))
2491 DGEMMAfterVALUWrite = true;
2492
2493 // Only a hazard if the register is defined by a VALU and a DGEMM is
2494 // found after the def.
2495 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2496 return false;
2497
2498 return true;
2499 };
2500
2501 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2502 AMDGPU::OpName::src2);
2503
2504 if (IsMemOrExport || IsVALU) {
2505 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2506 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2507 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2508 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2509 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2510 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2511 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2512 const int DotWriteSameDotReadSrcAB = 3;
2513 const int DotWriteDifferentVALURead = 3;
2514 const int DMFMABetweenVALUWriteVMEMRead = 2;
2515 const int MaxWaitStates = 19;
2516
2517 for (const MachineOperand &Use : MI->explicit_uses()) {
2518 if (!Use.isReg())
2519 continue;
2520 Reg = Use.getReg();
2521
2522 DOT = nullptr;
2523 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2524 MaxWaitStates);
2525 if (DOT) {
2526 int NeedWaitStates = 0;
2527 if (DOT->getOpcode() == MI->getOpcode()) {
2528 if (&Use - &MI->getOperand(0) != SrcCIdx)
2529 NeedWaitStates = DotWriteSameDotReadSrcAB;
2530 } else {
2531 NeedWaitStates = DotWriteDifferentVALURead;
2532 }
2533
2534 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2535 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2536 }
2537
2538 // Workaround for a HW data hazard bug observed only on GFX90A. When a
2539 // DGEMM instruction sits in between a VALU and a VMEM instruction, the SQ
2540 // incorrectly fails to insert the two wait states between those two
2541 // instructions that are needed to avoid the data hazard.
2542 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2543 DGEMMAfterVALUWrite = false;
2544 if (TRI.isVectorRegister(MRI, Reg)) {
2545 int WaitStatesNeededForUse =
2546 DMFMABetweenVALUWriteVMEMRead -
2547 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2548 DMFMABetweenVALUWriteVMEMRead);
2549
2550 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2551 }
2552 }
2553
2554 MFMA = nullptr;
2555 WaitStatesSinceDef =
2556 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2557 if (!MFMA)
2558 continue;
2559
2560 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2561 int NumPasses = HazardDefLatency;
2562 int NeedWaitStates = MaxWaitStates;
2563
2564 if (isDGEMM(MFMA->getOpcode())) {
2565 switch (HazardDefLatency) {
2566 case 4:
2567 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2568 : DMFMA4x4WriteVgprVALUReadWaitStates;
2569 break;
2570 case 8:
2571 case 16:
2572 NeedWaitStates = IsMemOrExport
2573 ? DMFMA16x16WriteVgprMemExpReadWaitStates
2574 : DMFMA16x16WriteVgprVALUReadWaitStates;
2575 break;
2576 default:
2577 llvm_unreachable("unexpected dgemm");
2578 }
2579 } else if (ST.hasGFX940Insts()) {
2580 NeedWaitStates =
2581 isXDL(ST, *MFMA)
2582 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2583 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2584 NumPasses);
2585 } else {
2586 switch (HazardDefLatency) {
2587 case 2:
2588 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2589 break;
2590 case 8:
2591 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2592 break;
2593 case 16:
2594 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2595 break;
2596 default:
2597 llvm_unreachable("unexpected number of passes for mfma");
2598 }
2599 }
2600
2601 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2602 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2603
2604 if (WaitStatesNeeded == MaxWaitStates)
2605 break;
2606 }
2607 }
2608
2609 unsigned Opc = MI->getOpcode();
2610 const int DMFMAToFMA64WaitStates = 2;
2611 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2612 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2613 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2614 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2615 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2616 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2617 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2618 }
2619
2620 if (!IsVALU && !IsMemOrExport)
2621 return WaitStatesNeeded;
2622
2623 for (const MachineOperand &Def : MI->defs()) {
2624 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2625 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2626 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2627 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2628 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2629 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2630 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2631 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2632 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2633 const int DotWriteDifferentVALUWrite = 3;
2634 const int MaxWaitStates = 19;
2635 const int MaxWarWaitStates = 15;
2636
2637 Reg = Def.getReg();
2638
2639 DOT = nullptr;
2640 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2641 MaxWaitStates);
2642 if (DOT && DOT->getOpcode() != MI->getOpcode())
2643 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2644 WaitStatesSinceDef);
2645
2646 MFMA = nullptr;
2647 WaitStatesSinceDef =
2648 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2649 if (MFMA) {
2650 int NeedWaitStates = MaxWaitStates;
2651 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2652
2653 if (isDGEMM(MFMA->getOpcode())) {
2654 switch (NumPasses) {
2655 case 4:
2656 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2657 break;
2658 case 8:
2659 case 16:
2660 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2661 break;
2662 default:
2663 llvm_unreachable("unexpected number of cycles for dgemm");
2664 }
2665 } else if (ST.hasGFX940Insts()) {
2666 NeedWaitStates =
2667 isXDL(ST, *MFMA)
2668 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2669 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2670 } else {
2671 switch (NumPasses) {
2672 case 2:
2673 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2674 break;
2675 case 8:
2676 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2677 break;
2678 case 16:
2679 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2680 break;
2681 default:
2682 llvm_unreachable("Unexpected number of passes for mfma");
2683 }
2684 }
2685
2686 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2687 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2688
2689 if (WaitStatesNeeded == MaxWaitStates)
2690 break;
2691 }
2692
2693 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2694 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2695 !MI.readsRegister(Reg, &TRI))
2696 return false;
2697
2698 if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2699 return false;
2700
2701 const MachineOperand *SrcC =
2702 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2703 assert(SrcC);
2704 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2705 return false;
2706
2707 MFMA = &MI;
2708 return true;
2709 };
2710
2711 MFMA = nullptr;
2712 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2713 MaxWarWaitStates);
2714 if (!MFMA)
2715 continue;
2716
2717 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2718 int NeedWaitStates = MaxWaitStates;
2719 switch (HazardDefLatency) {
2720 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2721 break;
2722 case 4: assert(ST.hasGFX940Insts());
2723 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2724 break;
2725 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2726 break;
2727 case 16: [[fallthrough]];
2728 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2729 break;
2730 }
2731
2732 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2733 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2734 }
2735
2736 return WaitStatesNeeded;
2737}
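// Worked WAR example for the final loop above (widths hypothetical): if a
// 2-pass SMFMA reads a VGPR as src C and a later VALU instruction writes that
// VGPR, SMFMA4x4ReadVgprVALUWarWaitStates requires 1 wait state between them;
// for a 16-pass producer the requirement grows to
// SMFMA32x32ReadVgprVALUWarWaitStates = 15.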
2738
2739 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2740 if (!SU->isInstr())
2741 return false;
2742
2743 const MachineInstr *MAI = nullptr;
2744
2745 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2746 MAI = nullptr;
2747 if (SIInstrInfo::isMFMA(MI))
2748 MAI = &MI;
2749 return MAI != nullptr;
2750 };
2751
2752 MachineInstr *MI = SU->getInstr();
2753 if (IsMFMAFn(*MI)) {
2754 int W = getWaitStatesSince(IsMFMAFn, 16);
2755 if (MAI)
2756 return W < (int)TSchedModel.computeInstrLatency(MAI);
2757 }
2758
2759 return false;
2760}
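// A possible reading of ShouldPreferAnother above (numbers hypothetical): if
// the candidate SU is an MFMA and the previous MFMA, issued 10 wait states
// earlier, has a 16-cycle latency, the callback returns true (10 < 16),
// hinting the scheduler to pick a non-MFMA instruction that can hide the
// remaining pipeline latency rather than issuing back-to-back MFMAs.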
2761
2762bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2763 if (!ST.hasVALUMaskWriteHazard())
2764 return false;
2765 assert(!ST.hasExtendedWaitCounts());
2766
2767 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
2768 return false;
2769
2770 // The hazard sequence is three instructions:
2771 // 1. VALU reads SGPR as mask
2772 // 2. SALU writes SGPR
2773 // 3. SALU reads SGPR
2774 // The hazard can expire if the distance between 2 and 3 is sufficient.
2775 // In practice this happens <10% of the time, hence this always assumes
2776 // the hazard exists if 1 and 2 are present to avoid searching.
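// An illustrative instance of this sequence (registers hypothetical):
//   v_cndmask_b32_e64 v0, v1, v2, s[4:5]   ; 1. VALU reads s[4:5] as mask
//   s_mov_b64 s[4:5], exec                 ; 2. SALU writes s[4:5] (this MI)
//   s_and_b64 s[6:7], s[4:5], s[8:9]       ; 3. SALU reads s[4:5]
// The fix below appends "s_waitcnt_depctr sa_sdst(0)" after instruction 2.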
2777
2778 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2779 if (!SDSTOp || !SDSTOp->isReg())
2780 return false;
2781
2782 const Register HazardReg = SDSTOp->getReg();
2783 if (HazardReg == AMDGPU::EXEC ||
2784 HazardReg == AMDGPU::EXEC_LO ||
2785 HazardReg == AMDGPU::EXEC_HI ||
2786 HazardReg == AMDGPU::M0)
2787 return false;
2788
2789 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2790 switch (I.getOpcode()) {
2791 case AMDGPU::V_ADDC_U32_e32:
2792 case AMDGPU::V_ADDC_U32_dpp:
2793 case AMDGPU::V_CNDMASK_B16_e32:
2794 case AMDGPU::V_CNDMASK_B16_dpp:
2795 case AMDGPU::V_CNDMASK_B32_e32:
2796 case AMDGPU::V_CNDMASK_B32_dpp:
2797 case AMDGPU::V_DIV_FMAS_F32_e64:
2798 case AMDGPU::V_DIV_FMAS_F64_e64:
2799 case AMDGPU::V_SUBB_U32_e32:
2800 case AMDGPU::V_SUBB_U32_dpp:
2801 case AMDGPU::V_SUBBREV_U32_e32:
2802 case AMDGPU::V_SUBBREV_U32_dpp:
2803 // These implicitly read VCC as mask source.
2804 return HazardReg == AMDGPU::VCC ||
2805 HazardReg == AMDGPU::VCC_LO ||
2806 HazardReg == AMDGPU::VCC_HI;
2807 case AMDGPU::V_ADDC_U32_e64:
2808 case AMDGPU::V_ADDC_U32_e64_dpp:
2809 case AMDGPU::V_CNDMASK_B16_e64:
2810 case AMDGPU::V_CNDMASK_B16_e64_dpp:
2811 case AMDGPU::V_CNDMASK_B32_e64:
2812 case AMDGPU::V_CNDMASK_B32_e64_dpp:
2813 case AMDGPU::V_SUBB_U32_e64:
2814 case AMDGPU::V_SUBB_U32_e64_dpp:
2815 case AMDGPU::V_SUBBREV_U32_e64:
2816 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2817 // Only check mask register overlaps.
2818 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2819 assert(SSRCOp);
2820 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2821 }
2822 default:
2823 return false;
2824 }
2825 };
2826
2827 const MachineRegisterInfo &MRI = MF.getRegInfo();
2828 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2829 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2830 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2831 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2832 return true;
2833
2834 // VALU access to any SGPR or literal constant other than HazardReg
2835 // mitigates hazard. No need to check HazardReg here as this will
2836 // only be called when !IsHazardFn.
2837 if (!SIInstrInfo::isVALU(I))
2838 return false;
2839 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2840 const MachineOperand &Op = I.getOperand(OpNo);
2841 if (Op.isReg()) {
2842 Register OpReg = Op.getReg();
2843 // Only consider uses
2844 if (!Op.isUse())
2845 continue;
2846 // Ignore EXEC
2847 if (OpReg == AMDGPU::EXEC ||
2848 OpReg == AMDGPU::EXEC_LO ||
2849 OpReg == AMDGPU::EXEC_HI)
2850 continue;
2851 // Ignore all implicit uses except VCC
2852 if (Op.isImplicit()) {
2853 if (OpReg == AMDGPU::VCC ||
2854 OpReg == AMDGPU::VCC_LO ||
2855 OpReg == AMDGPU::VCC_HI)
2856 return true;
2857 continue;
2858 }
2859 if (TRI.isSGPRReg(MRI, OpReg))
2860 return true;
2861 } else {
2862 const MCInstrDesc &InstDesc = I.getDesc();
2863 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2864 if (!TII.isInlineConstant(Op, OpInfo))
2865 return true;
2866 }
2867 }
2868 return false;
2869 };
2870
2871 // Check for hazard
2872 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2873 std::numeric_limits<int>::max())
2874 return false;
2875
2876 auto NextMI = std::next(MI->getIterator());
2877
2878 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2879 BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2880 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2881 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2882
2883 // SALU write may be s_getpc in a bundle.
2884 if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2885 // Update offsets of any references in the bundle.
2886 while (NextMI != MI->getParent()->end() &&
2887 NextMI->isBundledWithPred()) {
2888 for (auto &Operand : NextMI->operands()) {
2889 if (Operand.isGlobal())
2890 Operand.setOffset(Operand.getOffset() + 4);
2891 }
2892 NextMI++;
2893 }
2894 }
2895
2896 return true;
2897}