GCNHazardRecognizer.cpp
1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
21
22using namespace llvm;
23
24namespace {
25
26struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
27 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
28
29 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
30 if (Arg.getAsInteger(0, Value))
31 return O.error("'" + Arg + "' value invalid for uint argument!");
32
33 if (Value > 100)
34 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
35
36 return false;
37 }
38};
39
40} // end anonymous namespace
41
42static cl::opt<unsigned, false, MFMAPaddingRatioParser>
43 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
44 cl::desc("Fill a percentage of the latency between "
45 "neighboring MFMA with s_nops."));
46
47// This is intended for debugging purposes only.
48static cl::opt<unsigned>
49 NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
50 cl::desc("Insert a s_nop x before every instruction"));
51
52//===----------------------------------------------------------------------===//
53// Hazard Recognizer Implementation
54//===----------------------------------------------------------------------===//
55
56static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
57 const GCNSubtarget &ST);
58
59GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
60 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
64 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
65 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
66}
67
68void GCNHazardRecognizer::Reset() {
69 EmittedInstrs.clear();
70}
71
72void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
73 EmitInstruction(SU->getInstr());
74}
75
76void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
77 CurrCycleInstr = MI;
78}
79
80static bool isDivFMas(unsigned Opcode) {
81 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
82}
83
84static bool isSGetReg(unsigned Opcode) {
85 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
86}
87
88static bool isSSetReg(unsigned Opcode) {
89 switch (Opcode) {
90 case AMDGPU::S_SETREG_B32:
91 case AMDGPU::S_SETREG_B32_mode:
92 case AMDGPU::S_SETREG_IMM32_B32:
93 case AMDGPU::S_SETREG_IMM32_B32_mode:
94 return true;
95 }
96 return false;
97}
98
99static bool isRWLane(unsigned Opcode) {
100 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
101}
102
103static bool isRFE(unsigned Opcode) {
104 return Opcode == AMDGPU::S_RFE_B64;
105}
106
107static bool isSMovRel(unsigned Opcode) {
108 switch (Opcode) {
109 case AMDGPU::S_MOVRELS_B32:
110 case AMDGPU::S_MOVRELS_B64:
111 case AMDGPU::S_MOVRELD_B32:
112 case AMDGPU::S_MOVRELD_B64:
113 return true;
114 default:
115 return false;
116 }
117}
118
119static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
120 const MachineInstr &MI) {
121 if (TII.isAlwaysGDS(MI.getOpcode()))
122 return true;
123
124 switch (MI.getOpcode()) {
125 case AMDGPU::S_SENDMSG:
126 case AMDGPU::S_SENDMSGHALT:
127 case AMDGPU::S_TTRACEDATA:
128 return true;
129 // These DS opcodes don't support GDS.
130 case AMDGPU::DS_NOP:
131 case AMDGPU::DS_PERMUTE_B32:
132 case AMDGPU::DS_BPERMUTE_B32:
133 return false;
134 default:
135 if (TII.isDS(MI.getOpcode())) {
136 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
137 AMDGPU::OpName::gds);
138 if (MI.getOperand(GDS).getImm())
139 return true;
140 }
141 return false;
142 }
143}
144
145static bool isPermlane(const MachineInstr &MI) {
146 unsigned Opcode = MI.getOpcode();
147 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
148 Opcode == AMDGPU::V_PERMLANE64_B32 ||
149 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
150 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
151 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
152 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
153 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
154 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
155 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
156 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
157 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
158 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
159 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
160 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
161}
162
163static bool isLdsDma(const MachineInstr &MI) {
164 return SIInstrInfo::isVALU(MI) &&
165 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
166}
167
168static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
169 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
170 AMDGPU::OpName::simm16);
171 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
172}
173
174ScheduleHazardRecognizer::HazardType
175GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
176 MachineInstr *MI = SU->getInstr();
177 // If we are not in "HazardRecognizerMode" and therefore not being run from
178 // the scheduler, track possible stalls from hazards but don't insert noops.
179 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
180
181 if (MI->isBundle())
182 return NoHazard;
183
184 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
185 return HazardType;
186
187 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
188 return HazardType;
189
190 if (checkFPAtomicToDenormModeHazard(MI) > 0)
191 return HazardType;
192
193 if (ST.hasNoDataDepHazard())
194 return NoHazard;
195
196 if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
197 return HazardType;
198
199 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
200 return HazardType;
201
202 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
203 return HazardType;
204
205 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
206 return HazardType;
207
208 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
209 return HazardType;
210
213 checkMAIVALUHazards(MI) > 0)
214 return HazardType;
215
216 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
217 return HazardType;
218
219 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
220 return HazardType;
221
222 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
223 return HazardType;
224
225 if (((ST.hasReadM0MovRelInterpHazard() &&
226 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
227 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
228 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
229 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
230 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
231 (ST.hasReadM0LdsDirectHazard() &&
232 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
233 checkReadM0Hazards(MI) > 0)
234 return HazardType;
235
236 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
237 return HazardType;
238
240 checkMAILdStHazards(MI) > 0)
241 return HazardType;
242
243 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
244 return HazardType;
245
246 return NoHazard;
247}
248
249static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
250 unsigned Quantity) {
251 while (Quantity > 0) {
252 unsigned Arg = std::min(Quantity, 8u);
253 Quantity -= Arg;
254 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
255 .addImm(Arg - 1);
256 }
257}
258
259unsigned
260GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
261 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
262 assert(TSchedModel.getWriteProcResBegin(SC) !=
263 TSchedModel.getWriteProcResEnd(SC));
264 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
265}
266
267void GCNHazardRecognizer::processBundle() {
268 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
269 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
270 // Check bundled MachineInstr's for hazards.
271 for (; MI != E && MI->isInsideBundle(); ++MI) {
272 CurrCycleInstr = &*MI;
273 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
274
275 if (IsHazardRecognizerMode) {
276 fixHazards(CurrCycleInstr);
277
278 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
279 }
280
281 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
282 // include the bundled MI directly after, only add a maximum of
283 // (MaxLookAhead - 1) noops to EmittedInstrs.
284 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
285 EmittedInstrs.push_front(nullptr);
286
287 EmittedInstrs.push_front(CurrCycleInstr);
288 EmittedInstrs.resize(MaxLookAhead);
289 }
290 CurrCycleInstr = nullptr;
291}
292
293void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
294 assert(IsHazardRecognizerMode);
295
296 unsigned NumPreNoops = PreEmitNoops(MI);
297 EmitNoops(NumPreNoops);
298 if (MI->isInsideBundle())
299 insertNoopsInBundle(MI, TII, NumPreNoops);
300 else
301 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
302 NumPreNoops);
304 AdvanceCycle();
305}
306
307unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
308 IsHazardRecognizerMode = true;
309 CurrCycleInstr = MI;
310 unsigned W = PreEmitNoopsCommon(MI);
311 fixHazards(MI);
312 CurrCycleInstr = nullptr;
313 return std::max(W, NopPadding.getValue());
314}
315
316unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
317 if (MI->isBundle())
318 return 0;
319
320 int WaitStates = 0;
321
322 if (SIInstrInfo::isSMRD(*MI))
323 return std::max(WaitStates, checkSMRDHazards(MI));
324
325 if (ST.hasNSAtoVMEMBug())
326 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
327
328 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
329
330 if (ST.hasNoDataDepHazard())
331 return WaitStates;
332
333 if (SIInstrInfo::isVMEM(*MI))
334 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
335
336 if (SIInstrInfo::isVALU(*MI))
337 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
338
339 if (SIInstrInfo::isDPP(*MI))
340 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
341
342 if (isDivFMas(MI->getOpcode()))
343 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
344
345 if (isRWLane(MI->getOpcode()))
346 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
347
350 checkMAIVALUHazards(MI) > 0)
351 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
352
353 if (MI->isInlineAsm())
354 return std::max(WaitStates, checkInlineAsmHazards(MI));
355
356 if (isSGetReg(MI->getOpcode()))
357 return std::max(WaitStates, checkGetRegHazards(MI));
358
359 if (isSSetReg(MI->getOpcode()))
360 return std::max(WaitStates, checkSetRegHazards(MI));
361
362 if (isRFE(MI->getOpcode()))
363 return std::max(WaitStates, checkRFEHazards(MI));
364
365 if ((ST.hasReadM0MovRelInterpHazard() &&
366 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
367 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
368 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
369 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
370 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
371 (ST.hasReadM0LdsDirectHazard() &&
372 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
373 return std::max(WaitStates, checkReadM0Hazards(MI));
374
375 if (SIInstrInfo::isMAI(*MI))
376 return std::max(WaitStates, checkMAIHazards(MI));
377
379 return std::max(WaitStates, checkMAILdStHazards(MI));
380
381 if (ST.hasGFX950Insts() && isPermlane(*MI))
382 return std::max(WaitStates, checkPermlaneHazards(MI));
383
384 return WaitStates;
385}
386
387void GCNHazardRecognizer::EmitNoop() {
388 EmittedInstrs.push_front(nullptr);
389}
390
391void GCNHazardRecognizer::AdvanceCycle() {
392 // When the scheduler detects a stall, it will call AdvanceCycle() without
393 // emitting any instructions.
394 if (!CurrCycleInstr) {
395 EmittedInstrs.push_front(nullptr);
396 return;
397 }
398
399 if (CurrCycleInstr->isBundle()) {
400 processBundle();
401 return;
402 }
403
404 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
405 if (!NumWaitStates) {
406 CurrCycleInstr = nullptr;
407 return;
408 }
409
410 // Keep track of emitted instructions
411 EmittedInstrs.push_front(CurrCycleInstr);
412
413 // Add a nullptr for each additional wait state after the first. Make sure
414 // not to add more than getMaxLookAhead() items to the list, since we
415 // truncate the list to that size right after this loop.
416 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
417 i < e; ++i) {
418 EmittedInstrs.push_front(nullptr);
419 }
420
421 // getMaxLookahead() is the largest number of wait states we will ever need
422 // to insert, so there is no point in keeping track of more than that many
423 // wait states.
424 EmittedInstrs.resize(getMaxLookAhead());
425
426 CurrCycleInstr = nullptr;
427}
428
429void GCNHazardRecognizer::RecedeCycle() {
430 assert(!IsHazardRecognizerMode &&
431 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
432}
433
434//===----------------------------------------------------------------------===//
435// Helper Functions
436//===----------------------------------------------------------------------===//
437
438using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
439
440using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
441using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
442
443// Search for a hazard in a block and its predecessors.
444template <typename StateT>
445static bool
446hasHazard(StateT InitialState,
447 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
448 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
449 const MachineBasicBlock *InitialMBB,
450 MachineBasicBlock::const_reverse_instr_iterator InitialI) {
451 struct StateMapKey {
452 SmallVectorImpl<StateT> *States;
453 unsigned Idx;
454 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
455 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
456 }
457 };
458 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
459 static inline StateMapKey getEmptyKey() {
460 return {static_cast<SmallVectorImpl<StateT> *>(
461 DenseMapInfo<void *>::getEmptyKey()),
462 DenseMapInfo<unsigned>::getEmptyKey()};
463 }
464 static inline StateMapKey getTombstoneKey() {
465 return {static_cast<SmallVectorImpl<StateT> *>(
466 DenseMapInfo<void *>::getTombstoneKey()),
467 DenseMapInfo<unsigned>::getTombstoneKey()};
468 }
469 static unsigned getHashValue(const StateMapKey &Key) {
470 return StateT::getHashValue((*Key.States)[Key.Idx]);
471 }
472 static unsigned getHashValue(const StateT &State) {
473 return StateT::getHashValue(State);
474 }
475 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
476 const auto EKey = getEmptyKey();
477 const auto TKey = getTombstoneKey();
478 if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
479 StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
480 return StateMapKey::isEqual(LHS, RHS);
481 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
482 }
483 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
484 if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
485 StateMapKey::isEqual(RHS, getTombstoneKey()))
486 return false;
487 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
488 }
489 };
490
493
495 const MachineBasicBlock *MBB = InitialMBB;
496 StateT State = InitialState;
497
499 unsigned WorkIdx = 0;
500 for (;;) {
501 bool Expired = false;
502 for (auto E = MBB->instr_rend(); I != E; ++I) {
503 // No need to look at parent BUNDLE instructions.
504 if (I->isBundle())
505 continue;
506
507 auto Result = IsHazard(State, *I);
508 if (Result == HazardFound)
509 return true;
510 if (Result == HazardExpired) {
511 Expired = true;
512 break;
513 }
514
515 if (I->isInlineAsm() || I->isMetaInstruction())
516 continue;
517
518 UpdateState(State, *I);
519 }
520
521 if (!Expired) {
522 unsigned StateIdx = States.size();
523 StateMapKey Key = {&States, StateIdx};
524 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
525 if (Insertion.second) {
526 States.emplace_back(State);
527 } else {
528 StateIdx = Insertion.first->second;
529 }
530 for (MachineBasicBlock *Pred : MBB->predecessors())
531 Worklist.insert(std::pair(Pred, StateIdx));
532 }
533
534 if (WorkIdx == Worklist.size())
535 break;
536
537 unsigned StateIdx;
538 std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
539 State = States[StateIdx];
540 I = MBB->instr_rbegin();
541 }
542
543 return false;
544}
545
546// Returns a minimum wait states since \p I walking all predecessors.
547// Only scans until \p IsExpired does not return true.
548// Can only be run in a hazard recognizer mode.
549static int getWaitStatesSince(
550 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
551 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
552 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
553 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
554 for (auto E = MBB->instr_rend(); I != E; ++I) {
555 // Don't add WaitStates for parent BUNDLE instructions.
556 if (I->isBundle())
557 continue;
558
559 if (IsHazard(*I))
560 return WaitStates;
561
562 if (I->isInlineAsm())
563 continue;
564
565 WaitStates += GetNumWaitStates(*I);
566
567 if (IsExpired(*I, WaitStates))
568 return std::numeric_limits<int>::max();
569 }
570
571 int MinWaitStates = std::numeric_limits<int>::max();
572 for (MachineBasicBlock *Pred : MBB->predecessors()) {
573 if (!Visited.insert(Pred).second)
574 continue;
575
576 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
577 IsExpired, Visited, GetNumWaitStates);
578
579 MinWaitStates = std::min(MinWaitStates, W);
580 }
581
582 return MinWaitStates;
583}
584
585static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
586 const MachineInstr *MI, IsExpiredFn IsExpired) {
587 DenseSet<const MachineBasicBlock *> Visited;
588 return getWaitStatesSince(IsHazard, MI->getParent(),
589 std::next(MI->getReverseIterator()), 0, IsExpired,
590 Visited);
591}
592
593int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
594 if (IsHazardRecognizerMode) {
595 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
596 return WaitStates >= Limit;
597 };
598 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
599 }
600
601 int WaitStates = 0;
602 for (MachineInstr *MI : EmittedInstrs) {
603 if (MI) {
604 if (IsHazard(*MI))
605 return WaitStates;
606
607 if (MI->isInlineAsm())
608 continue;
609 }
610 ++WaitStates;
611
612 if (WaitStates >= Limit)
613 break;
614 }
615 return std::numeric_limits<int>::max();
616}
617
618int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
619 IsHazardFn IsHazardDef,
620 int Limit) {
621 const SIRegisterInfo *TRI = ST.getRegisterInfo();
622
623 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
624 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
625 };
626
627 return getWaitStatesSince(IsHazardFn, Limit);
628}
629
630int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
631 int Limit) {
632 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
633 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
634 };
635
636 return getWaitStatesSince(IsHazardFn, Limit);
637}
638
639//===----------------------------------------------------------------------===//
640// No-op Hazard Detection
641//===----------------------------------------------------------------------===//
642
643static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
644 MCRegister Reg) {
645 for (MCRegUnit Unit : TRI.regunits(Reg))
646 BV.set(Unit);
647}
648
649static void addRegsToSet(const SIRegisterInfo &TRI,
650 iterator_range<MachineInstr::const_mop_iterator> Ops,
651 BitVector &DefSet, BitVector &UseSet) {
652 for (const MachineOperand &Op : Ops) {
653 if (Op.isReg())
654 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
655 }
656}
657
658void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
659 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
660}
661
662static bool breaksSMEMSoftClause(MachineInstr *MI) {
663 return !SIInstrInfo::isSMRD(*MI);
664}
665
666static bool breaksVMEMSoftClause(MachineInstr *MI) {
667 return !SIInstrInfo::isVMEM(*MI);
668}
669
670int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
671 // SMEM soft clause are only present on VI+, and only matter if xnack is
672 // enabled.
673 if (!ST.isXNACKEnabled())
674 return 0;
675
676 bool IsSMRD = TII.isSMRD(*MEM);
677
678 resetClause();
679
680 // A soft-clause is any group of consecutive SMEM instructions. The
681 // instructions in this group may return out of order and/or may be
682 // replayed (i.e. the same instruction issued more than once).
683 //
684 // In order to handle these situations correctly we need to make sure that
685 // when a clause has more than one instruction, no instruction in the clause
686 // writes to a register that is read by another instruction in the clause
687 // (including itself). If we encounter this situation, we need to break the
688 // clause by inserting a non SMEM instruction.
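 // Illustrative example of a clause that must be broken (sketch only):
 //   s_load_dwordx2 s[0:1], s[4:5], 0x0   ; defines s[0:1]
 //   s_load_dword   s2, s[0:1], 0x10      ; uses s[0:1] within the same clause
 // The second load reads a register written by the first, so a non-SMEM
 // instruction has to be inserted between them when XNACK is enabled.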
689
690 for (MachineInstr *MI : EmittedInstrs) {
691 // When we hit a non-SMEM instruction then we have passed the start of the
692 // clause and we can stop.
693 if (!MI)
694 break;
695
696 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
697 break;
698
699 addClauseInst(*MI);
700 }
701
702 if (ClauseDefs.none())
703 return 0;
704
705 // We need to make sure not to put loads and stores in the same clause if they
706 // use the same address. For now, just start a new clause whenever we see a
707 // store.
708 if (MEM->mayStore())
709 return 1;
710
711 addClauseInst(*MEM);
712
713 // If the set of defs and uses intersect then we cannot add this instruction
714 // to the clause, so we have a hazard.
715 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
716}
717
718int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
719 int WaitStatesNeeded = 0;
720
721 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
722
723 // This SMRD hazard only affects SI.
724 if (!ST.hasSMRDReadVALUDefHazard())
725 return WaitStatesNeeded;
726
727 // A read of an SGPR by SMRD instruction requires 4 wait states when the
728 // SGPR was written by a VALU instruction.
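 // Illustrative sequence requiring the wait states (sketch):
 //   v_readfirstlane_b32 s0, v0           ; VALU write of s0
 //   s_load_dword        s1, s[4:5], s0   ; SMRD read of s0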
729 int SmrdSgprWaitStates = 4;
730 auto IsHazardDefFn = [this](const MachineInstr &MI) {
731 return TII.isVALU(MI);
732 };
733 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
734 return TII.isSALU(MI);
735 };
736
737 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
738
739 for (const MachineOperand &Use : SMRD->uses()) {
740 if (!Use.isReg())
741 continue;
742 int WaitStatesNeededForUse =
743 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
744 SmrdSgprWaitStates);
745 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
746
747 // This fixes what appears to be undocumented hardware behavior in SI where
748 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
749 // needs some number of nops in between. We don't know how many we need, but
750 // let's use 4. This wasn't discovered before probably because the only
751 // case when this happens is when we expand a 64-bit pointer into a full
752 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
753 // probably never encountered in the closed-source land.
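 // Illustrative sequence (sketch):
 //   s_mov_b32           s7, 0x27000      ; SALU writes part of the descriptor
 //   s_buffer_load_dword s0, s[4:7], 0x0  ; buffer SMRD reads the descriptor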
754 if (IsBufferSMRD) {
755 int WaitStatesNeededForUse =
756 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
757 IsBufferHazardDefFn,
758 SmrdSgprWaitStates);
759 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
760 }
761 }
762
763 return WaitStatesNeeded;
764}
765
766int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
767 if (!ST.hasVMEMReadSGPRVALUDefHazard())
768 return 0;
769
770 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
771
772 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
773 // SGPR was written by a VALU Instruction.
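 // Illustrative sequence (sketch):
 //   v_readfirstlane_b32 s4, v1                      ; VALU write of s4
 //   buffer_load_dword   v0, v2, s[8:11], s4 offen   ; VMEM read of s4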
774 const int VmemSgprWaitStates = 5;
775 auto IsHazardDefFn = [this](const MachineInstr &MI) {
776 return TII.isVALU(MI);
777 };
778 for (const MachineOperand &Use : VMEM->uses()) {
779 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
780 continue;
781
782 int WaitStatesNeededForUse =
783 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
784 VmemSgprWaitStates);
785 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
786 }
787 return WaitStatesNeeded;
788}
789
790int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
791 const SIRegisterInfo *TRI = ST.getRegisterInfo();
792 const SIInstrInfo *TII = ST.getInstrInfo();
793
794 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
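 // Illustrative sequence (sketch):
 //   v_add_f32_e32 v1, v0, v0                                   ; VALU write of v1
 //   v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf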
795 int DppVgprWaitStates = 2;
796 int DppExecWaitStates = 5;
797 int WaitStatesNeeded = 0;
798 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
799 return TII->isVALU(MI);
800 };
801
802 for (const MachineOperand &Use : DPP->uses()) {
803 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
804 continue;
805 int WaitStatesNeededForUse =
806 DppVgprWaitStates - getWaitStatesSinceDef(
807 Use.getReg(),
808 [](const MachineInstr &) { return true; },
809 DppVgprWaitStates);
810 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
811 }
812
813 WaitStatesNeeded = std::max(
814 WaitStatesNeeded,
815 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
816 DppExecWaitStates));
817
818 return WaitStatesNeeded;
819}
820
821int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
822 const SIInstrInfo *TII = ST.getInstrInfo();
823
824 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
825 // instruction.
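 // Illustrative sequence (sketch):
 //   v_cmp_eq_u32_e32 vcc, v0, v1      ; VALU write of vcc
 //   v_div_fmas_f32   v2, v3, v4, v5   ; implicit read of vcc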
826 const int DivFMasWaitStates = 4;
827 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
828 return TII->isVALU(MI);
829 };
830 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
831 DivFMasWaitStates);
832
833 return DivFMasWaitStates - WaitStatesNeeded;
834}
835
836int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
837 const SIInstrInfo *TII = ST.getInstrInfo();
838 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
839
840 const int GetRegWaitStates = 2;
841 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
842 return GetRegHWReg == getHWReg(TII, MI);
843 };
844 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
845
846 return GetRegWaitStates - WaitStatesNeeded;
847}
848
849int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
850 const SIInstrInfo *TII = ST.getInstrInfo();
851 unsigned HWReg = getHWReg(TII, *SetRegInstr);
852
853 const int SetRegWaitStates = ST.getSetRegWaitStates();
854 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
855 return HWReg == getHWReg(TII, MI);
856 };
857 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
858 return SetRegWaitStates - WaitStatesNeeded;
859}
860
861int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
862 if (!MI.mayStore())
863 return -1;
864
865 const SIInstrInfo *TII = ST.getInstrInfo();
866 unsigned Opcode = MI.getOpcode();
867 const MCInstrDesc &Desc = MI.getDesc();
868
869 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
870 int VDataRCID = -1;
871 if (VDataIdx != -1)
872 VDataRCID = Desc.operands()[VDataIdx].RegClass;
873
874 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
875 // There is no hazard if the instruction does not use vector regs
876 // (like wbinvl1)
877 if (VDataIdx == -1)
878 return -1;
879 // For MUBUF/MTBUF instructions this hazard only exists if the
880 // instruction is not using a register in the soffset field.
881 const MachineOperand *SOffset =
882 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
883 // If we have no soffset operand, then assume this field has been
884 // hardcoded to zero.
885 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
886 (!SOffset || !SOffset->isReg()))
887 return VDataIdx;
888 }
889
890 // MIMG instructions create a hazard if they don't use a 256-bit T# and
891 // the store size is greater than 8 bytes and they have more than two bits
892 // of their dmask set.
893 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
894 if (TII->isMIMG(MI)) {
895 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
896 assert(SRsrcIdx != -1 &&
897 AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
898 (void)SRsrcIdx;
899 }
900
901 if (TII->isFLAT(MI)) {
902 // There is no hazard if the instruction does not use vector regs
903 if (VDataIdx == -1)
904 return -1;
905
906 if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
907 return VDataIdx;
908 }
909
910 return -1;
911}
912
913int
914GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
915 const MachineRegisterInfo &MRI) {
916 // Helper to check for the hazard where VMEM instructions that store more than
917 // 8 bytes can have their store data overwritten by the next instruction.
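 // Illustrative sequence (sketch): a >8-byte store with an immediate soffset
 // followed by a VALU write overlapping the store data:
 //   buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen
 //   v_mov_b32_e32        v1, 0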
918 const SIRegisterInfo *TRI = ST.getRegisterInfo();
919
920 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
921 int WaitStatesNeeded = 0;
922
923 if (!TRI->isVectorRegister(MRI, Def.getReg()))
924 return WaitStatesNeeded;
925 Register Reg = Def.getReg();
926 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
927 int DataIdx = createsVALUHazard(MI);
928 return DataIdx >= 0 &&
929 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
930 };
931
932 int WaitStatesNeededForDef =
933 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
934 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
935
936 return WaitStatesNeeded;
937}
938
939/// A dest sel forwarding issue occurs if additional logic is needed to swizzle /
940/// pack the computed value into the correct bit position of the dest register.
941/// This occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with a
942/// dst_sel that is not aligned to the register. This function analyzes the \p
943/// MI and \returns an operand with a dst forwarding issue, or nullptr if
944/// none exists.
945static const MachineOperand *
946getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
947 if (!SIInstrInfo::isVALU(MI))
948 return nullptr;
949
950 const SIInstrInfo *TII = ST.getInstrInfo();
951
952 unsigned Opcode = MI.getOpcode();
953
954 // There are three different types of instructions
955 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
956 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
957 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
958 // op_sel[3:2]
959 // != 0
960 if (SIInstrInfo::isSDWA(MI)) {
961 // Type 1: SDWA with dst_sel != DWORD
962 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
963 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
964 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
965 }
966
967 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
968 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
969 // Type 2: VOP3 which write the hi bits
970 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
971 SISrcMods::DST_OP_SEL)
972 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
973
974 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
975 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
976 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
977 SISrcMods::OP_SEL_0))
978 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
979 }
980
981 // Special case: nop is required for all the opsel values for fp4 sr variant
982 // cvt scale instructions
983 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
984 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
985
986 return nullptr;
987}
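// Illustrative producer with a dst-sel forwarding issue (sketch): an SDWA
// write with dst_sel != DWORD, e.g.
//   v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE
// Any VALU issued immediately afterwards that reads or write-preserves v0
// consumes the forwarded result.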
988
989/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
990/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
991/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
992static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
993 const MachineOperand *Dst,
994 const SIRegisterInfo *TRI) {
995 // We must consider implicit reads of the VALU. SDWA with dst_sel and
996 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
997 // and we must account for that hazard.
998 // We also must account for WAW hazards. In particular, WAW with dest
999 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1000 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1001 // check for ECC. Without accounting for this hazard, the ECC will be
1002 // wrong.
1003 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1004 // complete zeroesHigh16BitsOfDest)
1005 for (auto &Operand : VALU->operands()) {
1006 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1007 return true;
1008 }
1009 }
1010 return false;
1011}
1012
1013int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
1014 int WaitStatesNeeded = 0;
1015
1016 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
1017 const int TransDefWaitstates = 1;
1018
1019 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
1020 if (!SIInstrInfo::isTRANS(MI))
1021 return false;
1022 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1023 const SIInstrInfo *TII = ST.getInstrInfo();
1024 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
1025
1026 for (const MachineOperand &Use : VALU->explicit_uses()) {
1027 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
1028 return true;
1029 }
1030
1031 return false;
1032 };
1033
1034 int WaitStatesNeededForDef =
1035 TransDefWaitstates -
1036 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1037 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1038 }
1039
1040 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1041 const int Shift16DefWaitstates = 1;
1042
1043 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1044 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1045 const MachineOperand *ForwardedDst =
1046 getDstSelForwardingOperand(ProducerMI, ST);
1047 if (ForwardedDst) {
1048 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1049 }
1050
1051 if (ProducerMI.isInlineAsm()) {
1052 // Assume inline asm has dst forwarding hazard
1053 for (auto &Def : ProducerMI.all_defs()) {
1054 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1055 return true;
1056 }
1057 }
1058
1059 return false;
1060 };
1061
1062 int WaitStatesNeededForDef =
1063 Shift16DefWaitstates -
1064 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1065 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1066 }
1067
1068 if (ST.hasVDecCoExecHazard()) {
1069 const int VALUWriteSGPRVALUReadWaitstates = 2;
1070 const int VALUWriteEXECRWLane = 4;
1071 const int VALUWriteVGPRReadlaneRead = 1;
1072
1073 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1074 const MachineRegisterInfo &MRI = MF.getRegInfo();
1075 Register UseReg;
1076 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1077 if (!SIInstrInfo::isVALU(MI))
1078 return false;
1079 return MI.modifiesRegister(UseReg, TRI);
1080 };
1081
1082 for (const MachineOperand &Use : VALU->explicit_uses()) {
1083 if (!Use.isReg())
1084 continue;
1085
1086 UseReg = Use.getReg();
1087 if (TRI->isSGPRReg(MRI, UseReg)) {
1088 int WaitStatesNeededForDef =
1089 VALUWriteSGPRVALUReadWaitstates -
1090 getWaitStatesSince(IsVALUDefSGPRFn,
1091 VALUWriteSGPRVALUReadWaitstates);
1092 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1093 }
1094 }
1095
1096 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1097 UseReg = AMDGPU::VCC;
1098 int WaitStatesNeededForDef =
1099 VALUWriteSGPRVALUReadWaitstates -
1100 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1101 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1102 }
1103
1104 switch (VALU->getOpcode()) {
1105 case AMDGPU::V_READLANE_B32:
1106 case AMDGPU::V_READFIRSTLANE_B32: {
1107 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1108 UseReg = Src->getReg();
1109 int WaitStatesNeededForDef =
1110 VALUWriteVGPRReadlaneRead -
1111 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1112 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1113 }
1114 [[fallthrough]];
1115 case AMDGPU::V_WRITELANE_B32: {
1116 UseReg = AMDGPU::EXEC;
1117 int WaitStatesNeededForDef =
1118 VALUWriteEXECRWLane -
1119 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1120 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1121 break;
1122 }
1123 default:
1124 break;
1125 }
1126 }
1127
1128 // This checks for the hazard where VMEM instructions that store more than
1129 // 8 bytes can have their store data overwritten by the next instruction.
1130 if (!ST.has12DWordStoreHazard())
1131 return WaitStatesNeeded;
1132
1133 const MachineRegisterInfo &MRI = MF.getRegInfo();
1134
1135 for (const MachineOperand &Def : VALU->defs()) {
1136 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1137 }
1138
1139 return WaitStatesNeeded;
1140}
1141
1142int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1143 // This checks for hazards associated with inline asm statements.
1144 // Since inline asms can contain just about anything, we use this
1145 // to call/leverage other check*Hazard routines. Note that
1146 // this function doesn't attempt to address all possible inline asm
1147 // hazards (good luck), but is a collection of what has been
1148 // problematic thus far.
1149
1150 // see checkVALUHazards()
1151 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1152 !ST.hasCvtScaleForwardingHazard())
1153 return 0;
1154
1155 const MachineRegisterInfo &MRI = MF.getRegInfo();
1156 int WaitStatesNeeded = 0;
1157
1158 for (const MachineOperand &Op :
1159 llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1160 if (Op.isReg() && Op.isDef()) {
1161 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1162 continue;
1163
1164 if (ST.has12DWordStoreHazard()) {
1165 WaitStatesNeeded =
1166 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1167 }
1168 }
1169 }
1170
1171 if (ST.hasDstSelForwardingHazard()) {
1172 const int Shift16DefWaitstates = 1;
1173
1174 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1175 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1176 // Assume inline asm reads the dst
1177 if (Dst)
1178 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1179 IA->readsRegister(Dst->getReg(), &TRI);
1180
1181 if (ProducerMI.isInlineAsm()) {
1182 // If MI is inline asm, assume it has dst forwarding hazard
1183 for (auto &Def : ProducerMI.all_defs()) {
1184 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1185 IA->readsRegister(Def.getReg(), &TRI)) {
1186 return true;
1187 }
1188 }
1189 }
1190
1191 return false;
1192 };
1193
1194 int WaitStatesNeededForDef =
1195 Shift16DefWaitstates -
1196 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1197 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1198 }
1199
1200 return WaitStatesNeeded;
1201}
1202
1203int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1204 const SIInstrInfo *TII = ST.getInstrInfo();
1205 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1206 const MachineRegisterInfo &MRI = MF.getRegInfo();
1207
1208 const MachineOperand *LaneSelectOp =
1209 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1210
1211 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1212 return 0;
1213
1214 Register LaneSelectReg = LaneSelectOp->getReg();
1215 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1216
1217 const int RWLaneWaitStates = 4;
1218 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1219 RWLaneWaitStates);
1220 return RWLaneWaitStates - WaitStatesSince;
1221}
1222
1223int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1224 if (!ST.hasRFEHazards())
1225 return 0;
1226
1227 const SIInstrInfo *TII = ST.getInstrInfo();
1228
1229 const int RFEWaitStates = 1;
1230
1231 auto IsHazardFn = [TII](const MachineInstr &MI) {
1232 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1233 };
1234 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1235 return RFEWaitStates - WaitStatesNeeded;
1236}
1237
1238int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1239 const SIInstrInfo *TII = ST.getInstrInfo();
1240 const int ReadM0WaitStates = 1;
1241 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1242 return ReadM0WaitStates -
1243 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1244}
1245
1246void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1247 fixVMEMtoScalarWriteHazards(MI);
1248 fixVcmpxPermlaneHazards(MI);
1249 fixSMEMtoVectorWriteHazards(MI);
1250 fixVcmpxExecWARHazard(MI);
1251 fixLdsBranchVmemWARHazard(MI);
1252 if (ST.hasLdsDirect()) {
1253 fixLdsDirectVALUHazard(MI);
1254 fixLdsDirectVMEMHazard(MI);
1255 }
1256 fixVALUPartialForwardingHazard(MI);
1257 fixVALUTransUseHazard(MI);
1258 fixVALUTransCoexecutionHazards(MI);
1259 fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1260 fixWMMACoexecutionHazards(MI);
1261 fixShift64HighRegBug(MI);
1262 fixVALUMaskWriteHazard(MI);
1263 fixRequiredExportPriority(MI);
1264 if (ST.requiresWaitIdleBeforeGetReg())
1265 fixGetRegWaitIdle(MI);
1266 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1267 fixDsAtomicAsyncBarrierArriveB64(MI);
1268 if (ST.hasScratchBaseForwardingHazard())
1269 fixScratchBaseForwardingHazard(MI);
1270 if (ST.setRegModeNeedsVNOPs())
1271 fixSetRegMode(MI);
1272}
1273
1274static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1275 const MachineInstr &MI) {
1276 return (TII.isVOPC(MI) ||
1277 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1278 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1279}
1280
1281bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1282 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1283 return false;
1284
1285 const SIInstrInfo *TII = ST.getInstrInfo();
1286 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1287 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1288 return isVCmpXWritesExec(*TII, *TRI, MI);
1289 };
1290
1291 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1292 unsigned Opc = MI.getOpcode();
1293 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1294 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1295 };
1296
1297 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1298 std::numeric_limits<int>::max())
1299 return false;
1300
1301 // V_NOP will be discarded by SQ.
1302 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1303 // which is always a VGPR and available.
1304 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1305 Register Reg = Src0->getReg();
1306 bool IsUndef = Src0->isUndef();
1307 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1308 TII->get(AMDGPU::V_MOV_B32_e32))
1309 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1310 .addReg(Reg, RegState::Undef);
1311
1312 return true;
1313}
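// Illustrative result of the fix (sketch):
//   v_cmpx_le_f32_e32 0, v0          ; writes exec
//   v_mov_b32_e32     v1, v1         ; inserted, breaks the hazard
//   v_permlane16_b32  v2, v1, s0, s1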
1314
1315bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1316 if (!ST.hasVMEMtoScalarWriteHazard())
1317 return false;
1318 assert(!ST.hasExtendedWaitCounts());
1319
1320 if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1321 return false;
1322
1323 if (MI->getNumDefs() == 0)
1324 return false;
1325
1326 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1327
1328 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1329 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
1330 return false;
1331
1332 for (const MachineOperand &Def : MI->defs()) {
1333 const MachineOperand *Op =
1334 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1335 if (!Op)
1336 continue;
1337 return true;
1338 }
1339 return false;
1340 };
1341
1342 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1343 return SIInstrInfo::isVALU(MI) ||
1344 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1345 !MI.getOperand(0).getImm()) ||
1346 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1347 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1348 };
1349
1350 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1351 std::numeric_limits<int>::max())
1352 return false;
1353
1354 const SIInstrInfo *TII = ST.getInstrInfo();
1355 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1356 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1357 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1358 return true;
1359}
1360
1361bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1362 if (!ST.hasSMEMtoVectorWriteHazard())
1363 return false;
1364 assert(!ST.hasExtendedWaitCounts());
1365
1366 if (!SIInstrInfo::isVALU(*MI))
1367 return false;
1368
1369 AMDGPU::OpName SDSTName;
1370 switch (MI->getOpcode()) {
1371 case AMDGPU::V_READLANE_B32:
1372 case AMDGPU::V_READFIRSTLANE_B32:
1373 SDSTName = AMDGPU::OpName::vdst;
1374 break;
1375 default:
1376 SDSTName = AMDGPU::OpName::sdst;
1377 break;
1378 }
1379
1380 const SIInstrInfo *TII = ST.getInstrInfo();
1381 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1382 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1383 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1384 if (!SDST) {
1385 for (const auto &MO : MI->implicit_operands()) {
1386 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1387 SDST = &MO;
1388 break;
1389 }
1390 }
1391 }
1392
1393 if (!SDST)
1394 return false;
1395
1396 const Register SDSTReg = SDST->getReg();
1397 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1398 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1399 };
1400
1401 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1402 if (TII->isSALU(MI)) {
1403 switch (MI.getOpcode()) {
1404 case AMDGPU::S_SETVSKIP:
1405 case AMDGPU::S_VERSION:
1406 case AMDGPU::S_WAITCNT_VSCNT:
1407 case AMDGPU::S_WAITCNT_VMCNT:
1408 case AMDGPU::S_WAITCNT_EXPCNT:
1409 // These instructions cannot mitigate the hazard.
1410 return false;
1411 case AMDGPU::S_WAITCNT_LGKMCNT:
1412 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1413 return (MI.getOperand(1).getImm() == 0) &&
1414 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1415 case AMDGPU::S_WAITCNT: {
1416 const int64_t Imm = MI.getOperand(0).getImm();
1417 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1418 // DsCnt corresponds to LGKMCnt here.
1419 return (Decoded.DsCnt == 0);
1420 }
1421 default:
1422 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1423 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1424 "unexpected wait count instruction");
1425 // SOPP instructions cannot mitigate the hazard.
1426 if (TII->isSOPP(MI))
1427 return false;
1428 // At this point the SALU can be assumed to mitigate the hazard
1429 // because either:
1430 // (a) it is independent of the at risk SMEM (breaking chain),
1431 // or
1432 // (b) it is dependent on the SMEM, in which case an appropriate
1433 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1434 // SMEM instruction.
1435 return true;
1436 }
1437 }
1438 return false;
1439 };
1440
1441 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1442 std::numeric_limits<int>::max())
1443 return false;
1444
1445 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1446 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1447 .addImm(0);
1448 return true;
1449}
1450
1451bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1452 if (!ST.hasVcmpxExecWARHazard())
1453 return false;
1454 assert(!ST.hasExtendedWaitCounts());
1455
1456 if (!SIInstrInfo::isVALU(*MI))
1457 return false;
1458
1459 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1460 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1461 return false;
1462
1463 auto IsHazardFn = [TRI](const MachineInstr &I) {
1464 if (SIInstrInfo::isVALU(I))
1465 return false;
1466 return I.readsRegister(AMDGPU::EXEC, TRI);
1467 };
1468
1469 const SIInstrInfo *TII = ST.getInstrInfo();
1470 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1471 if (SIInstrInfo::isVALU(MI)) {
1472 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1473 return true;
1474 for (auto MO : MI.implicit_operands())
1475 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1476 return true;
1477 }
1478 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1479 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1480 return true;
1481 return false;
1482 };
1483
1484 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1485 std::numeric_limits<int>::max())
1486 return false;
1487
1488 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1489 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1490 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1491 return true;
1492}
1493
1494static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1495 const GCNSubtarget &ST) {
1496 if (!ST.hasLdsBranchVmemWARHazard())
1497 return false;
1498
1499 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1500 // instructions need to appear in the same function.
1501 bool HasLds = false;
1502 bool HasVmem = false;
1503 for (auto &MBB : MF) {
1504 for (auto &MI : MBB) {
1505 HasLds |= SIInstrInfo::isDS(MI);
1506 HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
1507 SIInstrInfo::isSegmentSpecificFLAT(MI);
1508 if (HasLds && HasVmem)
1509 return true;
1510 }
1511 }
1512 return false;
1513}
1514
1515static bool isStoreCountWaitZero(const MachineInstr &I) {
1516 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1517 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1518 !I.getOperand(1).getImm();
1519}
1520
1521bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1522 if (!RunLdsBranchVmemWARHazardFixup)
1523 return false;
1524
1525 assert(ST.hasLdsBranchVmemWARHazard());
1526 assert(!ST.hasExtendedWaitCounts());
1527
1528 auto IsHazardInst = [](const MachineInstr &MI) {
1529 if (SIInstrInfo::isDS(MI))
1530 return 1;
1531 if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
1532 SIInstrInfo::isSegmentSpecificFLAT(MI))
1533 return 2;
1534 return 0;
1535 };
1536
1537 auto InstType = IsHazardInst(*MI);
1538 if (!InstType)
1539 return false;
1540
1541 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1542 return IsHazardInst(I) || isStoreCountWaitZero(I);
1543 };
1544
1545 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1546 if (!I.isBranch())
1547 return false;
1548
1549 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1550 auto InstType2 = IsHazardInst(I);
1551 return InstType2 && InstType != InstType2;
1552 };
1553
1554 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1555 auto InstType2 = IsHazardInst(I);
1556 if (InstType == InstType2)
1557 return true;
1558
1559 return isStoreCountWaitZero(I);
1560 };
1561
1562 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1563 std::numeric_limits<int>::max();
1564 };
1565
1566 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1567 std::numeric_limits<int>::max())
1568 return false;
1569
1570 const SIInstrInfo *TII = ST.getInstrInfo();
1571 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1572 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1573 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1574 .addImm(0);
1575
1576 return true;
1577}
1578
1579bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1580 if (!SIInstrInfo::isLDSDIR(*MI))
1581 return false;
1582
1583 const int NoHazardWaitStates = 15;
1584 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1585 const Register VDSTReg = VDST->getReg();
1586
1587 bool VisitedTrans = false;
1588 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1589 if (!SIInstrInfo::isVALU(I))
1590 return false;
1591 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1592 // Cover both WAR and WAW
1593 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1594 };
1595 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1596 if (WaitStates >= NoHazardWaitStates)
1597 return true;
1598 // Instructions which cause va_vdst==0 expire hazard
1599 return SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
1600 SIInstrInfo::isEXP(I);
1601 };
1602 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1603 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1604 };
1605
1606 DenseSet<const MachineBasicBlock *> Visited;
1607 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1608 std::next(MI->getReverseIterator()), 0,
1609 IsExpiredFn, Visited, GetWaitStatesFn);
1610
1611 // Transcendentals can execute in parallel to other VALUs.
1612 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1613 if (VisitedTrans)
1614 Count = 0;
1615
1616 MachineOperand *WaitVdstOp =
1617 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1618 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1619
1620 return true;
1621}
1622
1623bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1624 if (!SIInstrInfo::isLDSDIR(*MI))
1625 return false;
1626
1627 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1628 const Register VDSTReg = VDST->getReg();
1629
1630 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1631 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
1632 return false;
1633 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1634 };
1635 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1636 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1637 // according to the type of VMEM instruction.
1638 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1639 return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1640 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1641 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1642 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1643 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1644 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1645 };
1646
1647 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1648 std::numeric_limits<int>::max())
1649 return false;
1650
1651 if (LdsdirCanWait) {
1652 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1653 } else {
1654 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1655 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1656 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1657 }
1658
1659 return true;
1660}
1661
1662bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1663 if (!ST.hasVALUPartialForwardingHazard())
1664 return false;
1665 assert(!ST.hasExtendedWaitCounts());
1666
1667 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1668 return false;
1669
1670 SmallSetVector<Register, 4> SrcVGPRs;
1671
1672 for (const MachineOperand &Use : MI->explicit_uses()) {
1673 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1674 SrcVGPRs.insert(Use.getReg());
1675 }
1676
1677 // Only applies with >= 2 unique VGPR sources
1678 if (SrcVGPRs.size() <= 1)
1679 return false;
1680
1681 // Look for the following pattern:
1682 // Va <- VALU [PreExecPos]
1683 // intv1
1684 // Exec <- SALU [ExecPos]
1685 // intv2
1686 // Vb <- VALU [PostExecPos]
1687 // intv3
1688 // MI Va, Vb (WaitState = 0)
1689 //
1690 // Where:
1691 // intv1 + intv2 <= 2 VALUs
1692 // intv3 <= 4 VALUs
1693 //
1694 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1695
1696 const int Intv1plus2MaxVALUs = 2;
1697 const int Intv3MaxVALUs = 4;
1698 const int IntvMaxVALUs = 6;
1699 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1700
1701 struct StateType {
1702 SmallDenseMap<Register, int, 4> DefPos;
1703 int ExecPos = std::numeric_limits<int>::max();
1704 int VALUs = 0;
1705
1706 static unsigned getHashValue(const StateType &State) {
1707 return hash_combine(State.ExecPos, State.VALUs,
1708 hash_combine_range(State.DefPos));
1709 }
1710 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1711 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1712 LHS.VALUs == RHS.VALUs;
1713 }
1714 };
1715
1716 StateType State;
1717
1718 // This overloads expiry testing with all the hazard detection
1719 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1720 // Too many VALU states have passed
1721 if (State.VALUs > NoHazardVALUWaitStates)
1722 return HazardExpired;
1723
1724 // Instructions which cause va_vdst==0 expire hazard
1725 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
1726 SIInstrInfo::isEXP(I) ||
1727 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1728 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1729 return HazardExpired;
1730
1731 // Track registers writes
1732 bool Changed = false;
1733 if (SIInstrInfo::isVALU(I)) {
1734 for (Register Src : SrcVGPRs) {
1735 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1736 State.DefPos[Src] = State.VALUs;
1737 Changed = true;
1738 }
1739 }
1740 } else if (SIInstrInfo::isSALU(I)) {
1741 if (State.ExecPos == std::numeric_limits<int>::max()) {
1742 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1743 State.ExecPos = State.VALUs;
1744 Changed = true;
1745 }
1746 }
1747 }
1748
1749 // Early expiration: too many VALUs in intv3
1750 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1751 return HazardExpired;
1752
1753 // Only evaluate state if something changed
1754 if (!Changed)
1755 return NoHazardFound;
1756
1757 // Determine positions of VALUs pre/post exec change
1758 if (State.ExecPos == std::numeric_limits<int>::max())
1759 return NoHazardFound;
1760
1761 int PreExecPos = std::numeric_limits<int>::max();
1762 int PostExecPos = std::numeric_limits<int>::max();
1763
1764 for (auto Entry : State.DefPos) {
1765 int DefVALUs = Entry.second;
1766 if (DefVALUs != std::numeric_limits<int>::max()) {
1767 if (DefVALUs >= State.ExecPos)
1768 PreExecPos = std::min(PreExecPos, DefVALUs);
1769 else
1770 PostExecPos = std::min(PostExecPos, DefVALUs);
1771 }
1772 }
1773
1774 // Need a VALUs post exec change
1775 if (PostExecPos == std::numeric_limits<int>::max())
1776 return NoHazardFound;
1777
1778 // Too many VALUs in intv3?
1779 int Intv3VALUs = PostExecPos;
1780 if (Intv3VALUs > Intv3MaxVALUs)
1781 return HazardExpired;
1782
1783 // Too many VALUs in intv2?
1784 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1785 if (Intv2VALUs > Intv1plus2MaxVALUs)
1786 return HazardExpired;
1787
1788 // Need a VALUs pre exec change
1789 if (PreExecPos == std::numeric_limits<int>::max())
1790 return NoHazardFound;
1791
1792 // Too many VALUs in intv1?
1793 int Intv1VALUs = PreExecPos - State.ExecPos;
1794 if (Intv1VALUs > Intv1plus2MaxVALUs)
1795 return HazardExpired;
1796
1797 // Too many VALUs in intv1 + intv2
1798 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1799 return HazardExpired;
1800
1801 return HazardFound;
1802 };
1803 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1805 State.VALUs += 1;
1806 };
1807
1808 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1809 std::next(MI->getReverseIterator())))
1810 return false;
1811
1812 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1813 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1815
1816 return true;
1817}
1818
1819bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1820 if (!ST.hasVALUTransUseHazard())
1821 return false;
1822 assert(!ST.hasExtendedWaitCounts());
1823
1824 if (!SIInstrInfo::isVALU(*MI))
1825 return false;
1826
1827 SmallSet<Register, 4> SrcVGPRs;
1828
1829 for (const MachineOperand &Use : MI->explicit_uses()) {
1830 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1831 SrcVGPRs.insert(Use.getReg());
1832 }
1833
1834 // Look for the following pattern:
1835 // Va <- TRANS VALU
1836 // intv
1837 // MI Va (WaitState = 0)
1838 //
1839 // Where:
1840 // intv <= 5 VALUs / 1 TRANS
1841 //
1842 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1843
1844 const int IntvMaxVALUs = 5;
1845 const int IntvMaxTRANS = 1;
1846
1847 struct StateType {
1848 int VALUs = 0;
1849 int TRANS = 0;
1850
1851 static unsigned getHashValue(const StateType &State) {
1852 return hash_combine(State.VALUs, State.TRANS);
1853 }
1854 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1855 return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1856 }
1857 };
1858
1859 StateType State;
1860
1861 // This overloads expiry testing with all the hazard detection
1862 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1863 // Too many VALU states have passed
1864 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1865 return HazardExpired;
1866
1867 // Instructions which cause va_vdst==0 expire hazard
1870 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1871 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1872 return HazardExpired;
1873
1874 // Track register writes
1875 if (SIInstrInfo::isTRANS(I)) {
1876 for (Register Src : SrcVGPRs) {
1877 if (I.modifiesRegister(Src, &TRI)) {
1878 return HazardFound;
1879 }
1880 }
1881 }
1882
1883 return NoHazardFound;
1884 };
1885 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1887 State.VALUs += 1;
1889 State.TRANS += 1;
1890 };
1891
1892 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1893 std::next(MI->getReverseIterator())))
1894 return false;
1895
1896 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1897 // avoided.
1898 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1899 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1901
1902 return true;
1903}
1904
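// On GFX1250, where TRANS and VALU instructions may co-execute, a VALU must
// not co-execute with a TRANS instruction it depends on: either reading the
// TRANS result (RAW) or overwriting a TRANS source (WAR). If such a TRANS is
// found with no other VALU in between, insert a V_NOP before MI.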
1905bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1906 if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled.
1908 return false;
1909
1910 const SIInstrInfo *TII = ST.getInstrInfo();
1911 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1912
1913 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1914 if (!SIInstrInfo::isTRANS(I))
1915 return false;
1916
1917 // RAW: Trans(I) writes, VALU(MI) reads.
1918 Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1919 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1920 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1921 return true;
1922 }
1923
1924 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
1925 if (!ValuDst || !ValuDst->isReg())
1926 return false;
1927
1928 // WAR: Trans(I) reads, VALU(MI) writes.
1929 Register ValuDef = ValuDst->getReg();
1930 for (const MachineOperand &TransUse : I.explicit_uses()) {
1931 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
1932 return true;
1933 }
1934
1935 return false;
1936 };
1937
1938 auto IsExpiredFn = [](const MachineInstr &I, int) {
1939 return SIInstrInfo::isVALU(I);
1940 };
1941
1942 const int HasVALU = std::numeric_limits<int>::max();
1943 if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
1944 return false;
1945
1946 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1947 return true;
1948}
1949
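// Check for a WMMA/SWMMAC whose src0/src1 (or SWMMAC index on GFX12+)
// overlaps the matrix D destination written by a preceding WMMA with no VALU
// in between; insert a V_NOP to separate the two.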
1950bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1952 return false;
1953
1954 const SIInstrInfo *TII = ST.getInstrInfo();
1955 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1956
1957 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1959 return false;
1960
1961 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1962 // with the dest(matrix D) of the previous wmma.
1963 const Register CurSrc0Reg =
1964 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1965 const Register CurSrc1Reg =
1966 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1967
1968 const Register PrevDstReg =
1969 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1970
1971 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1972 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1973 return true;
1974 }
1975
1976 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1977 // but Index can't overlap with PrevDstReg.
1978 if (AMDGPU::isGFX12Plus(ST)) {
1979 if (SIInstrInfo::isSWMMAC(*MI)) {
1980 const Register CurIndex =
1981 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1982 if (TRI->regsOverlap(PrevDstReg, CurIndex))
1983 return true;
1984 }
1985 return false;
1986 }
1987
1988 return false;
1989 };
1990
1991 auto IsExpiredFn = [](const MachineInstr &I, int) {
1992 return SIInstrInfo::isVALU(I);
1993 };
1994
1995 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1996 std::numeric_limits<int>::max())
1997 return false;
1998
1999 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2000
2001 return true;
2002}
2003
2006 !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
2007}
2008
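// Classify an XDL WMMA/SWMMAC instruction into one of the four hazard
// categories used by fixWMMACoexecutionHazards, based on whether it is a
// dense (WMMA) or sparse (SWMMAC) operation and on its 8- or 16-pass latency.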
static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
2010 const SIInstrInfo *TII, unsigned Latency,
2011 unsigned Category) {
2012 assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
2013 "Handle me if the xdl wmma instruction latency changes");
2014
2015 switch (Category) {
2016 case 0: // Dense WMMA Instructions:
2017 // WMMA_*F16, WMMA_*BF16
2018 // WMMA_*FP8FP8
2019 // WMMA_*FP8BF8
2020 // WMMA_*BF8FP8
2021 // WMMA_*BF8BF8
2022 // WMMA_*F8F6F4 if SRCA & SRCB != F8
2023 return Latency == 8 && SIInstrInfo::isWMMA(MI);
2024
2025 case 1: // Dense WMMA Instructions:
2026 // WMMA_IU8
2027 // WMMA_IU4
2028 // WMMA_*F8F6F4 if SRCA OR SRCB == F8
2029 return Latency == 16 && SIInstrInfo::isWMMA(MI);
2030
2031 case 2: // Dense SWMMAC Instructions
2032 // SWMMAC_*F16, SWMMAC_*BF16,
2033 // SWMMAC_*FP8FP8
2034 // SWMMAC_*BF8FP8
2035 // SWMMAC_*FP8BF8
2036 // SWMMAC_*BF8BF8
2037 return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
2038
2039 case 3: // Sparse WMMA Instructions:
2040 // SWMMAC_IU8
2041 // SWMMAC_IU4
2042 return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
2043 default:
2044 break;
2045 } // end switch.
2046
2047 return false;
2048}
2049
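// On GFX1250, a dependent pair consisting of an XDL WMMA and a later WMMA or
// co-executable VALU must be separated by a category-dependent number of VALU
// wait states (see the tables below); insert V_NOPs to make up any shortfall.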
2050bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
2051 if (!AMDGPU::isGFX1250(ST))
2052 return false;
2053
2054 const SIInstrInfo *TII = ST.getInstrInfo();
2055 if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
2056 return false;
2057
2058 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2059
2060 // WaitStates here is the number of V_NOPs or unrelated VALU instructions that
2061 // must be in between the first WMMA and the second instruction to cover the
2062 // hazard (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the
2063 // second is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data
2064 // hazards" for the numbers, which depend on the category of the first WMMA.
2065 const int WMMAWaitStates[] = {5, 9, 3, 5};
2066 const int VALUWaitStates[] = {4, 8, 2, 4};
2067 unsigned Category = 0;
2068
2069 auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
2070 if (!TII->isXDLWMMA(I))
2071 return false;
2072
2073 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2074 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2075 return false;
2076
2077 Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2078 Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
2079 Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
2080
2081 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2082 if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
2083 return true;
2084
2085 if (SIInstrInfo::isSWMMAC(*MI)) {
2086 Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2087 if (TRI->regsOverlap(D0, Idx1))
2088 return true;
2089 }
2090
2091 return false;
2092 };
2093
2094 auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
2095 if (!TII->isXDLWMMA(I))
2096 return false;
2097
2098 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2099 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2100 return false;
2101
2102 // WMMA writes, VALU reads.
2103 Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2104 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
2105 if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
2106 return true;
2107 }
2108
2109 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
2110 if (!ValuDst || !ValuDst->isReg())
2111 return false;
2112 Register D1 = ValuDst->getReg();
2113
2114 // WMMA writes, VALU writes.
2115 if (TRI->regsOverlap(D0, D1))
2116 return true;
2117
2118 // WMMA reads, VALU writes.
2119 Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
2120 Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
2121 if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
2122 return true;
2123
2124 if (SIInstrInfo::isSWMMAC(I)) {
2125 Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
2126 if (TRI->regsOverlap(D1, Idx0))
2127 return true;
2128 }
2129
2130 return false;
2131 };
2132
2133 int Limit = 0;
2134 auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) {
2135 return WaitStates >= Limit;
2136 };
2137
2138 auto GetWaitStatesFn = [](const MachineInstr &I) {
2139 return SIInstrInfo::isVALU(I) ? 1 : 0;
2140 };
2141
2142 int WaitStatesNeeded = -1;
2143 if (TII->isXDLWMMA(*MI)) {
2144 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2145 Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
2146 DenseSet<const MachineBasicBlock *> Visited;
2147 // '::getWaitStatesSince' returns the number of VALUs in between if a hazard
2148 // exists, and INT_MAX if there is no hazard. As a result, a negative
2149 // WaitStatesNeeded here means no hazard, and we will continue to search
2150 // for other categories.
2151 WaitStatesNeeded =
2152 Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(),
2153 std::next(MI->getReverseIterator()), 0,
2154 IsExpiredFn, Visited, GetWaitStatesFn);
2155 }
2156 } else { // Must be a co-executable VALU.
2157 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2158 Limit = VALUWaitStates[Category]; // for IsExpiredFn.
2159 DenseSet<const MachineBasicBlock *> Visited;
2160 // '::getWaitStatesSince' returns the number of VALUs in between if a hazard
2161 // exists, and INT_MAX if there is no hazard. As a result, a negative
2162 // WaitStatesNeeded here means no hazard, and we will continue to search
2163 // for other categories.
2164 WaitStatesNeeded =
2165 Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(),
2166 std::next(MI->getReverseIterator()), 0,
2167 IsExpiredFn, Visited, GetWaitStatesFn);
2168 }
2169 }
2170
2171 // WaitStatesNeeded is now the number of V_NOPs we need to insert; a negative
2172 // value means none are needed.
2173 for (int i = 0; i < WaitStatesNeeded; i++)
2174 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2175 TII->get(AMDGPU::V_NOP_e32));
2176
2177 return true;
2178}
2179
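// Work around a hardware bug in 64-bit shifts when the shift amount lives in
// the last VGPR of an allocation block: temporarily V_SWAP the amount (and
// the adjacent lower register when the amount overlaps src1 or the
// destination) into an unused VGPR around the shift.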
2180bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
2181 if (!ST.hasShift64HighRegBug())
2182 return false;
2183 assert(!ST.hasExtendedWaitCounts());
2184
2185 switch (MI->getOpcode()) {
2186 default:
2187 return false;
2188 case AMDGPU::V_LSHLREV_B64_e64:
2189 case AMDGPU::V_LSHRREV_B64_e64:
2190 case AMDGPU::V_ASHRREV_I64_e64:
2191 break;
2192 }
2193
2194 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
2195 if (!Amt->isReg())
2196 return false;
2197
2198 Register AmtReg = Amt->getReg();
2199 const MachineRegisterInfo &MRI = MF.getRegInfo();
2200 // Check if this is the last VGPR in the allocation block.
2201 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2202 return false;
2203
2204 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
2205 return false;
2206
2207 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
2208 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
2209 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
2210 bool Overlapped = OverlappedSrc || OverlappedDst;
2211
2212 assert(!OverlappedDst || !OverlappedSrc ||
2213 Src1->getReg() == MI->getOperand(0).getReg());
2214 assert(ST.needsAlignedVGPRs());
2215 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2216
2217 Register NewReg;
2218 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2219 : AMDGPU::VGPR_32RegClass) {
2220 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
2221 NewReg = Reg;
2222 break;
2223 }
2224 }
2225
2226 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2227 : NewReg;
2228 Register NewAmtLo;
2229
2230 if (Overlapped)
2231 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2232
2233 DebugLoc DL = MI->getDebugLoc();
2234 MachineBasicBlock *MBB = MI->getParent();
2235 // Insert a full wait count because the found register might be pending a wait.
2236 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
2237 .addImm(0);
2238
2239 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
2240 if (Overlapped)
2241 runOnInstruction(
2242 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
2243 .addDef(AmtReg - 1)
2244 .addReg(AmtReg - 1, RegState::Undef)
2245 .addReg(NewAmtLo, RegState::Undef));
2246 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2247 .addDef(AmtReg)
2248 .addReg(AmtReg, RegState::Undef)
2249 .addReg(NewAmt, RegState::Undef));
2250
2251 // Instructions emitted after the current instruction will be processed by the
2252 // parent loop of the hazard recognizer in a natural way.
2253 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2254 AmtReg)
2255 .addDef(NewAmt)
2256 .addReg(NewAmt)
2257 .addReg(AmtReg);
2258 if (Overlapped)
2259 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2260 AmtReg - 1)
2261 .addDef(NewAmtLo)
2262 .addReg(NewAmtLo)
2263 .addReg(AmtReg - 1);
2264
2265 // Re-running the hazard recognizer on the modified instruction is not
2266 // necessary; the inserted V_SWAP_B32 has already both read and written the
2267 // new registers, so hazards related to these registers have already been handled.
2268 Amt->setReg(NewAmt);
2269 Amt->setIsKill(false);
2270 // We do not update liveness, so verifier may see it as undef.
2271 Amt->setIsUndef();
2272 if (OverlappedDst)
2273 MI->getOperand(0).setReg(NewReg);
2274 if (OverlappedSrc) {
2275 Src1->setReg(NewReg);
2276 Src1->setIsKill(false);
2277 Src1->setIsUndef();
2278 }
2279
2280 return true;
2281}
2282
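// Returns the wait states required between a gfx10 NSA-encoded MIMG
// instruction (encoded size >= 16 bytes) and MI when either of bits 1-2 of
// MI's offset operand is set.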
2283int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
2284 int NSAtoVMEMWaitStates = 1;
2285
2286 if (!ST.hasNSAtoVMEMBug())
2287 return 0;
2288
2290 return 0;
2291
2292 const SIInstrInfo *TII = ST.getInstrInfo();
2293 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2294 if (!Offset || (Offset->getImm() & 6) == 0)
2295 return 0;
2296
2297 auto IsHazardFn = [TII](const MachineInstr &I) {
2298 if (!SIInstrInfo::isMIMG(I))
2299 return false;
2300 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2301 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2302 TII->getInstSizeInBytes(I) >= 16;
2303 };
2304
2305 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2306}
2307
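// Returns the wait states required between a floating-point atomic VMEM
// instruction and a following S_DENORM_MODE.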
2308int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
2309 int FPAtomicToDenormModeWaitStates = 3;
2310
2311 if (!ST.hasFPAtomicToDenormModeHazard())
2312 return 0;
2313 assert(!ST.hasExtendedWaitCounts());
2314
2315 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2316 return 0;
2317
2318 auto IsHazardFn = [](const MachineInstr &I) {
2319 if (!SIInstrInfo::isVMEM(I))
2320 return false;
2321 return SIInstrInfo::isFPAtomic(I);
2322 };
2323
2324 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2325 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2326 return true;
2327
2328 return SIInstrInfo::isWaitcnt(MI.getOpcode());
2329 };
2330
2331 return FPAtomicToDenormModeWaitStates -
2332 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2333}
2334
2335int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2337
2338 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2339}
2340
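// Returns the s_nop padding still required after the previous MFMA so that
// the requested percentage of its pipeline latency (amdgpu-mfma-padding-ratio)
// is filled before this MFMA. Only applies when occupancy is at least 2.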
2341int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2342 // Early exit if no padding is requested.
2343 if (MFMAPaddingRatio == 0)
2344 return 0;
2345
2346 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2347 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2348 return 0;
2349
2350 int NeighborMFMALatency = 0;
2351 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2352 this](const MachineInstr &MI) {
2353 if (!SIInstrInfo::isMFMA(MI))
2354 return false;
2355
2356 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2357 return true;
2358 };
2359
2360 const int MaxMFMAPipelineWaitStates = 16;
2361 int WaitStatesSinceNeighborMFMA =
2362 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2363
2364 int NeighborMFMAPaddingNeeded =
2365 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2366 WaitStatesSinceNeighborMFMA;
2367
2368 return std::max(0, NeighborMFMAPaddingNeeded);
2369}
2370
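// Compute wait states for gfx908 MFMA and v_accvgpr_read/write hazards:
// VALU writes of EXEC or of VGPR sources, MFMA writes overlapping AGPR
// operands, and v_accvgpr_write defs feeding MFMA or v_accvgpr_read.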
2371int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2372 int WaitStatesNeeded = 0;
2373 unsigned Opc = MI->getOpcode();
2374
2375 auto IsVALUFn = [](const MachineInstr &MI) {
2376 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2377 };
2378
2379 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2380 const int LegacyVALUWritesVGPRWaitStates = 2;
2381 const int VALUWritesExecWaitStates = 4;
2382 const int MaxWaitStates = 4;
2383
2384 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2385 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2386 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2387
2388 if (WaitStatesNeeded < MaxWaitStates) {
2389 for (const MachineOperand &Use : MI->explicit_uses()) {
2390 const int MaxWaitStates = 2;
2391
2392 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2393 continue;
2394
2395 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2396 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2397 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2398
2399 if (WaitStatesNeeded == MaxWaitStates)
2400 break;
2401 }
2402 }
2403 }
2404
2405 for (const MachineOperand &Op : MI->explicit_operands()) {
2406 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2407 continue;
2408
2409 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2410 continue;
2411
2412 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2413 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2414 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2415 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2416 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2417 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2418 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2419 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2420 const int MaxWaitStates = 18;
2421 Register Reg = Op.getReg();
2422 unsigned HazardDefLatency = 0;
2423
2424 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2425 this](const MachineInstr &MI) {
2426 if (!SIInstrInfo::isMFMA(MI))
2427 return false;
2428 Register DstReg = MI.getOperand(0).getReg();
2429 if (DstReg == Reg)
2430 return false;
2431 HazardDefLatency =
2432 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2433 return TRI.regsOverlap(DstReg, Reg);
2434 };
2435
2436 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2437 MaxWaitStates);
2438 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2439 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2440 int OpNo = Op.getOperandNo();
2441 if (OpNo == SrcCIdx) {
2442 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2443 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2444 switch (HazardDefLatency) {
2445 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2446 break;
2447 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2448 break;
2449 case 16: [[fallthrough]];
2450 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2451 break;
2452 }
2453 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2454 switch (HazardDefLatency) {
2455 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2456 break;
2457 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2458 break;
2459 case 16: [[fallthrough]];
2460 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2461 break;
2462 }
2463 }
2464
2465 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2466 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2467
2468 if (WaitStatesNeeded == MaxWaitStates)
2469 return WaitStatesNeeded; // Early exit.
2470
2471 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2472 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2473 return false;
2474 Register DstReg = MI.getOperand(0).getReg();
2475 return TRI.regsOverlap(Reg, DstReg);
2476 };
2477
2478 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2479 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2480 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2481 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2482 if (OpNo == SrcCIdx)
2483 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2484 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2485 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2486
2487 WaitStatesNeededForUse = NeedWaitStates -
2488 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2489 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2490
2491 if (WaitStatesNeeded == MaxWaitStates)
2492 return WaitStatesNeeded; // Early exit.
2493 }
2494
2495 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2496 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2497 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2498 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2499 const int MaxWaitStates = 13;
2500 Register DstReg = MI->getOperand(0).getReg();
2501 unsigned HazardDefLatency = 0;
2502
2503 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2504 this](const MachineInstr &MI) {
2505 if (!SIInstrInfo::isMFMA(MI))
2506 return false;
2507 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2508 HazardDefLatency =
2509 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2510 return TRI.regsOverlap(Reg, DstReg);
2511 };
2512
2513 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2514 int NeedWaitStates;
2515 switch (HazardDefLatency) {
2516 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2517 break;
2518 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2519 break;
2520 case 16: [[fallthrough]];
2521 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2522 break;
2523 }
2524
2525 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2526 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2527 }
2528
2529 // Pad neighboring MFMA with noops for better inter-wave performance.
2530 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2531
2532 return WaitStatesNeeded;
2533}
2534
2535static int
2537 bool IsGFX950) {
2538 // xdl def cycles | gfx940 | gfx950
2539 // 2 pass | 3 4
2540 // 4 pass | 5 6
2541 // 8 pass | 9 10
2542 // 16 pass | 17 18
2543 return NumPasses + 1 + IsGFX950;
2544}
2545
2546static int
2548 bool IsGFX950) {
2549 // xdl def cycles | gfx940 | gfx950
2550 // 2 pass | 3 3
2551 // 4 pass | 5 6
2552 // 8 pass | 9 10
2553 // 16 pass | 17 18
2554 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2555}
2556
2557static int
2559 // 2 pass -> 2
2560 // 4 pass -> 4
2561 // 8 pass -> 8
2562 // 16 pass -> 16
2563 return NumPasses;
2564}
2565
2566static int
2568 // 2 pass -> 4
2569 // 4 pass -> 6
2570 // 8 pass -> 10
2571 // 16 pass -> 18
2572 return NumPasses + 2;
2573}
2574
2576 bool IsGFX950) {
2577 // xdl def cycles | gfx942 | gfx950
2578 // 2 pass | 5 5
2579 // 4 pass | 7 8
2580 // 8 pass | 11 12
2581 // 16 pass | 19 20
2582 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2583}
2584
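// Compute wait states for gfx90a+ MFMA hazards: VALU writes of EXEC or of
// VGPR operands, and overlapping writes by a preceding MFMA, where the
// required count depends on the producer's pass count and on whether the
// producer and consumer are DGEMM or XDL instructions.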
2585int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2586 int WaitStatesNeeded = 0;
2587 unsigned Opc = MI->getOpcode();
2588
2589 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2591 };
2592
2593 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2596 };
2597
2598 if (!SIInstrInfo::isMFMA(*MI))
2599 return WaitStatesNeeded;
2600
2601 const int VALUWritesExecWaitStates = 4;
2602 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2603 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2604 VALUWritesExecWaitStates);
2605 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2606
2607 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2608
2609 // Loop over uses, handling both DGEMM and S/HGEMM as the 2nd instruction.
2610 for (const MachineOperand &Use : MI->explicit_uses()) {
2611 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2612 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2613 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2614 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2615 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2616 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2617 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2618 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2619 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2620 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2621 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2622 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2623 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2624 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2625 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2626 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2627 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2628 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2629 const int MaxWaitStates = 19;
2630
2631 if (!Use.isReg())
2632 continue;
2633 Register Reg = Use.getReg();
2634 bool FullReg;
2635 const MachineInstr *MI1;
2636
2637 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2638 this](const MachineInstr &MI) {
2639 if (!SIInstrInfo::isMFMA(MI))
2640 return false;
2641 Register DstReg = MI.getOperand(0).getReg();
2642 FullReg = (DstReg == Reg);
2643 MI1 = &MI;
2644 return TRI.regsOverlap(DstReg, Reg);
2645 };
2646
2647 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2648 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2649 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2650
2651 int NumWaitStates =
2652 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2653 if (NumWaitStates == std::numeric_limits<int>::max())
2654 continue;
2655
2656 int OpNo = Use.getOperandNo();
2657 unsigned Opc1 = MI1->getOpcode();
2658 int NeedWaitStates = 0;
2659 if (OpNo == SrcCIdx) {
2660 if (!SIInstrInfo::isDGEMM(Opc) &&
2661 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
2662 NeedWaitStates = 0;
2663 } else if (FullReg) {
2664 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2665 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2666 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2667 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2668 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2669 else if (ST.hasGFX940Insts() &&
2670 TSchedModel.computeInstrLatency(MI1) == 2)
2671 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2672 } else {
2673 switch (Opc1) {
2674 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2675 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2676 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2677 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2678 if (!TII.isXDL(*MI))
2679 NeedWaitStates =
2680 ST.hasGFX950Insts()
2681 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2682 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2683 break;
2684 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2685 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2686 if (!TII.isXDL(*MI))
2687 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2688 break;
2689 default:
2690 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2691 if (ST.hasGFX940Insts()) {
2692 if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
2693 break;
2694
2695 NeedWaitStates =
2696 TII.isXDL(*MI1)
2697 ? (TII.isXDL(*MI)
2699 NumPasses, ST.hasGFX950Insts())
2701 NumPasses, ST.hasGFX950Insts()))
2703 NumPasses);
2704 break;
2705 }
2706
2707 switch (NumPasses) {
2708 case 2:
2709 NeedWaitStates =
2711 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2712 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2713 break;
2714 case 8:
2715 NeedWaitStates =
2717 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2718 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2719 break;
2720 case 16:
2721 NeedWaitStates =
2723 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2724 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2725 break;
2726 default:
2727 llvm_unreachable("unexpected number of passes");
2728 }
2729 }
2730 }
2731 } else {
2732 switch (Opc1) {
2733 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2734 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2735 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2736 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2737 NeedWaitStates =
2738 ST.hasGFX950Insts()
2739 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2740 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2741 break;
2742 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2743 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2744 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2745 break;
2746 default:
2747 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2748
2749 if (ST.hasGFX940Insts()) {
2750 NeedWaitStates =
2751 TII.isXDL(*MI1)
2753 NumPasses, ST.hasGFX950Insts())
2755 NumPasses);
2756 break;
2757 }
2758
2759 switch (NumPasses) {
2760 case 2:
2761 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2762 break;
2763 case 4:
2764 llvm_unreachable("unexpected number of passes for mfma");
2765 case 8:
2766 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2767 break;
2768 case 16:
2769 default:
2770 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2771 }
2772 }
2773 }
2774 if (WaitStatesNeeded >= NeedWaitStates)
2775 continue;
2776
2777 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2778 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2779
2780 if (WaitStatesNeeded == MaxWaitStates)
2781 break;
2782 }
2783
2784 // Pad neighboring MFMA with noops for better inter-wave performance.
2785 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2786
2787 return WaitStatesNeeded;
2788}
2789
2790int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2791 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2792 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2793 return 0;
2794
2795 int WaitStatesNeeded = 0;
2796
2797 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2798 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2799 };
2800
2801 for (const MachineOperand &Op : MI->explicit_uses()) {
2802 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2803 continue;
2804
2805 Register Reg = Op.getReg();
2806
2807 const int AccVgprReadLdStWaitStates = 2;
2808 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2809 const int MaxWaitStates = 2;
2810
2811 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2812 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2813 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2814
2815 if (WaitStatesNeeded == MaxWaitStates)
2816 return WaitStatesNeeded; // Early exit.
2817
2818 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2819 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2820 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2821 return false;
2822 auto IsVALUFn = [](const MachineInstr &MI) {
2824 };
2825 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2826 std::numeric_limits<int>::max();
2827 };
2828
2829 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2830 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2831 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2832 }
2833
2834 return WaitStatesNeeded;
2835}
2836
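// Compute wait states required before a permlane instruction: after a VALU
// write of one of its VGPR sources and after a V_CMPX write of EXEC.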
2837int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2838 assert(!ST.hasVcmpxPermlaneHazard() &&
2839 "this is a different vcmpx+permlane hazard");
2840 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2841 const SIInstrInfo *TII = ST.getInstrInfo();
2842
2843 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2844 return isVCmpXWritesExec(*TII, *TRI, MI);
2845 };
2846
2847 auto IsVALUFn = [](const MachineInstr &MI) {
2848 return SIInstrInfo::isVALU(MI);
2849 };
2850
2851 const int VCmpXWritesExecWaitStates = 4;
2852 const int VALUWritesVDstWaitStates = 2;
2853 int WaitStatesNeeded = 0;
2854
2855 for (const MachineOperand &Op : MI->explicit_uses()) {
2856 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2857 continue;
2858 Register Reg = Op.getReg();
2859
2860 int WaitStatesSinceDef =
2861 VALUWritesVDstWaitStates -
2862 getWaitStatesSinceDef(Reg, IsVALUFn,
2863 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2864 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2865 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2866 break;
2867 }
2868
2869 int VCmpXHazardWaits =
2870 VCmpXWritesExecWaitStates -
2871 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2872
2873 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2874 return WaitStatesNeeded;
2875}
2876
2878 // 2 pass -> 4
2879 // 4 pass -> 6
2880 // 8 pass -> 10
2881 // 16 pass -> 18
2882 return NumPasses + 2;
2883}
2884
2886 bool IsGFX950) {
2887 // xdl def cycles | gfx942 | gfx950
2888 // 2 pass | 5 5
2889 // 4 pass | 7 8
2890 // 8 pass | 11 12
2891 // 16 pass | 19 20
2892 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2893}
2894
2896 bool IsGFX950) {
2897 // xdl def cycles | gfx942 | gfx950
2898 // 2 pass | 5 5
2899 // 4 pass | 7 8
2900 // 8 pass | 11 12
2901 // 16 pass | 19 20
2902 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2903}
2904
2906 // 2 pass -> 4
2907 // 4 pass -> 6
2908 // 8 pass -> 10
2909 // 16 pass -> 18
2910 return NumPasses + 2;
2911}
2912
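// Compute wait states on gfx90a+ for VALU, VMEM/DS and export instructions
// whose VGPR operands were recently written by MFMA or DOT instructions, and
// for WAR/WAW conflicts between this instruction's defs and a preceding MFMA.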
2913int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2914 if (!ST.hasGFX90AInsts())
2915 return 0;
2916
2917 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2918 return SIInstrInfo::isDGEMM(MI.getOpcode());
2919 };
2920
2921 // This is checked in checkMAIHazards90A()
2922 if (SIInstrInfo::isMFMA(*MI))
2923 return 0;
2924
2925 const MachineRegisterInfo &MRI = MF.getRegInfo();
2926
2927 int WaitStatesNeeded = 0;
2928
2929 bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
2930 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2931 bool IsVALU = SIInstrInfo::isVALU(*MI);
2932
2933 const MachineInstr *MFMA = nullptr;
2934 unsigned Reg;
2935 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2936 if (!SIInstrInfo::isMFMA(MI) ||
2937 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2938 return false;
2939 MFMA = &MI;
2940 return true;
2941 };
2942
2943 const MachineInstr *DOT = nullptr;
2944 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2945 if (!SIInstrInfo::isDOT(MI) ||
2946 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2947 return false;
2948 DOT = &MI;
2949 return true;
2950 };
2951
2952 bool DGEMMAfterVALUWrite = false;
2953 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2954 // Found DGEMM on reverse traversal to def.
2955 if (SIInstrInfo::isDGEMM(MI.getOpcode()))
2956 DGEMMAfterVALUWrite = true;
2957
2958 // Only a hazard if the register is defined by a VALU and a DGEMM is found
2959 // after the def.
2960 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2961 return false;
2962
2963 return true;
2964 };
2965
2966 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2967 AMDGPU::OpName::src2);
2968
2969 if (IsMemOrExport || IsVALU) {
2970 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2971 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2972 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2973 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2974 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2975 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2976 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2977 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2978 const int DotWriteSameDotReadSrcAB = 3;
2979 const int DotWriteDifferentVALURead = 3;
2980 const int DMFMABetweenVALUWriteVMEMRead = 2;
2981 const int MaxWaitStates = 19;
2982
2983 for (const MachineOperand &Use : MI->explicit_uses()) {
2984 if (!Use.isReg())
2985 continue;
2986 Reg = Use.getReg();
2987
2988 DOT = nullptr;
2989 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2990 MaxWaitStates);
2991 if (DOT) {
2992 int NeedWaitStates = 0;
2993 if (DOT->getOpcode() == MI->getOpcode()) {
2994 if (&Use - &MI->getOperand(0) != SrcCIdx)
2995 NeedWaitStates = DotWriteSameDotReadSrcAB;
2996 } else {
2997 NeedWaitStates = DotWriteDifferentVALURead;
2998 }
2999
3000 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3001 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3002 }
3003
3004 // Workaround for a HW data hazard bug observed only on GFX90A. When a DGEMM
3005 // instruction sits in between a VALU and a VMEM instruction, it causes the SQ
3006 // to incorrectly omit the two wait states between those two instructions that
3007 // are needed to avoid the data hazard.
3008 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3009 DGEMMAfterVALUWrite = false;
3010 if (TRI.isVectorRegister(MRI, Reg)) {
3011 int WaitStatesNeededForUse =
3012 DMFMABetweenVALUWriteVMEMRead -
3013 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
3014 DMFMABetweenVALUWriteVMEMRead);
3015
3016 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3017 }
3018 }
3019
3020 MFMA = nullptr;
3021 WaitStatesSinceDef =
3022 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3023 if (!MFMA)
3024 continue;
3025
3026 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3027 int NumPasses = HazardDefLatency;
3028 int NeedWaitStates = MaxWaitStates;
3029
3030 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3031 switch (HazardDefLatency) {
3032 case 4:
3033 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3034 : DMFMA4x4WriteVgprVALUReadWaitStates;
3035 break;
3036 case 8:
3037 case 16:
3038 NeedWaitStates =
3039 IsMemOrExport
3040 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3041 : (ST.hasGFX950Insts()
3042 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3043 : DMFMA16x16WriteVgprVALUReadWaitStates);
3044 break;
3045 default:
3046 llvm_unreachable("unexpected dgemm");
3047 }
3048 } else if (ST.hasGFX940Insts()) {
3049 NeedWaitStates =
3050 TII.isXDL(*MFMA)
3052 NumPasses, ST.hasGFX950Insts())
3054 NumPasses);
3055 } else {
3056 switch (HazardDefLatency) {
3057 case 2:
3058 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3059 break;
3060 case 8:
3061 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3062 break;
3063 case 16:
3064 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3065 break;
3066 default:
3067 llvm_unreachable("unexpected number of passes for mfma");
3068 }
3069 }
3070
3071 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3072 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3073
3074 if (WaitStatesNeeded == MaxWaitStates)
3075 break;
3076 }
3077 }
3078
3079 unsigned Opc = MI->getOpcode();
3080 const int DMFMAToFMA64WaitStates = 2;
3081 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3082 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3083 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3084 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3085 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3086 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3087 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3088 }
3089
3090 if (!IsVALU && !IsMemOrExport)
3091 return WaitStatesNeeded;
3092
3093 for (const MachineOperand &Def : MI->defs()) {
3094 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3095 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3096 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3097 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3098 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3099 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3100 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3101 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3102 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3103 const int DotWriteDifferentVALUWrite = 3;
3104 const int MaxWaitStates = 19;
3105 const int MaxWarWaitStates = 15;
3106
3107 Reg = Def.getReg();
3108
3109 DOT = nullptr;
3110 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3111 MaxWaitStates);
3112 if (DOT && DOT->getOpcode() != MI->getOpcode())
3113 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3114 WaitStatesSinceDef);
3115
3116 MFMA = nullptr;
3117 WaitStatesSinceDef =
3118 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3119 if (MFMA) {
3120 int NeedWaitStates = MaxWaitStates;
3121 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
3122
3123 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3124 switch (NumPasses) {
3125 case 4:
3126 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3127 break;
3128 case 8:
3129 case 16:
3130 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3131 break;
3132 default:
3133 llvm_unreachable("unexpected number of cycles for dgemm");
3134 }
3135 } else if (ST.hasGFX940Insts()) {
3136 NeedWaitStates =
3137 TII.isXDL(*MFMA)
3139 NumPasses, ST.hasGFX950Insts())
3141 } else {
3142 switch (NumPasses) {
3143 case 2:
3144 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3145 break;
3146 case 8:
3147 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3148 break;
3149 case 16:
3150 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3151 break;
3152 default:
3153 llvm_unreachable("Unexpected number of passes for mfma");
3154 }
3155 }
3156
3157 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3158 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3159
3160 if (WaitStatesNeeded == MaxWaitStates)
3161 break;
3162 }
3163
3164 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3165 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
3166 !MI.readsRegister(Reg, &TRI))
3167 return false;
3168
3169 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3170 return false;
3171
3172 const MachineOperand *SrcC =
3173 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
3174 assert(SrcC);
3175 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
3176 return false;
3177
3178 MFMA = &MI;
3179 return true;
3180 };
3181
3182 MFMA = nullptr;
3183 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3184 MaxWarWaitStates);
3185 if (!MFMA)
3186 continue;
3187
3188 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3189 int NeedWaitStates = MaxWaitStates;
3190 switch (HazardDefLatency) {
3191 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3192 break;
3193 case 4: assert(ST.hasGFX940Insts());
3194 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3195 break;
3196 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3197 break;
3198 case 16: [[fallthrough]];
3199 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3200 break;
3201 }
3202
3203 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3204 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3205 }
3206
3207 return WaitStatesNeeded;
3208}
3209
3211 if (!SU->isInstr())
3212 return false;
3213
3214 const MachineInstr *MAI = nullptr;
3215
3216 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3217 MAI = nullptr;
3219 MAI = &MI;
3220 return MAI != nullptr;
3221 };
3222
3223 MachineInstr *MI = SU->getInstr();
3224 if (IsMFMAFn(*MI)) {
3225 int W = getWaitStatesSince(IsMFMAFn, 16);
3226 if (MAI)
3227 return W < (int)TSchedModel.computeInstrLatency(MAI);
3228 }
3229
3230 return false;
3231}
3232
3233// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3234// insertion of a new instruction.
3235static void updateGetPCBundle(MachineInstr *NewMI) {
3236 if (!NewMI->isBundled())
3237 return;
3238
3239 // Find start of bundle.
3240 auto I = NewMI->getIterator();
3241 while (I->isBundledWithPred())
3242 I--;
3243 if (I->isBundle())
3244 I++;
3245
3246 // Bail if this is not an S_GETPC bundle.
3247 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3248 return;
3249
3250 // Update offsets of any references in the bundle.
3251 const unsigned NewBytes = 4;
3252 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3253 "Unexpected instruction insertion in bundle");
3254 auto NextMI = std::next(NewMI->getIterator());
3255 auto End = NewMI->getParent()->end();
3256 while (NextMI != End && NextMI->isBundledWithPred()) {
3257 for (auto &Operand : NextMI->operands()) {
3258 if (Operand.isGlobal())
3259 Operand.setOffset(Operand.getOffset() + NewBytes);
3260 }
3261 NextMI++;
3262 }
3263}
3264
3265bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
3266 if (!ST.hasVALUMaskWriteHazard())
3267 return false;
3268 assert(!ST.hasExtendedWaitCounts());
3269
3270 if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
3271 return false;
3272
3273 // The hazard sequence is three instructions:
3274 // 1. VALU reads SGPR as mask
3275 // 2. SALU writes SGPR
3276 // 3. SALU reads SGPR
3277 // The hazard can expire if the distance between 2 and 3 is sufficient.
3278 // In practice this happens <10% of the time, so to avoid searching we always
3279 // assume the hazard exists when 1 and 2 are present.
3280
3281 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
3282 if (!SDSTOp || !SDSTOp->isReg())
3283 return false;
3284
3285 const Register HazardReg = SDSTOp->getReg();
3286 if (HazardReg == AMDGPU::EXEC ||
3287 HazardReg == AMDGPU::EXEC_LO ||
3288 HazardReg == AMDGPU::EXEC_HI ||
3289 HazardReg == AMDGPU::M0)
3290 return false;
3291
3292 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
3293 switch (I.getOpcode()) {
3294 case AMDGPU::V_ADDC_U32_e32:
3295 case AMDGPU::V_ADDC_U32_dpp:
3296 case AMDGPU::V_CNDMASK_B16_t16_e32:
3297 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3298 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3299 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3300 case AMDGPU::V_CNDMASK_B32_e32:
3301 case AMDGPU::V_CNDMASK_B32_dpp:
3302 case AMDGPU::V_DIV_FMAS_F32_e64:
3303 case AMDGPU::V_DIV_FMAS_F64_e64:
3304 case AMDGPU::V_SUBB_U32_e32:
3305 case AMDGPU::V_SUBB_U32_dpp:
3306 case AMDGPU::V_SUBBREV_U32_e32:
3307 case AMDGPU::V_SUBBREV_U32_dpp:
3308 // These implicitly read VCC as mask source.
3309 return HazardReg == AMDGPU::VCC ||
3310 HazardReg == AMDGPU::VCC_LO ||
3311 HazardReg == AMDGPU::VCC_HI;
3312 case AMDGPU::V_ADDC_U32_e64:
3313 case AMDGPU::V_ADDC_U32_e64_dpp:
3314 case AMDGPU::V_CNDMASK_B16_t16_e64:
3315 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3316 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3317 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3318 case AMDGPU::V_CNDMASK_B32_e64:
3319 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3320 case AMDGPU::V_SUBB_U32_e64:
3321 case AMDGPU::V_SUBB_U32_e64_dpp:
3322 case AMDGPU::V_SUBBREV_U32_e64:
3323 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3324 // Only check mask register overlaps.
3325 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3326 assert(SSRCOp);
3327 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
3328 }
3329 default:
3330 return false;
3331 }
3332 };
3333
3334 const MachineRegisterInfo &MRI = MF.getRegInfo();
3335 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
3336 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
3337 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3338 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
3339 return true;
3340
3341 // VALU access to any SGPR or literal constant other than HazardReg
3342 // mitigates hazard. No need to check HazardReg here as this will
3343 // only be called when !IsHazardFn.
3344 if (!SIInstrInfo::isVALU(I))
3345 return false;
3346 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
3347 const MachineOperand &Op = I.getOperand(OpNo);
3348 if (Op.isReg()) {
3349 Register OpReg = Op.getReg();
3350 // Only consider uses
3351 if (!Op.isUse())
3352 continue;
3353 // Ignore EXEC
3354 if (OpReg == AMDGPU::EXEC ||
3355 OpReg == AMDGPU::EXEC_LO ||
3356 OpReg == AMDGPU::EXEC_HI)
3357 continue;
3358 // Ignore all implicit uses except VCC
3359 if (Op.isImplicit()) {
3360 if (OpReg == AMDGPU::VCC ||
3361 OpReg == AMDGPU::VCC_LO ||
3362 OpReg == AMDGPU::VCC_HI)
3363 return true;
3364 continue;
3365 }
3366 if (TRI.isSGPRReg(MRI, OpReg))
3367 return true;
3368 } else {
3369 const MCInstrDesc &InstDesc = I.getDesc();
3370 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
3371 if (!TII.isInlineConstant(Op, OpInfo))
3372 return true;
3373 }
3374 }
3375 return false;
3376 };
3377
3378 // Check for hazard
3379 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
3380 std::numeric_limits<int>::max())
3381 return false;
3382
3383 auto NextMI = std::next(MI->getIterator());
3384
3385 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3386 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3387 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3389
3390 // SALU write may be s_getpc in a bundle.
3391 updateGetPCBundle(NewMI);
3392
3393 return true;
3394}
3395
3396static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3397 const SIInstrInfo &TII) {
3398 MachineBasicBlock &EntryMBB = MF->front();
3399 if (EntryMBB.begin() != EntryMBB.end()) {
3400 auto &EntryMI = *EntryMBB.begin();
3401 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3402 EntryMI.getOperand(0).getImm() >= Priority)
3403 return false;
3404 }
3405
3406 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3407 .addImm(Priority);
3408 return true;
3409}
3410
3411bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3412 if (!ST.hasRequiredExportPriority())
3413 return false;
3414
3415 // Assume the following shader types will never have exports,
3416 // and avoid adding or adjusting S_SETPRIO.
3417 MachineBasicBlock *MBB = MI->getParent();
3418 MachineFunction *MF = MBB->getParent();
3419 auto CC = MF->getFunction().getCallingConv();
3420 switch (CC) {
3425 return false;
3426 default:
3427 break;
3428 }
3429
3430 const int MaxPriority = 3;
3431 const int NormalPriority = 2;
3432 const int PostExportPriority = 0;
3433
3434 auto It = MI->getIterator();
3435 switch (MI->getOpcode()) {
3436 case AMDGPU::S_ENDPGM:
3437 case AMDGPU::S_ENDPGM_SAVED:
3438 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3439 case AMDGPU::SI_RETURN_TO_EPILOG:
3440 // Ensure shader with calls raises priority at entry.
3441 // This ensures correct priority if exports exist in callee.
3442 if (MF->getFrameInfo().hasCalls())
3443 return ensureEntrySetPrio(MF, NormalPriority, TII);
3444 return false;
3445 case AMDGPU::S_SETPRIO: {
3446 // Raise minimum priority unless in workaround.
3447 auto &PrioOp = MI->getOperand(0);
3448 int Prio = PrioOp.getImm();
3449 bool InWA = (Prio == PostExportPriority) &&
3450 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3451 if (InWA || Prio >= NormalPriority)
3452 return false;
3453 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3454 return true;
3455 }
3456 default:
3457 if (!TII.isEXP(*MI))
3458 return false;
3459 break;
3460 }
3461
3462 // Check entry priority at each export (as there will only be a few).
3463 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3464 bool Changed = false;
3466 Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3467
3468 auto NextMI = std::next(It);
3469 bool EndOfShader = false;
3470 if (NextMI != MBB->end()) {
3471 // Only need WA at end of sequence of exports.
3472 if (TII.isEXP(*NextMI))
3473 return Changed;
3474 // Assume appropriate S_SETPRIO after export means WA already applied.
3475 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3476 NextMI->getOperand(0).getImm() == PostExportPriority)
3477 return Changed;
3478 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3479 }
3480
3481 const DebugLoc &DL = MI->getDebugLoc();
3482
3483 // Lower priority.
3484 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3485 .addImm(PostExportPriority);
3486
3487 if (!EndOfShader) {
3488 // Wait for exports to complete.
3489 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3490 .addReg(AMDGPU::SGPR_NULL)
3491 .addImm(0);
3492 }
3493
3494 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3495 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3496
3497 if (!EndOfShader) {
3498 // Return to normal (higher) priority.
3499 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3500 .addImm(NormalPriority);
3501 }
3502
3503 return true;
3504}
3505
3506bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
3507 if (!isSGetReg(MI->getOpcode()))
3508 return false;
3509
3510 const SIInstrInfo *TII = ST.getInstrInfo();
3511 switch (getHWReg(TII, *MI)) {
3512 default:
3513 return false;
3518 break;
3519 }
3520
3521 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3522 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3523 .addImm(0);
3524 return true;
3525}
3526
3527bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3528 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3529 return false;
3530
3531 const SIInstrInfo *TII = ST.getInstrInfo();
3532 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3533 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3534 .addImm(0xFFE3);
3535 BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
3536 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3537 .addImm(0xFFE3);
3538
3539 return true;
3540}
3541
3542bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
3543 // No reason to check this in pre-RA scheduling; SGPRs have to be allocated
3544 // for the hazard to trigger.
3545 if (!IsHazardRecognizerMode)
3546 return false;
3547
3548 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3549 const SIInstrInfo *TII = ST.getInstrInfo();
3550 // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
3551 const int FlatScrBaseWaitStates = 10;
3552
3553 bool ReadsFlatScrLo =
3554 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3555 bool ReadsFlatScrHi =
3556 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3557 if (isSGetReg(MI->getOpcode())) {
3558 switch (getHWReg(TII, *MI)) {
3559 default:
3560 break;
3562 ReadsFlatScrLo = true;
3563 break;
3565 ReadsFlatScrHi = true;
3566 break;
3567 }
3568 }
3569
3570 const MachineRegisterInfo &MRI = MF.getRegInfo();
3571
3572 auto IsRegDefHazard = [&](Register Reg) -> bool {
3573 DenseSet<const MachineBasicBlock *> Visited;
3574 auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
3575 return MI.modifiesRegister(Reg, TRI);
3576 };
3577
3578 // This repurposes the idea of waitstates: instead of counting wait states,
3579 // it returns 1 when an SGPR is written and 0 otherwise.
3580 auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
3581 if (!TII->isSALU(MI) && !TII->isVALU(MI))
3582 return 0;
3583 for (const MachineOperand &MO : MI.all_defs()) {
3584 if (TRI->isSGPRReg(MRI, MO.getReg()))
3585 return 1;
3586 }
3587 return 0;
3588 };
3589
3590 auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
3591 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3592 unsigned Wait = MI.getOperand(0).getImm();
3595 return true;
3596 }
3597 return SgprWrites >= FlatScrBaseWaitStates;
3598 };
3599
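// Passing IsSGPRDef as the wait-state callback makes getWaitStatesSince count
// SGPR-writing SALU/VALU instructions rather than cycles, so the comparison
// against FlatScrBaseWaitStates below is effectively an SGPR-write count.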
3600 return ::getWaitStatesSince(
3601 IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
3602 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3603 };
3604
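// Only emit a wait if one half of the flat scratch base is actually read and
// the SGPR checked below for that half (SGPR102 for lo, SGPR103 for hi,
// presumably backing the scratch base) was recently redefined and is not a
// constant physical register.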
3605 if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
3606 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3607 (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
3608 !IsRegDefHazard(AMDGPU::SGPR103)))
3609 return false;
3610
3611 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3612 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3615 return true;
3616}
3617
3618bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3619 if (!isSSetReg(MI->getOpcode()) ||
3620 MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
3621 return false;
3622
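// Two V_NOPs are placed ahead of the write to the MODE register; presumably
// this gives already-issued VALU work time to complete under the old mode.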
3623 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3624 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3625 return true;
3626}