LLVM 22.0.0git
GCNHazardRecognizer.cpp
Go to the documentation of this file.
1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
21
22using namespace llvm;
23
24namespace {
25
26struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
27 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
28
29 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
30 if (Arg.getAsInteger(0, Value))
31 return O.error("'" + Arg + "' value invalid for uint argument!");
32
33 if (Value > 100)
34 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
35
36 return false;
37 }
38};
39
40} // end anonymous namespace
41
43 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
44 cl::desc("Fill a percentage of the latency between "
45 "neighboring MFMA with s_nops."));
46
47// This is intended for debugging purposes only.
49 NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
50 cl::desc("Insert a s_nop x before every instruction"));
51
52//===----------------------------------------------------------------------===//
53// Hazard Recognizer Implementation
54//===----------------------------------------------------------------------===//
55
57 const GCNSubtarget &ST);
58
60 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
64 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
65 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
66}
67
69 EmittedInstrs.clear();
70}
71
75
77 CurrCycleInstr = MI;
78}
79
80static bool isDivFMas(unsigned Opcode) {
81 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
82}
83
84static bool isSGetReg(unsigned Opcode) {
85 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
86}
87
88static bool isSSetReg(unsigned Opcode) {
89 switch (Opcode) {
90 case AMDGPU::S_SETREG_B32:
91 case AMDGPU::S_SETREG_B32_mode:
92 case AMDGPU::S_SETREG_IMM32_B32:
93 case AMDGPU::S_SETREG_IMM32_B32_mode:
94 return true;
95 }
96 return false;
97}
98
99static bool isRWLane(unsigned Opcode) {
100 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
101}
102
103static bool isRFE(unsigned Opcode) {
104 return Opcode == AMDGPU::S_RFE_B64;
105}
106
107static bool isSMovRel(unsigned Opcode) {
108 switch (Opcode) {
109 case AMDGPU::S_MOVRELS_B32:
110 case AMDGPU::S_MOVRELS_B64:
111 case AMDGPU::S_MOVRELD_B32:
112 case AMDGPU::S_MOVRELD_B64:
113 return true;
114 default:
115 return false;
116 }
117}
118
120 const MachineInstr &MI) {
121 if (TII.isAlwaysGDS(MI.getOpcode()))
122 return true;
123
124 switch (MI.getOpcode()) {
125 case AMDGPU::S_SENDMSG:
126 case AMDGPU::S_SENDMSGHALT:
127 case AMDGPU::S_TTRACEDATA:
128 return true;
129 // These DS opcodes don't support GDS.
130 case AMDGPU::DS_NOP:
131 case AMDGPU::DS_PERMUTE_B32:
132 case AMDGPU::DS_BPERMUTE_B32:
133 return false;
134 default:
135 if (TII.isDS(MI.getOpcode())) {
136 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
137 AMDGPU::OpName::gds);
138 if (MI.getOperand(GDS).getImm())
139 return true;
140 }
141 return false;
142 }
143}
144
145static bool isPermlane(const MachineInstr &MI) {
146 unsigned Opcode = MI.getOpcode();
147 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
148 Opcode == AMDGPU::V_PERMLANE64_B32 ||
149 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
150 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
151 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
152 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
153 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
154 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
155 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
156 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
157 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
158 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
159 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
160 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
161}
162
163static bool isLdsDma(const MachineInstr &MI) {
164 return SIInstrInfo::isVALU(MI) &&
166}
167
168static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
169 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
170 AMDGPU::OpName::simm16);
171 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
172}
173
176 MachineInstr *MI = SU->getInstr();
177 // If we are not in "HazardRecognizerMode" and therefore not being run from
178 // the scheduler, track possible stalls from hazards but don't insert noops.
179 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
180
181 if (MI->isBundle())
182 return NoHazard;
183
184 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
185 return HazardType;
186
187 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
188 return HazardType;
189
190 if (checkFPAtomicToDenormModeHazard(MI) > 0)
191 return HazardType;
192
193 if (ST.hasNoDataDepHazard())
194 return NoHazard;
195
196 if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
197 return HazardType;
198
199 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
200 return HazardType;
201
202 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
203 return HazardType;
204
205 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
206 return HazardType;
207
208 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
209 return HazardType;
210
213 checkMAIVALUHazards(MI) > 0)
214 return HazardType;
215
216 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
217 return HazardType;
218
219 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
220 return HazardType;
221
222 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
223 return HazardType;
224
225 if (((ST.hasReadM0MovRelInterpHazard() &&
226 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
227 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
228 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
229 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
230 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
231 (ST.hasReadM0LdsDirectHazard() &&
232 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
233 checkReadM0Hazards(MI) > 0)
234 return HazardType;
235
236 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
237 return HazardType;
238
240 checkMAILdStHazards(MI) > 0)
241 return HazardType;
242
243 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
244 return HazardType;
245
246 return NoHazard;
247}
248
250 unsigned Quantity) {
251 while (Quantity > 0) {
252 unsigned Arg = std::min(Quantity, 8u);
253 Quantity -= Arg;
254 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
255 .addImm(Arg - 1);
256 }
257}
258
259unsigned
260GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
261 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
262 assert(TSchedModel.getWriteProcResBegin(SC) !=
263 TSchedModel.getWriteProcResEnd(SC));
264 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
265}
266
267void GCNHazardRecognizer::processBundle() {
268 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
269 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
270 // Check bundled MachineInstr's for hazards.
271 for (; MI != E && MI->isInsideBundle(); ++MI) {
272 CurrCycleInstr = &*MI;
273 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
274
275 if (IsHazardRecognizerMode) {
276 fixHazards(CurrCycleInstr);
277
278 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
279 }
280
281 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
282 // include the bundled MI directly after, only add a maximum of
283 // (MaxLookAhead - 1) noops to EmittedInstrs.
284 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
285 EmittedInstrs.push_front(nullptr);
286
287 EmittedInstrs.push_front(CurrCycleInstr);
288 EmittedInstrs.resize(MaxLookAhead);
289 }
290 CurrCycleInstr = nullptr;
291}
292
293void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
294 assert(IsHazardRecognizerMode);
295
296 unsigned NumPreNoops = PreEmitNoops(MI);
297 EmitNoops(NumPreNoops);
298 if (MI->isInsideBundle())
299 insertNoopsInBundle(MI, TII, NumPreNoops);
300 else
301 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
302 NumPreNoops);
304 AdvanceCycle();
305}
306
308 IsHazardRecognizerMode = true;
309 CurrCycleInstr = MI;
310 unsigned W = PreEmitNoopsCommon(MI);
311 fixHazards(MI);
312 CurrCycleInstr = nullptr;
313 return std::max(W, NopPadding.getValue());
314}
315
317 if (MI->isBundle())
318 return 0;
319
320 int WaitStates = 0;
321
323 return std::max(WaitStates, checkSMRDHazards(MI));
324
325 if (ST.hasNSAtoVMEMBug())
326 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
327
328 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
329
330 if (ST.hasNoDataDepHazard())
331 return WaitStates;
332
334 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
335
337 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
338
340 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
341
342 if (isDivFMas(MI->getOpcode()))
343 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
344
345 if (isRWLane(MI->getOpcode()))
346 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
347
350 checkMAIVALUHazards(MI) > 0)
351 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
352
353 if (MI->isInlineAsm())
354 return std::max(WaitStates, checkInlineAsmHazards(MI));
355
356 if (isSGetReg(MI->getOpcode()))
357 return std::max(WaitStates, checkGetRegHazards(MI));
358
359 if (isSSetReg(MI->getOpcode()))
360 return std::max(WaitStates, checkSetRegHazards(MI));
361
362 if (isRFE(MI->getOpcode()))
363 return std::max(WaitStates, checkRFEHazards(MI));
364
365 if ((ST.hasReadM0MovRelInterpHazard() &&
366 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
367 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
368 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
369 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
370 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
371 (ST.hasReadM0LdsDirectHazard() &&
372 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
373 return std::max(WaitStates, checkReadM0Hazards(MI));
374
376 return std::max(WaitStates, checkMAIHazards(MI));
377
379 return std::max(WaitStates, checkMAILdStHazards(MI));
380
381 if (ST.hasGFX950Insts() && isPermlane(*MI))
382 return std::max(WaitStates, checkPermlaneHazards(MI));
383
384 return WaitStates;
385}
386
388 EmittedInstrs.push_front(nullptr);
389}
390
392 // When the scheduler detects a stall, it will call AdvanceCycle() without
393 // emitting any instructions.
394 if (!CurrCycleInstr) {
395 EmittedInstrs.push_front(nullptr);
396 return;
397 }
398
399 if (CurrCycleInstr->isBundle()) {
400 processBundle();
401 return;
402 }
403
404 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
405 if (!NumWaitStates) {
406 CurrCycleInstr = nullptr;
407 return;
408 }
409
410 // Keep track of emitted instructions
411 EmittedInstrs.push_front(CurrCycleInstr);
412
413 // Add a nullptr for each additional wait state after the first. Make sure
414 // not to add more than getMaxLookAhead() items to the list, since we
415 // truncate the list to that size right after this loop.
416 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
417 i < e; ++i) {
418 EmittedInstrs.push_front(nullptr);
419 }
420
421 // getMaxLookahead() is the largest number of wait states we will ever need
422 // to insert, so there is no point in keeping track of more than that many
423 // wait states.
424 EmittedInstrs.resize(getMaxLookAhead());
425
426 CurrCycleInstr = nullptr;
427}
428
430 assert(!IsHazardRecognizerMode &&
431 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
432}
433
434//===----------------------------------------------------------------------===//
435// Helper Functions
436//===----------------------------------------------------------------------===//
437
439
440using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
441using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
442
443// Search for a hazard in a block and its predecessors.
444template <typename StateT>
445static bool
446hasHazard(StateT InitialState,
447 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
448 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
449 const MachineBasicBlock *InitialMBB,
451 struct StateMapKey {
453 unsigned Idx;
454 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
455 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
456 }
457 };
458 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
459 static inline StateMapKey getEmptyKey() {
460 return {static_cast<SmallVectorImpl<StateT> *>(
463 }
464 static inline StateMapKey getTombstoneKey() {
465 return {static_cast<SmallVectorImpl<StateT> *>(
468 }
469 static unsigned getHashValue(const StateMapKey &Key) {
470 return StateT::getHashValue((*Key.States)[Key.Idx]);
471 }
472 static unsigned getHashValue(const StateT &State) {
473 return StateT::getHashValue(State);
474 }
475 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
476 const auto EKey = getEmptyKey();
477 const auto TKey = getTombstoneKey();
478 if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
479 StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
480 return StateMapKey::isEqual(LHS, RHS);
481 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
482 }
483 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
484 if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
485 StateMapKey::isEqual(RHS, getTombstoneKey()))
486 return false;
487 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
488 }
489 };
490
493
495 const MachineBasicBlock *MBB = InitialMBB;
496 StateT State = InitialState;
497
499 unsigned WorkIdx = 0;
500 for (;;) {
501 bool Expired = false;
502 for (auto E = MBB->instr_rend(); I != E; ++I) {
503 // No need to look at parent BUNDLE instructions.
504 if (I->isBundle())
505 continue;
506
507 auto Result = IsHazard(State, *I);
508 if (Result == HazardFound)
509 return true;
510 if (Result == HazardExpired) {
511 Expired = true;
512 break;
513 }
514
515 if (I->isInlineAsm() || I->isMetaInstruction())
516 continue;
517
518 UpdateState(State, *I);
519 }
520
521 if (!Expired) {
522 unsigned StateIdx = States.size();
523 StateMapKey Key = {&States, StateIdx};
524 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
525 if (Insertion.second) {
526 States.emplace_back(State);
527 } else {
528 StateIdx = Insertion.first->second;
529 }
530 for (MachineBasicBlock *Pred : MBB->predecessors())
531 Worklist.insert(std::pair(Pred, StateIdx));
532 }
533
534 if (WorkIdx == Worklist.size())
535 break;
536
537 unsigned StateIdx;
538 std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
539 State = States[StateIdx];
540 I = MBB->instr_rbegin();
541 }
542
543 return false;
544}
545
546// Returns a minimum wait states since \p I walking all predecessors.
547// Only scans until \p IsExpired does not return true.
548// Can only be run in a hazard recognizer mode.
554 for (auto E = MBB->instr_rend(); I != E; ++I) {
555 // Don't add WaitStates for parent BUNDLE instructions.
556 if (I->isBundle())
557 continue;
558
559 if (IsHazard(*I))
560 return WaitStates;
561
562 if (I->isInlineAsm())
563 continue;
564
565 WaitStates += GetNumWaitStates(*I);
566
567 if (IsExpired(*I, WaitStates))
568 return std::numeric_limits<int>::max();
569 }
570
571 int MinWaitStates = std::numeric_limits<int>::max();
572 for (MachineBasicBlock *Pred : MBB->predecessors()) {
573 if (!Visited.insert(Pred).second)
574 continue;
575
576 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
577 IsExpired, Visited, GetNumWaitStates);
578
579 MinWaitStates = std::min(MinWaitStates, W);
580 }
581
582 return MinWaitStates;
583}
584
586 const MachineInstr *MI, IsExpiredFn IsExpired) {
588 return getWaitStatesSince(IsHazard, MI->getParent(),
589 std::next(MI->getReverseIterator()), 0, IsExpired,
591}
592
593int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
594 if (IsHazardRecognizerMode) {
595 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
596 return WaitStates >= Limit;
597 };
598 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
599 }
600
601 int WaitStates = 0;
602 for (MachineInstr *MI : EmittedInstrs) {
603 if (MI) {
604 if (IsHazard(*MI))
605 return WaitStates;
606
607 if (MI->isInlineAsm())
608 continue;
609 }
610 ++WaitStates;
611
612 if (WaitStates >= Limit)
613 break;
614 }
615 return std::numeric_limits<int>::max();
616}
617
618int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
619 IsHazardFn IsHazardDef,
620 int Limit) {
621 const SIRegisterInfo *TRI = ST.getRegisterInfo();
622
623 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
624 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
625 };
626
627 return getWaitStatesSince(IsHazardFn, Limit);
628}
629
630int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
631 int Limit) {
632 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
633 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
634 };
635
636 return getWaitStatesSince(IsHazardFn, Limit);
637}
638
639//===----------------------------------------------------------------------===//
640// No-op Hazard Detection
641//===----------------------------------------------------------------------===//
642
643static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
644 MCRegister Reg) {
645 for (MCRegUnit Unit : TRI.regunits(Reg))
646 BV.set(static_cast<unsigned>(Unit));
647}
648
649static void addRegsToSet(const SIRegisterInfo &TRI,
651 BitVector &DefSet, BitVector &UseSet) {
652 for (const MachineOperand &Op : Ops) {
653 if (Op.isReg())
654 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
655 }
656}
657
// Fold MI's register operands into the running soft-clause tracking sets:
// defs go into ClauseDefs, uses into ClauseUses.
658 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
659 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
660}
661
663 return !SIInstrInfo::isSMRD(*MI);
664}
665
667 return !SIInstrInfo::isVMEM(*MI);
668}
669
670int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
671 // SMEM soft clause are only present on VI+, and only matter if xnack is
672 // enabled.
673 if (!ST.isXNACKEnabled())
674 return 0;
675
676 bool IsSMRD = TII.isSMRD(*MEM);
677
678 resetClause();
679
680 // A soft-clause is any group of consecutive SMEM instructions. The
681 // instructions in this group may return out of order and/or may be
682 // replayed (i.e. the same instruction issued more than once).
683 //
684 // In order to handle these situations correctly we need to make sure that
685 // when a clause has more than one instruction, no instruction in the clause
686 // writes to a register that is read by another instruction in the clause
687 // (including itself). If we encounter this situation, we need to break the
688 // clause by inserting a non SMEM instruction.
689
690 for (MachineInstr *MI : EmittedInstrs) {
691 // When we hit a non-SMEM instruction then we have passed the start of the
692 // clause and we can stop.
693 if (!MI)
694 break;
695
697 break;
698
699 addClauseInst(*MI);
700 }
701
702 if (ClauseDefs.none())
703 return 0;
704
705 // We need to make sure not to put loads and stores in the same clause if they
706 // use the same address. For now, just start a new clause whenever we see a
707 // store.
708 if (MEM->mayStore())
709 return 1;
710
711 addClauseInst(*MEM);
712
713 // If the set of defs and uses intersect then we cannot add this instruction
714 // to the clause, so we have a hazard.
715 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
716}
717
718int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
719 int WaitStatesNeeded = 0;
720
721 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
722
723 // This SMRD hazard only affects SI.
724 if (!ST.hasSMRDReadVALUDefHazard())
725 return WaitStatesNeeded;
726
727 // A read of an SGPR by SMRD instruction requires 4 wait states when the
728 // SGPR was written by a VALU instruction.
729 int SmrdSgprWaitStates = 4;
730 auto IsHazardDefFn = [this](const MachineInstr &MI) {
731 return TII.isVALU(MI);
732 };
733 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
734 return TII.isSALU(MI);
735 };
736
737 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
738
739 for (const MachineOperand &Use : SMRD->uses()) {
740 if (!Use.isReg())
741 continue;
742 int WaitStatesNeededForUse =
743 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
744 SmrdSgprWaitStates);
745 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
746
747 // This fixes what appears to be undocumented hardware behavior in SI where
748 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
749 // needs some number of nops in between. We don't know how many we need, but
750 // let's use 4. This wasn't discovered before probably because the only
751 // case when this happens is when we expand a 64-bit pointer into a full
752 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
753 // probably never encountered in the closed-source land.
754 if (IsBufferSMRD) {
755 int WaitStatesNeededForUse =
756 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
757 IsBufferHazardDefFn,
758 SmrdSgprWaitStates);
759 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
760 }
761 }
762
763 return WaitStatesNeeded;
764}
765
766int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
767 if (!ST.hasVMEMReadSGPRVALUDefHazard())
768 return 0;
769
770 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
771
772 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
773 // SGPR was written by a VALU Instruction.
774 const int VmemSgprWaitStates = 5;
775 auto IsHazardDefFn = [this](const MachineInstr &MI) {
776 return TII.isVALU(MI);
777 };
778 for (const MachineOperand &Use : VMEM->uses()) {
779 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
780 continue;
781
782 int WaitStatesNeededForUse =
783 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
784 VmemSgprWaitStates);
785 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
786 }
787 return WaitStatesNeeded;
788}
789
790int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
791 const SIRegisterInfo *TRI = ST.getRegisterInfo();
792 const SIInstrInfo *TII = ST.getInstrInfo();
793
794 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
795 int DppVgprWaitStates = 2;
796 int DppExecWaitStates = 5;
797 int WaitStatesNeeded = 0;
798 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
799 return TII->isVALU(MI);
800 };
801
802 for (const MachineOperand &Use : DPP->uses()) {
803 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
804 continue;
805 int WaitStatesNeededForUse =
806 DppVgprWaitStates - getWaitStatesSinceDef(
807 Use.getReg(),
808 [](const MachineInstr &) { return true; },
809 DppVgprWaitStates);
810 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
811 }
812
813 WaitStatesNeeded = std::max(
814 WaitStatesNeeded,
815 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
816 DppExecWaitStates));
817
818 return WaitStatesNeeded;
819}
820
821int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
822 const SIInstrInfo *TII = ST.getInstrInfo();
823
824 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
825 // instruction.
826 const int DivFMasWaitStates = 4;
827 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
828 return TII->isVALU(MI);
829 };
830 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
831 DivFMasWaitStates);
832
833 return DivFMasWaitStates - WaitStatesNeeded;
834}
835
836int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
837 const SIInstrInfo *TII = ST.getInstrInfo();
838 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
839
840 const int GetRegWaitStates = 2;
841 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
842 return GetRegHWReg == getHWReg(TII, MI);
843 };
844 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
845
846 return GetRegWaitStates - WaitStatesNeeded;
847}
848
849int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
850 const SIInstrInfo *TII = ST.getInstrInfo();
851 unsigned HWReg = getHWReg(TII, *SetRegInstr);
852
853 const int SetRegWaitStates = ST.getSetRegWaitStates();
854 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
855 return HWReg == getHWReg(TII, MI);
856 };
857 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
858 return SetRegWaitStates - WaitStatesNeeded;
859}
860
861int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
862 if (!MI.mayStore())
863 return -1;
864
865 const SIInstrInfo *TII = ST.getInstrInfo();
866 unsigned Opcode = MI.getOpcode();
867 const MCInstrDesc &Desc = MI.getDesc();
868
869 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
870 int VDataRCID = -1;
871 if (VDataIdx != -1)
872 VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);
873
874 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
875 // There is no hazard if the instruction does not use vector regs
876 // (like wbinvl1)
877 if (VDataIdx == -1)
878 return -1;
879 // For MUBUF/MTBUF instructions this hazard only exists if the
880 // instruction is not using a register in the soffset field.
881 const MachineOperand *SOffset =
882 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
883 // If we have no soffset operand, then assume this field has been
884 // hardcoded to zero.
885 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
886 (!SOffset || !SOffset->isReg()))
887 return VDataIdx;
888 }
889
890 // MIMG instructions create a hazard if they don't use a 256-bit T# and
891 // the store size is greater than 8 bytes and they have more than two bits
892 // of their dmask set.
893 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
894 if (TII->isMIMG(MI)) {
895 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
896 assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
897 Desc.operands()[SRsrcIdx])) == 256);
898 (void)SRsrcIdx;
899 }
900
901 if (TII->isFLAT(MI)) {
902 // There is no hazard if the instruction does not use vector regs
903 if (VDataIdx == -1)
904 return -1;
905
906 if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
907 return VDataIdx;
908 }
909
910 return -1;
911}
912
913int
914GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
915 const MachineRegisterInfo &MRI) {
916 // Helper to check for the hazard where VMEM instructions that store more than
917 // 8 bytes can have there store data over written by the next instruction.
918 const SIRegisterInfo *TRI = ST.getRegisterInfo();
919
920 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
921 int WaitStatesNeeded = 0;
922
923 if (!TRI->isVectorRegister(MRI, Def.getReg()))
924 return WaitStatesNeeded;
925 Register Reg = Def.getReg();
926 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
927 int DataIdx = createsVALUHazard(MI);
928 return DataIdx >= 0 &&
929 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
930 };
931
932 int WaitStatesNeededForDef =
933 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
934 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
935
936 return WaitStatesNeeded;
937}
938
939/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
940/// pack the computed value into correct bit position of the dest register. This
941/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
942/// dst_sel that is not aligned to the register. This function analayzes the \p
943/// MI and \returns an operand with dst forwarding issue, or nullptr if
944/// none exists.
945static const MachineOperand *
948 return nullptr;
949
950 const SIInstrInfo *TII = ST.getInstrInfo();
951
952 unsigned Opcode = MI.getOpcode();
953
954 // There are three different types of instructions
955 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
956 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
957 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
958 // op_sel[3:2]
959 // != 0
960 if (SIInstrInfo::isSDWA(MI)) {
961 // Type 1: SDWA with dst_sel != DWORD
962 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
963 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
964 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
965 }
966
967 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
968 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
969 // Type 2: VOP3 which write the hi bits
970 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
972 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
973
974 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
975 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
976 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
978 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
979 }
980
981 // Special case: nop is required for all the opsel values for fp4 sr variant
982 // cvt scale instructions
983 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
984 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
985
986 return nullptr;
987}
988
989/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
990/// fowarding issue \p Dst . We may "consume" the Dst via a standard explicit
991/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
993 const MachineOperand *Dst,
994 const SIRegisterInfo *TRI) {
995 // We must consider implicit reads of the VALU. SDWA with dst_sel and
996 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
997 // and we must account for that hazard.
998 // We also must account for WAW hazards. In particular, WAW with dest
999 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1000 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1001 // check for ECC. Without accounting for this hazard, the ECC will be
1002 // wrong.
1003 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1004 // complete zeroesHigh16BitsOfDest)
1005 for (auto &Operand : VALU->operands()) {
1006 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1007 return true;
1008 }
1009 }
1010 return false;
1011}
1012
// Returns the number of wait states this VALU instruction still needs to
// avoid the subtarget's VALU-related data hazards. Each hazard class below
// computes its own requirement and the maximum is returned.
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  // Hazard: result of a TRANS instruction forwarded into a non-TRANS VALU
  // one cycle later.
  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      // NOTE(review): a guard condition (likely
      // "if (!SIInstrInfo::isTRANS(MI))") is not visible in this view —
      // verify against upstream.
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      // Hazard exists if any explicit use of this VALU overlaps the
      // producer's vdst.
      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  // Hazard: producers with dest-sel / cvt-scale forwarding (see
  // getDstSelForwardingOperand) followed by a consumer of that dest.
  if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =
          getDstSelForwardingOperand(ProducerMI, ST);
      if (ForwardedDst) {
        return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
      }

      if (ProducerMI.isInlineAsm()) {
        // Assume inline asm has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  // Hazard: reads of SGPRs (including VCC and EXEC) shortly after a VALU
  // write to the same register.
  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    // NOTE(review): the declaration of UseReg (captured by reference below)
    // is not visible in this view — verify against upstream.
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    // Check every explicit SGPR use against recent VALU defs.
    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    // Implicit VCC reads are subject to the same hazard.
    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      // The VGPR source of a readlane is hazardous if written by a VALU in
      // the previous wait state.
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      // Read/write-lane also hazard on a recent EXEC write (readlane falls
      // through to this check as well).
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}
1141
// Returns wait states needed before an inline-asm instruction to cover the
// subset of VALU hazards we know how to check for opaque asm blocks.
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  // NOTE(review): the iterated operand range is not visible in this view —
  // verify against upstream.
  for (const MachineOperand &Op :
    if (Op.isReg() && Op.isDef()) {
      // Only vector-register defs participate in the 12-dword store hazard.
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
      }
    }
  }

  // Dest-sel forwarding: treat the asm as both reading and writing anything
  // it touches, so any overlap with a forwarded dest is a hazard.
  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
      // Assume inline asm reads the dst
      if (Dst)
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If MI is inline asm, assume it has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}
1202
1203int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1204 const SIInstrInfo *TII = ST.getInstrInfo();
1205 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1206 const MachineRegisterInfo &MRI = MF.getRegInfo();
1207
1208 const MachineOperand *LaneSelectOp =
1209 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1210
1211 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1212 return 0;
1213
1214 Register LaneSelectReg = LaneSelectOp->getReg();
1215 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1216
1217 const int RWLaneWaitStates = 4;
1218 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1219 RWLaneWaitStates);
1220 return RWLaneWaitStates - WaitStatesSince;
1221}
1222
1223int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1224 if (!ST.hasRFEHazards())
1225 return 0;
1226
1227 const SIInstrInfo *TII = ST.getInstrInfo();
1228
1229 const int RFEWaitStates = 1;
1230
1231 auto IsHazardFn = [TII](const MachineInstr &MI) {
1232 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1233 };
1234 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1235 return RFEWaitStates - WaitStatesNeeded;
1236}
1237
1238int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1239 const SIInstrInfo *TII = ST.getInstrInfo();
1240 const int ReadM0WaitStates = 1;
1241 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1242 return ReadM0WaitStates -
1243 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1244}
1245
// Dispatch all instruction-level hazard fixups for \p MI. Each fix*()
// routine checks its own applicability and, when a hazard is found, inserts
// mitigation code (or sets wait operands) immediately before MI.
// Subtarget-conditional fixups are additionally gated by feature queries.
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  // LDSDIR-related hazards only exist on subtargets with lds_direct.
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixVALUTransCoexecutionHazards(MI);
  fixWMMAHazards(MI); // fall-through if co-execution is enabled.
  fixWMMACoexecutionHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
  if (ST.requiresWaitIdleBeforeGetReg())
    fixGetRegWaitIdle(MI);
  if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
    fixDsAtomicAsyncBarrierArriveB64(MI);
  if (ST.hasScratchBaseForwardingHazard())
    fixScratchBaseForwardingHazard(MI);
  if (ST.setRegModeNeedsVNOPs())
    fixSetRegMode(MI);
}
1273
1275 const MachineInstr &MI) {
1276 return (TII.isVOPC(MI) ||
1277 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1278 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1279}
1280
// Fix the V_CMPX -> V_PERMLANE* hazard by inserting a VGPR-touching
// v_mov_b32 before the permlane. Returns true if a fix was inserted.
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // Hazard: any prior compare that writes EXEC (V_CMPX family).
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return isVCmpXWritesExec(*TII, *TRI, MI);
  };

  // Any VALU other than a V_NOP clears the hazard window.
  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      // NOTE(review): the source-operand .addReg(...) line is not visible in
      // this view — verify against upstream.

  return true;
}
1314
// Fix the VMEM-then-scalar-write hazard by inserting s_waitcnt_depctr
// before \p MI. Returns true if a fix was inserted.
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  // NOTE(review): a guard condition on MI's instruction class is not visible
  // in this view — verify against upstream.
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    // NOTE(review): a guard condition on I's instruction class is not
    // visible in this view — verify against upstream.
      return false;

    // Hazard: the earlier instruction uses a register that MI defines.
    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  // A VALU, s_waitcnt 0, or s_waitcnt_depctr with vm_vsrc==0 clears the
  // hazard.
  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      // NOTE(review): the .addImm(...) operand line is not visible in this
      // view — verify against upstream.
  return true;
}
1360
1361bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1362 if (!ST.hasSMEMtoVectorWriteHazard())
1363 return false;
1364 assert(!ST.hasExtendedWaitCounts());
1365
1366 if (!SIInstrInfo::isVALU(*MI))
1367 return false;
1368
1369 AMDGPU::OpName SDSTName;
1370 switch (MI->getOpcode()) {
1371 case AMDGPU::V_READLANE_B32:
1372 case AMDGPU::V_READFIRSTLANE_B32:
1373 SDSTName = AMDGPU::OpName::vdst;
1374 break;
1375 default:
1376 SDSTName = AMDGPU::OpName::sdst;
1377 break;
1378 }
1379
1380 const SIInstrInfo *TII = ST.getInstrInfo();
1381 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1382 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1383 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1384 if (!SDST) {
1385 for (const auto &MO : MI->implicit_operands()) {
1386 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1387 SDST = &MO;
1388 break;
1389 }
1390 }
1391 }
1392
1393 if (!SDST)
1394 return false;
1395
1396 const Register SDSTReg = SDST->getReg();
1397 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1398 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1399 };
1400
1401 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1402 if (TII->isSALU(MI)) {
1403 switch (MI.getOpcode()) {
1404 case AMDGPU::S_SETVSKIP:
1405 case AMDGPU::S_VERSION:
1406 case AMDGPU::S_WAITCNT_VSCNT:
1407 case AMDGPU::S_WAITCNT_VMCNT:
1408 case AMDGPU::S_WAITCNT_EXPCNT:
1409 // These instructions cannot not mitigate the hazard.
1410 return false;
1411 case AMDGPU::S_WAITCNT_LGKMCNT:
1412 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1413 return (MI.getOperand(1).getImm() == 0) &&
1414 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1415 case AMDGPU::S_WAITCNT: {
1416 const int64_t Imm = MI.getOperand(0).getImm();
1417 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1418 // DsCnt corresponds to LGKMCnt here.
1419 return (Decoded.DsCnt == 0);
1420 }
1421 default:
1422 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1423 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1424 "unexpected wait count instruction");
1425 // SOPP instructions cannot mitigate the hazard.
1426 if (TII->isSOPP(MI))
1427 return false;
1428 // At this point the SALU can be assumed to mitigate the hazard
1429 // because either:
1430 // (a) it is independent of the at risk SMEM (breaking chain),
1431 // or
1432 // (b) it is dependent on the SMEM, in which case an appropriate
1433 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1434 // SMEM instruction.
1435 return true;
1436 }
1437 }
1438 return false;
1439 };
1440
1441 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1442 std::numeric_limits<int>::max())
1443 return false;
1444
1445 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1446 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1447 .addImm(0);
1448 return true;
1449}
1450
// Fix the V_CMPX EXEC write-after-read hazard by inserting s_waitcnt_depctr
// before the EXEC-writing VALU. Returns true if a fix was inserted.
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  // Only VALUs that write EXEC can trigger the WAR hazard.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    // NOTE(review): a guard condition on I's instruction class is not
    // visible in this view — verify against upstream.
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  // The hazard expires once a VALU writes any SGPR, or an
  // s_waitcnt_depctr with sa_sdst==0 is seen.
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      // NOTE(review): the .addImm(...) operand line is not visible in this
      // view — verify against upstream.
  return true;
}
1493
                                            const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and VMEM
  // instructions need to appear in the same function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      // NOTE(review): the statement updating HasLds is not visible in this
      // view — verify against upstream.
      HasVmem |= SIInstrInfo::isVMEM(MI);
      // Stop scanning as soon as both kinds have been observed.
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}
1513
1515 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1516 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1517 !I.getOperand(1).getImm();
1518}
1519
// Fix the LDS/branch/VMEM write-after-read hazard by inserting
// "s_waitcnt_vscnt null, 0" before \p MI. Returns true if a fix was
// inserted.
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  // Classifies an instruction: 1 and 2 denote the two conflicting
  // instruction kinds, 0 everything else.
  auto IsHazardInst = [](const MachineInstr &MI) {
    // NOTE(review): the classifying guard conditions are not visible in this
    // view — verify against upstream.
      return 1;
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  // A hazard exists when a branch is followed (searching backwards) by an
  // instruction of the *other* hazard type, with no vscnt wait in between.
  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}
1576
// Compute and set the waitvdst operand of an LDSDIR instruction so hardware
// waits for conflicting VALU accesses of its vdst. Always returns true for
// applicable instructions (the operand is updated in place).
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  // NOTE(review): the applicability guard is not visible in this view —
  // verify against upstream.
    return false;

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(I))
      return false;
    // Remember whether any TRANS was seen while scanning backwards.
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
    // Cover both WAR and WAW
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
    // Instructions which cause va_vdst==0 expire hazard
    // NOTE(review): the expiry expression is not visible in this view —
    // verify against upstream.
  };
  // Only VALU instructions advance the va_vdst counter.
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) ? 1 : 0;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,
                                    IsExpiredFn, Visited, GetWaitStatesFn);

  // Transcendentals can execute in parallel to other VALUs.
  // This makes va_vdst count unusable with a mixture of VALU and TRANS.
  if (VisitedTrans)
    Count = 0;

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

  return true;
}
1620
// Fix the LDSDIR-vs-VMEM hazard on the vdst register, either by setting the
// waitvsrc operand (when supported) or by inserting s_waitcnt_depctr.
// Returns true if a mitigation was applied.
bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  // NOTE(review): the applicability guard is not visible in this view —
  // verify against upstream.
    return false;

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    // NOTE(review): a guard condition on I's instruction class is not
    // visible in this view — verify against upstream.
      return false;
    // Both WAR and WAW on vdst are hazardous.
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
  // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
  // according to the type of VMEM instruction.
  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
    // NOTE(review): the first disjunct of this expression is not visible in
    // this view — verify against upstream.
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
           (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // Prefer the built-in wait when the LDSDIR encoding supports it; otherwise
  // fall back to an explicit depctr wait.
  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
  } else {
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
        // NOTE(review): the .addImm(...) operand line is not visible in this
        // view — verify against upstream.
  }

  return true;
}
1659
// Fix the VALU partial-forwarding hazard (two VGPR sources produced on
// opposite sides of an EXEC change within tight VALU windows) by inserting
// s_waitcnt_depctr before \p MI. Returns true if a fix was inserted.
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.hasVALUPartialForwardingHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
    return false;

  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Only applies with >= 2 unique VGPR sources
  if (SrcVGPRs.size() <= 1)
    return false;

  // Look for the following pattern:
  //   Va <- VALU [PreExecPos]
  //   intv1
  //   Exec <- SALU [ExecPos]
  //   intv2
  //   Vb <- VALU [PostExecPos]
  //   intv3
  //   MI Va, Vb (WaitState = 0)
  //
  // Where:
  //   intv1 + intv2 <= 2 VALUs
  //   intv3 <= 4 VALUs
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  // Backward-scan state: first VALU def position of each source VGPR, the
  // position of the most recent SALU EXEC write, and VALUs seen so far.
  struct StateType {
    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();
    int VALUs = 0;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine(State.ExecPos, State.VALUs,
                          hash_combine_range(State.DefPos));
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
             LHS.VALUs == RHS.VALUs;
    }
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard
    // NOTE(review): the first part of this expiry condition is not visible
    // in this view — verify against upstream.
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track registers writes
    bool Changed = false;
    if (SIInstrInfo::isVALU(I)) {
      for (Register Src : SrcVGPRs) {
        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
          State.DefPos[Src] = State.VALUs;
          Changed = true;
        }
      }
    } else if (SIInstrInfo::isSALU(I)) {
      if (State.ExecPos == std::numeric_limits<int>::max()) {
        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
          State.ExecPos = State.VALUs;
          Changed = true;
        }
      }
    }

    // Early expiration: too many VALUs in intv3
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

    // Only evaluate state if something changed
    if (!Changed)
      return NoHazardFound;

    // Determine positions of VALUs pre/post exec change
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else
          PostExecPos = std::min(PostExecPos, DefVALUs);
      }
    }

    // Need a VALUs post exec change
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv3?
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv2?
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Need a VALUs pre exec change
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv1?
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv1 + intv2
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    return HazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    // NOTE(review): the guard condition for counting VALUs is not visible in
    // this view — verify against upstream.
      State.VALUs += 1;
  };

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      // NOTE(review): the .addImm(...) operand line is not visible in this
      // view — verify against upstream.

  return true;
}
1816
// Fix the TRANS-result-use hazard (a VALU consuming a transcendental result
// within a short window) by inserting s_waitcnt_depctr before \p MI.
// Returns true if a fix was inserted.
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Look for the following pattern:
  //   Va <- TRANS VALU
  //   intv
  //   MI Va (WaitState = 0)
  //
  // Where:
  //   intv <= 5 VALUs / 1 TRANS
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  // Backward-scan state: VALU and TRANS instruction counts seen so far.
  struct StateType {
    int VALUs = 0;
    int TRANS = 0;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine(State.VALUs, State.TRANS);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
    }
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard
    // NOTE(review): the first part of this expiry condition is not visible
    // in this view — verify against upstream.
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track registers writes
    if (SIInstrInfo::isTRANS(I)) {
      for (Register Src : SrcVGPRs) {
        if (I.modifiesRegister(Src, &TRI)) {
          return HazardFound;
        }
      }
    }

    return NoHazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    // NOTE(review): the guard conditions for these counters are not visible
    // in this view — verify against upstream.
      State.VALUs += 1;
      State.TRANS += 1;
  };

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
  // avoided.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      // NOTE(review): the .addImm(...) operand line is not visible in this
      // view — verify against upstream.

  return true;
}
1902
// Fix TRANS/VALU co-execution hazards on GFX1250 (RAW and WAR overlaps
// between a TRANS and the following VALU) by inserting a v_nop before
// \p MI. Returns true if a fix was inserted.
bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
  if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled.
      // NOTE(review): the remainder of this condition is not visible in this
      // view — verify against upstream.
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
    if (!SIInstrInfo::isTRANS(I))
      return false;

    // RAW: Trans(I) writes, VALU(MI) reads.
    Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
      if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
        return true;
    }

    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())
      return false;

    // WAR: Trans(I) reads, VALU(MI) writes.
    Register ValuDef = ValuDst->getReg();
    for (const MachineOperand &TransUse : I.explicit_uses()) {
      if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
        return true;
    }

    return false;
  };

  // Any intervening VALU clears the hazard window.
  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  const int HasVALU = std::numeric_limits<int>::max();
  if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  return true;
}
1947
// Fix WMMA-to-WMMA hazards (previous WMMA dest overlapping this WMMA's A/B
// sources, or the SWMMAC index on GFX12+) by inserting a v_nop before
// \p MI. Returns true if a fix was inserted.
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
  // NOTE(review): the applicability guard is not visible in this view —
  // verify against upstream.
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
    // NOTE(review): a guard condition on I's instruction class is not
    // visible in this view — verify against upstream.
      return false;

    // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
    // with the dest(matrix D) of the previous wmma.
    const Register CurSrc0Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    const Register CurSrc1Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    const Register PrevDstReg =
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
      return true;
    }

    // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
    // but Index can't overlap with PrevDstReg.
    if (AMDGPU::isGFX12Plus(ST)) {
      if (SIInstrInfo::isSWMMAC(*MI)) {
        const Register CurIndex =
            TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
        if (TRI->regsOverlap(PrevDstReg, CurIndex))
          return true;
      }
      return false;
    }

    return false;
  };

  // Any intervening VALU clears the hazard window.
  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

  return true;
}
2001
2004 !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
2005}
2006
                                       const SIInstrInfo *TII, unsigned Latency,
                                       unsigned Category) {
  assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
         "Handle me if the xdl wmma instruction latency changes");

  // Category membership is decided purely by WMMA-vs-SWMMAC class plus the
  // instruction's computed latency (8 or 16 cycles).
  switch (Category) {
  case 0: // Dense WMMA Instructions:
          //   WMMA_*F16, WMMA_*BF16
          //   WMMA_*FP8FP8
          //   WMMA_*FP8BF8
          //   WMMA_*BF8FP8
          //   WMMA_*BF8BF8
          //   WMMA_*F8F6F4 if SRCA & SRCB != F8
    return Latency == 8 && SIInstrInfo::isWMMA(MI);

  case 1: // Dense WMMA Instructions:
          //   WMMA_IU8
          //   WMMA_IU4
          //   WMMA_*F8F6F4 if SRCA OR SRCB == F8
    return Latency == 16 && SIInstrInfo::isWMMA(MI);

  case 2: // Dense SWMMAC Instructions
          //   SWMMAC_*F16, SWMMAC_*BF16,
          //   SWMMAC_*FP8FP8
          //   SWMMAC_*BF8FP8
          //   SWMMAC_*FP8BF8
          //   SWMMAC_*BF8BF8
    return Latency == 8 && SIInstrInfo::isSWMMAC(MI);

  case 3: // Sparse WMMA Instructions:
          //   SWMMAC_IU8
          //   SWMMAC_IU4
    return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
  default:
    break;
  } // end switch.

  return false;
}
2047
2048bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
2049 if (!AMDGPU::isGFX1250(ST))
2050 return false;
2051
2052 const SIInstrInfo *TII = ST.getInstrInfo();
2053 if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
2054 return false;
2055
2056 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2057
2058 // WaitStates here is the number of V_NOPs or unrelated VALU instructions must
2059 // be in between the first WMMA and the second instruction to cover the hazard
2060 // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
2061 // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for
2062 // numbers, which depends on the category of the first WMMA.
2063 const int WMMAWaitStates[] = {5, 9, 3, 5};
2064 const int VALUWaitStates[] = {4, 8, 2, 4};
2065 unsigned Category = 0;
2066
2067 auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
2068 if (!TII->isXDLWMMA(I))
2069 return false;
2070
2071 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2072 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2073 return false;
2074
2075 Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2076 Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
2077 Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
2078
2079 // WMMA0 wrires (D0), WMMA1 reads (A1/B1/Idx1).
2080 if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
2081 return true;
2082
2083 if (SIInstrInfo::isSWMMAC(*MI)) {
2084 Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2085 if (TRI->regsOverlap(D0, Idx1))
2086 return true;
2087 }
2088
2089 return false;
2090 };
2091
2092 auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
2093 if (!TII->isXDLWMMA(I))
2094 return false;
2095
2096 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2097 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2098 return false;
2099
2100 // WMMA writes, VALU reads.
2101 Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2102 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
2103 if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
2104 return true;
2105 }
2106
2107 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
2108 if (!ValuDst || !ValuDst->isReg())
2109 return false;
2110 Register D1 = ValuDst->getReg();
2111
2112 // WMMA writes, VALU writes.
2113 if (TRI->regsOverlap(D0, D1))
2114 return true;
2115
2116 // WMMA reads, VALU writes.
2117 Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
2118 Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
2119 if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
2120 return true;
2121
2122 if (SIInstrInfo::isSWMMAC(I)) {
2123 Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
2124 if (TRI->regsOverlap(D1, Idx0))
2125 return true;
2126 }
2127
2128 return false;
2129 };
2130
2131 int Limit = 0;
2132 auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) {
2133 return WaitStates >= Limit;
2134 };
2135
2136 auto GetWaitStatesFn = [](const MachineInstr &I) {
2137 return SIInstrInfo::isVALU(I) ? 1 : 0;
2138 };
2139
2140 int WaitStatesNeeded = -1;
2141 if (TII->isXDLWMMA(*MI)) {
2142 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2143 Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
2144 DenseSet<const MachineBasicBlock *> Visited;
2145 // '::getWaitStatesSince' returns the number of VALUs in between if hazard
2146 // exists, and INT_MAX if there is no hazard. As a result, a negative
2147 // WaitStatesNeeded here means no hazard, and we will continue to search
2148 // for other categories.
2149 WaitStatesNeeded =
2150 Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(),
2151 std::next(MI->getReverseIterator()), 0,
2152 IsExpiredFn, Visited, GetWaitStatesFn);
2153 }
2154 } else { // Must be a co-executable VALU.
2155 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2156 Limit = VALUWaitStates[Category]; // for IsExpiredFn.
2157 DenseSet<const MachineBasicBlock *> Visited;
2158 // '::getWaitStatesSince' returns the number of VALUs in between if hazard
2159 // exists, and INT_MAX if there is no hazard. As a result, a negative
2160 // WaitStatesNeeded here means no hazard, and we will continue to search
2161 // for other categories.
2162 WaitStatesNeeded =
2163 Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(),
2164 std::next(MI->getReverseIterator()), 0,
2165 IsExpiredFn, Visited, GetWaitStatesFn);
2166 }
2167 }
2168
2169 // WaitStatesNeeded now is the number of V_NOPs we need to insert, negative
2170 // means not needed.
2171 for (int i = 0; i < WaitStatesNeeded; i++)
2172 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2173 TII->get(AMDGPU::V_NOP_e32));
2174
2175 return true;
2176}
2177
2178bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
2179 if (!ST.hasShift64HighRegBug())
2180 return false;
2181 assert(!ST.hasExtendedWaitCounts());
2182
2183 switch (MI->getOpcode()) {
2184 default:
2185 return false;
2186 case AMDGPU::V_LSHLREV_B64_e64:
2187 case AMDGPU::V_LSHRREV_B64_e64:
2188 case AMDGPU::V_ASHRREV_I64_e64:
2189 break;
2190 }
2191
2192 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
2193 if (!Amt->isReg())
2194 return false;
2195
2196 Register AmtReg = Amt->getReg();
2197 const MachineRegisterInfo &MRI = MF.getRegInfo();
2198 // Check if this is a last VGPR in the allocation block.
2199 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2200 return false;
2201
2202 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
2203 return false;
2204
2205 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
2206 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
2207 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
2208 bool Overlapped = OverlappedSrc || OverlappedDst;
2209
2210 assert(!OverlappedDst || !OverlappedSrc ||
2211 Src1->getReg() == MI->getOperand(0).getReg());
2212 assert(ST.needsAlignedVGPRs());
2213 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2214
2215 Register NewReg;
2216 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2217 : AMDGPU::VGPR_32RegClass) {
2218 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
2219 NewReg = Reg;
2220 break;
2221 }
2222 }
2223
2224 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2225 : NewReg;
2226 Register NewAmtLo;
2227
2228 if (Overlapped)
2229 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2230
2231 DebugLoc DL = MI->getDebugLoc();
2232 MachineBasicBlock *MBB = MI->getParent();
2233 // Insert a full wait count because found register might be pending a wait.
2234 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
2235 .addImm(0);
2236
2237 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
2238 if (Overlapped)
2239 runOnInstruction(
2240 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
2241 .addDef(AmtReg - 1)
2242 .addReg(AmtReg - 1, RegState::Undef)
2243 .addReg(NewAmtLo, RegState::Undef));
2244 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2245 .addDef(AmtReg)
2246 .addReg(AmtReg, RegState::Undef)
2247 .addReg(NewAmt, RegState::Undef));
2248
2249 // Instructions emitted after the current instruction will be processed by the
2250 // parent loop of the hazard recognizer in a natural way.
2251 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2252 AmtReg)
2253 .addDef(NewAmt)
2254 .addReg(NewAmt)
2255 .addReg(AmtReg);
2256 if (Overlapped)
2257 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2258 AmtReg - 1)
2259 .addDef(NewAmtLo)
2260 .addReg(NewAmtLo)
2261 .addReg(AmtReg - 1);
2262
2263 // Re-running hazard recognizer on the modified instruction is not necessary,
2264 // inserted V_SWAP_B32 has already both read and write new registers so
2265 // hazards related to these register has already been handled.
2266 Amt->setReg(NewAmt);
2267 Amt->setIsKill(false);
2268 // We do not update liveness, so verifier may see it as undef.
2269 Amt->setIsUndef();
2270 if (OverlappedDst)
2271 MI->getOperand(0).setReg(NewReg);
2272 if (OverlappedSrc) {
2273 Src1->setReg(NewReg);
2274 Src1->setIsKill(false);
2275 Src1->setIsUndef();
2276 }
2277
2278 return true;
2279}
2280
2281int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
2282 int NSAtoVMEMWaitStates = 1;
2283
2284 if (!ST.hasNSAtoVMEMBug())
2285 return 0;
2286
2288 return 0;
2289
2290 const SIInstrInfo *TII = ST.getInstrInfo();
2291 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2292 if (!Offset || (Offset->getImm() & 6) == 0)
2293 return 0;
2294
2295 auto IsHazardFn = [TII](const MachineInstr &I) {
2296 if (!SIInstrInfo::isMIMG(I))
2297 return false;
2298 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2299 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2300 TII->getInstSizeInBytes(I) >= 16;
2301 };
2302
2303 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2304}
2305
2306int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
2307 int FPAtomicToDenormModeWaitStates = 3;
2308
2309 if (!ST.hasFPAtomicToDenormModeHazard())
2310 return 0;
2311 assert(!ST.hasExtendedWaitCounts());
2312
2313 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2314 return 0;
2315
2316 auto IsHazardFn = [](const MachineInstr &I) {
2317 if (!SIInstrInfo::isVMEM(I))
2318 return false;
2319 return SIInstrInfo::isFPAtomic(I);
2320 };
2321
2322 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2323 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2324 return true;
2325
2326 return SIInstrInfo::isWaitcnt(MI.getOpcode());
2327 };
2328
2329 return FPAtomicToDenormModeWaitStates -
2330 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2331}
2332
2333int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2335
2336 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2337}
2338
2339int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2340 // Early exit if no padding is requested.
2341 if (MFMAPaddingRatio == 0)
2342 return 0;
2343
2344 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2345 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2346 return 0;
2347
2348 int NeighborMFMALatency = 0;
2349 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2350 this](const MachineInstr &MI) {
2351 if (!SIInstrInfo::isMFMA(MI))
2352 return false;
2353
2354 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2355 return true;
2356 };
2357
2358 const int MaxMFMAPipelineWaitStates = 16;
2359 int WaitStatesSinceNeighborMFMA =
2360 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2361
2362 int NeighborMFMAPaddingNeeded =
2363 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2364 WaitStatesSinceNeighborMFMA;
2365
2366 return std::max(0, NeighborMFMAPaddingNeeded);
2367}
2368
2369int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2370 int WaitStatesNeeded = 0;
2371 unsigned Opc = MI->getOpcode();
2372
2373 auto IsVALUFn = [](const MachineInstr &MI) {
2374 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2375 };
2376
2377 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2378 const int LegacyVALUWritesVGPRWaitStates = 2;
2379 const int VALUWritesExecWaitStates = 4;
2380 const int MaxWaitStates = 4;
2381
2382 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2383 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2384 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2385
2386 if (WaitStatesNeeded < MaxWaitStates) {
2387 for (const MachineOperand &Use : MI->explicit_uses()) {
2388 const int MaxWaitStates = 2;
2389
2390 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2391 continue;
2392
2393 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2394 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2395 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2396
2397 if (WaitStatesNeeded == MaxWaitStates)
2398 break;
2399 }
2400 }
2401 }
2402
2403 for (const MachineOperand &Op : MI->explicit_operands()) {
2404 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2405 continue;
2406
2407 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2408 continue;
2409
2410 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2411 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2412 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2413 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2414 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2415 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2416 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2417 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2418 const int MaxWaitStates = 18;
2419 Register Reg = Op.getReg();
2420 unsigned HazardDefLatency = 0;
2421
2422 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2423 this](const MachineInstr &MI) {
2424 if (!SIInstrInfo::isMFMA(MI))
2425 return false;
2426 Register DstReg = MI.getOperand(0).getReg();
2427 if (DstReg == Reg)
2428 return false;
2429 HazardDefLatency =
2430 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2431 return TRI.regsOverlap(DstReg, Reg);
2432 };
2433
2434 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2435 MaxWaitStates);
2436 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2437 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2438 int OpNo = Op.getOperandNo();
2439 if (OpNo == SrcCIdx) {
2440 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2441 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2442 switch (HazardDefLatency) {
2443 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2444 break;
2445 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2446 break;
2447 case 16: [[fallthrough]];
2448 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2449 break;
2450 }
2451 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2452 switch (HazardDefLatency) {
2453 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2454 break;
2455 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2456 break;
2457 case 16: [[fallthrough]];
2458 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2459 break;
2460 }
2461 }
2462
2463 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2464 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2465
2466 if (WaitStatesNeeded == MaxWaitStates)
2467 return WaitStatesNeeded; // Early exit.
2468
2469 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2470 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2471 return false;
2472 Register DstReg = MI.getOperand(0).getReg();
2473 return TRI.regsOverlap(Reg, DstReg);
2474 };
2475
2476 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2477 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2478 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2479 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2480 if (OpNo == SrcCIdx)
2481 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2482 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2483 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2484
2485 WaitStatesNeededForUse = NeedWaitStates -
2486 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2487 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2488
2489 if (WaitStatesNeeded == MaxWaitStates)
2490 return WaitStatesNeeded; // Early exit.
2491 }
2492
2493 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2494 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2495 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2496 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2497 const int MaxWaitStates = 13;
2498 Register DstReg = MI->getOperand(0).getReg();
2499 unsigned HazardDefLatency = 0;
2500
2501 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2502 this](const MachineInstr &MI) {
2503 if (!SIInstrInfo::isMFMA(MI))
2504 return false;
2505 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2506 HazardDefLatency =
2507 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2508 return TRI.regsOverlap(Reg, DstReg);
2509 };
2510
2511 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2512 int NeedWaitStates;
2513 switch (HazardDefLatency) {
2514 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2515 break;
2516 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2517 break;
2518 case 16: [[fallthrough]];
2519 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2520 break;
2521 }
2522
2523 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2524 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2525 }
2526
2527 // Pad neighboring MFMA with noops for better inter-wave performance.
2528 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2529
2530 return WaitStatesNeeded;
2531}
2532
// Wait states for an N-pass XDL MFMA writing a VGPR that overlaps the srcC of
// a following XDL or SMFMA instruction (gfx940/gfx950).
// NOTE(review): the function-name line was dropped by the doc extraction;
// name reconstructed from upstream LLVM — verify.
static int
GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx940 | gfx950
  // 2 pass         |  3        4
  // 4 pass         |  5        6
  // 8 pass         |  9       10
  // 16 pass        | 17       18
  return NumPasses + 1 + IsGFX950;
}
2543
// Wait states for an N-pass XDL MFMA writing a VGPR that overlaps the srcC of
// a following SGEMM/DGEMM instruction (gfx940/gfx950). Note the 2-pass case
// does not get the extra gfx950 cycle.
// NOTE(review): the function-name line was dropped by the doc extraction;
// name reconstructed from upstream LLVM — verify.
static int
GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx940 | gfx950
  // 2 pass         |  3        3
  // 4 pass         |  5        6
  // 8 pass         |  9       10
  // 16 pass        | 17       18
  return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
}
2554
// Wait states for an N-pass SMFMA writing a VGPR that overlaps the srcC of a
// following SMFMA (gfx940): equal to the pass count.
// NOTE(review): the function-name line was dropped by the doc extraction;
// name reconstructed from upstream LLVM — verify.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // 2 pass -> 2
  // 4 pass -> 4
  // 8 pass -> 8
  // 16 pass -> 16
  return NumPasses;
}
2563
// Wait states for an N-pass SMFMA writing a VGPR that overlaps the srcA/srcB
// of a following MFMA (gfx940): pass count plus two.
// NOTE(review): the function-name line was dropped by the doc extraction;
// name reconstructed from upstream LLVM — verify.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
2572
// Wait states for an N-pass XDL MFMA writing a VGPR that overlaps the
// srcA/srcB of a following MFMA (gfx942/gfx950).
// NOTE(review): the function-name line was dropped by the doc extraction;
// name reconstructed from upstream LLVM — verify.
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
                                                                bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |  5        5
  // 4 pass         |  7        8
  // 8 pass         |  11       12
  // 16 pass        |  19       20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
2582
2583int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2584 int WaitStatesNeeded = 0;
2585 unsigned Opc = MI->getOpcode();
2586
2587 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2589 };
2590
2591 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2594 };
2595
2596 if (!SIInstrInfo::isMFMA(*MI))
2597 return WaitStatesNeeded;
2598
2599 const int VALUWritesExecWaitStates = 4;
2600 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2601 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2602 VALUWritesExecWaitStates);
2603 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2604
2605 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2606
2607 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2608 for (const MachineOperand &Use : MI->explicit_uses()) {
2609 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2610 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2611 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2612 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2613 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2614 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2615 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2616 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2617 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2618 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2619 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2620 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2621 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2622 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2623 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2624 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2625 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2626 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2627 const int MaxWaitStates = 19;
2628
2629 if (!Use.isReg())
2630 continue;
2631 Register Reg = Use.getReg();
2632 bool FullReg;
2633 const MachineInstr *MI1;
2634
2635 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2636 this](const MachineInstr &MI) {
2637 if (!SIInstrInfo::isMFMA(MI))
2638 return false;
2639 Register DstReg = MI.getOperand(0).getReg();
2640 FullReg = (DstReg == Reg);
2641 MI1 = &MI;
2642 return TRI.regsOverlap(DstReg, Reg);
2643 };
2644
2645 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2646 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2647 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2648
2649 int NumWaitStates =
2650 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2651 if (NumWaitStates == std::numeric_limits<int>::max())
2652 continue;
2653
2654 int OpNo = Use.getOperandNo();
2655 unsigned Opc1 = MI1->getOpcode();
2656 int NeedWaitStates = 0;
2657 if (OpNo == SrcCIdx) {
2658 if (!SIInstrInfo::isDGEMM(Opc) &&
2659 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
2660 NeedWaitStates = 0;
2661 } else if (FullReg) {
2662 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2663 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2664 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2665 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2666 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2667 else if (ST.hasGFX940Insts() &&
2668 TSchedModel.computeInstrLatency(MI1) == 2)
2669 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2670 } else {
2671 switch (Opc1) {
2672 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2673 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2674 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2675 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2676 if (!TII.isXDL(*MI))
2677 NeedWaitStates =
2678 ST.hasGFX950Insts()
2679 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2680 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2681 break;
2682 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2683 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2684 if (!TII.isXDL(*MI))
2685 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2686 break;
2687 default:
2688 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2689 if (ST.hasGFX940Insts()) {
2690 if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
2691 break;
2692
2693 NeedWaitStates =
2694 TII.isXDL(*MI1)
2695 ? (TII.isXDL(*MI)
2697 NumPasses, ST.hasGFX950Insts())
2699 NumPasses, ST.hasGFX950Insts()))
2701 NumPasses);
2702 break;
2703 }
2704
2705 switch (NumPasses) {
2706 case 2:
2707 NeedWaitStates =
2709 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2710 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2711 break;
2712 case 8:
2713 NeedWaitStates =
2715 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2716 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2717 break;
2718 case 16:
2719 NeedWaitStates =
2721 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2722 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2723 break;
2724 default:
2725 llvm_unreachable("unexpected number of passes");
2726 }
2727 }
2728 }
2729 } else {
2730 switch (Opc1) {
2731 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2732 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2733 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2734 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2735 NeedWaitStates =
2736 ST.hasGFX950Insts()
2737 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2738 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2739 break;
2740 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2741 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2742 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2743 break;
2744 default:
2745 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2746
2747 if (ST.hasGFX940Insts()) {
2748 NeedWaitStates =
2749 TII.isXDL(*MI1)
2751 NumPasses, ST.hasGFX950Insts())
2753 NumPasses);
2754 break;
2755 }
2756
2757 switch (NumPasses) {
2758 case 2:
2759 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2760 break;
2761 case 4:
2762 llvm_unreachable("unexpected number of passes for mfma");
2763 case 8:
2764 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2765 break;
2766 case 16:
2767 default:
2768 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2769 }
2770 }
2771 }
2772 if (WaitStatesNeeded >= NeedWaitStates)
2773 continue;
2774
2775 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2776 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2777
2778 if (WaitStatesNeeded == MaxWaitStates)
2779 break;
2780 }
2781
2782 // Pad neighboring MFMA with noops for better inter-wave performance.
2783 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2784
2785 return WaitStatesNeeded;
2786}
2787
2788int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2789 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2790 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2791 return 0;
2792
2793 int WaitStatesNeeded = 0;
2794
2795 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2796 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2797 };
2798
2799 for (const MachineOperand &Op : MI->explicit_uses()) {
2800 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2801 continue;
2802
2803 Register Reg = Op.getReg();
2804
2805 const int AccVgprReadLdStWaitStates = 2;
2806 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2807 const int MaxWaitStates = 2;
2808
2809 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2810 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2811 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2812
2813 if (WaitStatesNeeded == MaxWaitStates)
2814 return WaitStatesNeeded; // Early exit.
2815
2816 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2817 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2818 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2819 return false;
2820 auto IsVALUFn = [](const MachineInstr &MI) {
2822 };
2823 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2824 std::numeric_limits<int>::max();
2825 };
2826
2827 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2828 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2829 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2830 }
2831
2832 return WaitStatesNeeded;
2833}
2834
2835int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2836 assert(!ST.hasVcmpxPermlaneHazard() &&
2837 "this is a different vcmpx+permlane hazard");
2838 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2839 const SIInstrInfo *TII = ST.getInstrInfo();
2840
2841 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2842 return isVCmpXWritesExec(*TII, *TRI, MI);
2843 };
2844
2845 auto IsVALUFn = [](const MachineInstr &MI) {
2846 return SIInstrInfo::isVALU(MI);
2847 };
2848
2849 const int VCmpXWritesExecWaitStates = 4;
2850 const int VALUWritesVDstWaitStates = 2;
2851 int WaitStatesNeeded = 0;
2852
2853 for (const MachineOperand &Op : MI->explicit_uses()) {
2854 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2855 continue;
2856 Register Reg = Op.getReg();
2857
2858 int WaitStatesSinceDef =
2859 VALUWritesVDstWaitStates -
2860 getWaitStatesSinceDef(Reg, IsVALUFn,
2861 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2862 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2863 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2864 break;
2865 }
2866
2867 int VCmpXHazardWaits =
2868 VCmpXWritesExecWaitStates -
2869 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2870
2871 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2872 return WaitStatesNeeded;
2873}
2874
// Wait states between an N-pass SMFMA VGPR write and a VALU write of the same
// register (WAW) on gfx940+.
// NOTE(review): the function-name line was dropped by the doc extraction;
// name reconstructed from upstream LLVM — verify.
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
2882
// Wait states between an N-pass XDL MFMA VGPR write and a VALU write of the
// same register (WAW) on gfx942/gfx950.
// NOTE(review): the function-name line was dropped by the doc extraction;
// name reconstructed from upstream LLVM — verify.
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
                                                       bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |  5        5
  // 4 pass         |  7        8
  // 8 pass         |  11       12
  // 16 pass        |  19       20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
2892
// Wait states between an N-pass XDL MFMA VGPR write and a VALU/memory/export
// read of the same register on gfx942/gfx950.
// NOTE(review): the function-name line was dropped by the doc extraction;
// name reconstructed from upstream LLVM — verify.
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |  5        5
  // 4 pass         |  7        8
  // 8 pass         |  11       12
  // 16 pass        |  19       20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
2902
// Wait states between an N-pass SMFMA VGPR write and a VALU/memory/export
// read of the same register on gfx940+.
// NOTE(review): the function-name line was dropped by the doc extraction;
// name reconstructed from upstream LLVM — verify.
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
2910
2911int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2912 if (!ST.hasGFX90AInsts())
2913 return 0;
2914
2915 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2916 return SIInstrInfo::isDGEMM(MI.getOpcode());
2917 };
2918
2919 // This is checked in checkMAIHazards90A()
2920 if (SIInstrInfo::isMFMA(*MI))
2921 return 0;
2922
2923 const MachineRegisterInfo &MRI = MF.getRegInfo();
2924
2925 int WaitStatesNeeded = 0;
2926
2927 bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
2928 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2929 bool IsVALU = SIInstrInfo::isVALU(*MI);
2930
2931 const MachineInstr *MFMA = nullptr;
2932 unsigned Reg;
2933 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2934 if (!SIInstrInfo::isMFMA(MI) ||
2935 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2936 return false;
2937 MFMA = &MI;
2938 return true;
2939 };
2940
2941 const MachineInstr *DOT = nullptr;
2942 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2943 if (!SIInstrInfo::isDOT(MI) ||
2944 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2945 return false;
2946 DOT = &MI;
2947 return true;
2948 };
2949
2950 bool DGEMMAfterVALUWrite = false;
2951 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2952 // Found DGEMM on reverse traversal to def.
2953 if (SIInstrInfo::isDGEMM(MI.getOpcode()))
2954 DGEMMAfterVALUWrite = true;
2955
2956 // Only hazard if register is defined by a VALU and a DGEMM is found after
2957 // after the def.
2958 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2959 return false;
2960
2961 return true;
2962 };
2963
2964 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2965 AMDGPU::OpName::src2);
2966
2967 if (IsMemOrExport || IsVALU) {
2968 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2969 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2970 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2971 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2972 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2973 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2974 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2975 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2976 const int DotWriteSameDotReadSrcAB = 3;
2977 const int DotWriteDifferentVALURead = 3;
2978 const int DMFMABetweenVALUWriteVMEMRead = 2;
2979 const int MaxWaitStates = 19;
2980
2981 for (const MachineOperand &Use : MI->explicit_uses()) {
2982 if (!Use.isReg())
2983 continue;
2984 Reg = Use.getReg();
2985
2986 DOT = nullptr;
2987 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2988 MaxWaitStates);
2989 if (DOT) {
2990 int NeedWaitStates = 0;
2991 if (DOT->getOpcode() == MI->getOpcode()) {
2992 if (&Use - &MI->getOperand(0) != SrcCIdx)
2993 NeedWaitStates = DotWriteSameDotReadSrcAB;
2994 } else {
2995 NeedWaitStates = DotWriteDifferentVALURead;
2996 }
2997
2998 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2999 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3000 }
3001
3002 // Workaround for HW data hazard bug observed only in GFX90A. When there
3003 // is a DGEMM instruction in-between a VALU and a VMEM instruction it
3004 // causes the SQ to incorrectly not insert two wait states between the two
3005 // instructions needed to avoid data hazard.
3006 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3007 DGEMMAfterVALUWrite = false;
3008 if (TRI.isVectorRegister(MRI, Reg)) {
3009 int WaitStatesNeededForUse =
3010 DMFMABetweenVALUWriteVMEMRead -
3011 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
3012 DMFMABetweenVALUWriteVMEMRead);
3013
3014 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3015 }
3016 }
3017
3018 MFMA = nullptr;
3019 WaitStatesSinceDef =
3020 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3021 if (!MFMA)
3022 continue;
3023
3024 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3025 int NumPasses = HazardDefLatency;
3026 int NeedWaitStates = MaxWaitStates;
3027
3028 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3029 switch (HazardDefLatency) {
3030 case 4:
3031 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3032 : DMFMA4x4WriteVgprVALUReadWaitStates;
3033 break;
3034 case 8:
3035 case 16:
3036 NeedWaitStates =
3037 IsMemOrExport
3038 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3039 : (ST.hasGFX950Insts()
3040 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3041 : DMFMA16x16WriteVgprVALUReadWaitStates);
3042 break;
3043 default:
3044 llvm_unreachable("unexpected dgemm");
3045 }
3046 } else if (ST.hasGFX940Insts()) {
3047 NeedWaitStates =
3048 TII.isXDL(*MFMA)
3050 NumPasses, ST.hasGFX950Insts())
3052 NumPasses);
3053 } else {
3054 switch (HazardDefLatency) {
3055 case 2:
3056 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3057 break;
3058 case 8:
3059 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3060 break;
3061 case 16:
3062 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3063 break;
3064 default:
3065 llvm_unreachable("unexpected number of passes for mfma");
3066 }
3067 }
3068
3069 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3070 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3071
3072 if (WaitStatesNeeded == MaxWaitStates)
3073 break;
3074 }
3075 }
3076
3077 unsigned Opc = MI->getOpcode();
3078 const int DMFMAToFMA64WaitStates = 2;
3079 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3080 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3081 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3082 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3083 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3084 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3085 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3086 }
3087
3088 if (!IsVALU && !IsMemOrExport)
3089 return WaitStatesNeeded;
3090
3091 for (const MachineOperand &Def : MI->defs()) {
3092 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3093 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3094 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3095 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3096 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3097 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3098 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3099 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3100 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3101 const int DotWriteDifferentVALUWrite = 3;
3102 const int MaxWaitStates = 19;
3103 const int MaxWarWaitStates = 15;
3104
3105 Reg = Def.getReg();
3106
3107 DOT = nullptr;
3108 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3109 MaxWaitStates);
3110 if (DOT && DOT->getOpcode() != MI->getOpcode())
3111 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3112 WaitStatesSinceDef);
3113
3114 MFMA = nullptr;
3115 WaitStatesSinceDef =
3116 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3117 if (MFMA) {
3118 int NeedWaitStates = MaxWaitStates;
3119 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
3120
3121 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3122 switch (NumPasses) {
3123 case 4:
3124 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3125 break;
3126 case 8:
3127 case 16:
3128 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3129 break;
3130 default:
3131 llvm_unreachable("unexpected number of cycles for dgemm");
3132 }
3133 } else if (ST.hasGFX940Insts()) {
3134 NeedWaitStates =
3135 TII.isXDL(*MFMA)
3137 NumPasses, ST.hasGFX950Insts())
3139 } else {
3140 switch (NumPasses) {
3141 case 2:
3142 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3143 break;
3144 case 8:
3145 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3146 break;
3147 case 16:
3148 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3149 break;
3150 default:
3151 llvm_unreachable("Unexpected number of passes for mfma");
3152 }
3153 }
3154
3155 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3156 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3157
3158 if (WaitStatesNeeded == MaxWaitStates)
3159 break;
3160 }
3161
3162 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3163 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
3164 !MI.readsRegister(Reg, &TRI))
3165 return false;
3166
3167 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3168 return false;
3169
3170 const MachineOperand *SrcC =
3171 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
3172 assert(SrcC);
3173 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
3174 return false;
3175
3176 MFMA = &MI;
3177 return true;
3178 };
3179
3180 MFMA = nullptr;
3181 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3182 MaxWarWaitStates);
3183 if (!MFMA)
3184 continue;
3185
3186 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3187 int NeedWaitStates = MaxWaitStates;
3188 switch (HazardDefLatency) {
3189 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3190 break;
3191 case 4: assert(ST.hasGFX940Insts());
3192 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3193 break;
3194 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3195 break;
3196 case 16: [[fallthrough]];
3197 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3198 break;
3199 }
3200
3201 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3202 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3203 }
3204
3205 return WaitStatesNeeded;
3206}
3207
// NOTE(review): the opening signature line of this function is missing here
// (extraction artifact).  From the SUnit parameter and the override list in
// the class header this is presumably
// GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) — verify upstream.
  if (!SU->isInstr())
    return false;

  // Records the most recently matched MFMA instruction, if any.
  const MachineInstr *MAI = nullptr;

  // NOTE(review): the condition line between "MAI = nullptr;" and
  // "MAI = &MI;" is missing (extraction artifact); presumably a
  // SIInstrInfo::isMFMA(MI) check — confirm against the original source.
  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
      MAI = &MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    // If another MFMA was issued within the last 16 wait states and its
    // latency has not yet elapsed, report a preference against this unit.
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}
3230
3231// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3232// insertion of a new instruction.
3233static void updateGetPCBundle(MachineInstr *NewMI) {
3234 if (!NewMI->isBundled())
3235 return;
3236
3237 // Find start of bundle.
3238 auto I = NewMI->getIterator();
3239 while (I->isBundledWithPred())
3240 I--;
3241 if (I->isBundle())
3242 I++;
3243
3244 // Bail if this is not an S_GETPC bundle.
3245 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3246 return;
3247
3248 // Update offsets of any references in the bundle.
3249 const unsigned NewBytes = 4;
3250 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3251 "Unexpected instruction insertion in bundle");
3252 auto NextMI = std::next(NewMI->getIterator());
3253 auto End = NewMI->getParent()->end();
3254 while (NextMI != End && NextMI->isBundledWithPred()) {
3255 for (auto &Operand : NextMI->operands()) {
3256 if (Operand.isGlobal())
3257 Operand.setOffset(Operand.getOffset() + NewBytes);
3258 }
3259 NextMI++;
3260 }
3261}
3262
// Mitigates the VALU mask-write hazard: a VALU reads an SGPR pair as a lane
// mask, the SGPR is then rewritten, and later read again (see the sequence
// comment below).  Inserts (or merges into) an s_waitcnt_depctr after the
// SGPR write.  Returns true if any instruction was inserted or erased.
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64())
    return false;

  const bool IsSALU = SIInstrInfo::isSALU(*MI);
  const bool IsVALU = SIInstrInfo::isVALU(*MI);
  if (!IsSALU && !IsVALU)
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. VALU/SALU writes SGPR
  //   3. VALU/SALU reads SGPR
  // The hazard can expire if the distance between 2 and 3 is sufficient,
  // or (2) is VALU and (3) is SALU.
  // In practice this happens <10% of the time, hence always assume the hazard
  // exists if (1) and (2) are present to avoid searching all SGPR reads.

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // Registers excluded from hazard tracking below.
  auto IgnoreableSGPR = [](const Register Reg) {
    switch (Reg) {
    case AMDGPU::EXEC:
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::M0:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
    case AMDGPU::SCC:
      return true;
    default:
      return false;
    }
  };
  auto IsVCC = [](const Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
  };

  // Search state for hasHazard<>: the SGPRs still being tracked for a
  // hazardous mask read.  Hash/equality let the search memoize states.
  struct StateType {
    SmallSet<Register, 2> HazardSGPRs;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine_range(State.HazardSGPRs);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.HazardSGPRs == RHS.HazardSGPRs;
    }
  };

  SmallVector<const MachineInstr *> WaitInstrs;
  bool HasSGPRRead = false;
  StateType InitialState;

  // Look for SGPR write.
  MachineOperand *HazardDef = nullptr;
  for (MachineOperand &Op : MI->operands()) {
    if (!Op.isReg())
      continue;
    // Only the first qualifying def is tracked.
    if (Op.isDef() && HazardDef)
      continue;

    Register Reg = Op.getReg();
    if (IgnoreableSGPR(Reg))
      continue;
    if (!IsVCC(Reg)) {
      if (Op.isImplicit())
        continue;
      if (!TRI->isSGPRReg(MRI, Reg))
        continue;
    }
    // Also check for SGPR reads.
    if (Op.isUse()) {
      HasSGPRRead = true;
      continue;
    }

    assert(!HazardDef);
    HazardDef = &Op;
  }

  if (!HazardDef)
    return false;

  // Setup to track writes to individual SGPRs
  const Register HazardReg = HazardDef->getReg();
  if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
    InitialState.HazardSGPRs.insert(HazardReg);
  } else {
    // 64-bit defs are tracked as their two 32-bit halves.
    assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
  }

  // Scanning backwards from MI: a hazard is found when a carry/mask-consuming
  // VALU reads the hazard register as its mask source.
  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
    if (State.HazardSGPRs.empty())
      return HazardExpired;

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp: {
      // These implicitly read VCC as mask source.
      return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
    }
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only check mask register overlaps.
      const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
      return Result ? HazardFound : NoHazardFound;
    }
    default:
      return NoHazardFound;
    }
  };

  // NOTE(review): a source line appears to be missing in the expression below
  // (extraction artifact) — likely the nested
  // encodeFieldVaSdst(encodeFieldVaVcc(...)) calls; verify upstream.
  const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
      0),
      0);
  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::S_WAITCNT_DEPCTR:
      // Record mergable waits within region of instructions free of SGPR reads.
      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
        WaitInstrs.push_back(&I);
      break;
    default:
      // Update tracking of SGPR reads and writes.
      for (auto &Op : I.operands()) {
        if (!Op.isReg())
          continue;

        Register Reg = Op.getReg();
        if (IgnoreableSGPR(Reg))
          continue;
        if (!IsVCC(Reg)) {
          if (Op.isImplicit())
            continue;
          if (!TRI->isSGPRReg(MRI, Reg))
            continue;
        }
        if (Op.isUse()) {
          HasSGPRRead = true;
          continue;
        }

        // Stop tracking any SGPRs with writes on the basis that they will
        // already have an appropriate wait inserted afterwards.
        // NOTE(review): the declaration of 'Found' (presumably a small vector
        // of Registers) is missing here — extraction artifact; verify
        // against the original source.
        for (Register SGPR : State.HazardSGPRs) {
          if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
            Found.push_back(SGPR);
        }
        for (Register SGPR : Found)
          State.HazardSGPRs.erase(SGPR);
      }
      break;
    }
  };

  // Check for hazard
  if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
                            MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  // Compute counter mask
  unsigned DepCtr =
      IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
                                 : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
             : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);

  // Try to merge previous waits into this one for regions with no SGPR reads.
  if (!WaitInstrs.empty()) {
    // Note: WaitInstrs contains const pointers, so walk backward from MI to
    // obtain a mutable pointer to each instruction to be merged.
    // This is expected to be a very short walk within the same block.
    SmallVector<MachineInstr *> ToErase;
    unsigned Found = 0;
    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
                                             End = MI->getParent()->rend();
         Found < WaitInstrs.size() && It != End; ++It) {
      MachineInstr *WaitMI = &*It;
      // Find next wait instruction.
      if (std::as_const(WaitMI) != WaitInstrs[Found])
        continue;
      Found++;
      // Fold each counter field as the minimum of the existing wait and the
      // one being built, so the merged wait is at least as strong as both.
      unsigned WaitMask = WaitMI->getOperand(0).getImm();
      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
      ToErase.push_back(WaitMI);
    }
    assert(Found == WaitInstrs.size());
    for (MachineInstr *WaitMI : ToErase)
      WaitMI->eraseFromParent();
  }

  // Add s_waitcnt_depctr after SGPR write.
  auto NextMI = std::next(MI->getIterator());
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(DepCtr);

  // SALU write may be s_getpc in a bundle.
  updateGetPCBundle(NewMI);

  return true;
}
3508
3509static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3510 const SIInstrInfo &TII) {
3511 MachineBasicBlock &EntryMBB = MF->front();
3512 if (EntryMBB.begin() != EntryMBB.end()) {
3513 auto &EntryMI = *EntryMBB.begin();
3514 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3515 EntryMI.getOperand(0).getImm() >= Priority)
3516 return false;
3517 }
3518
3519 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3520 .addImm(Priority);
3521 return true;
3522}
3523
// Ensures the required S_SETPRIO bracketing around export instructions on
// subtargets with hasRequiredExportPriority(): raises priority at entry for
// shaders that may export, and after a run of exports lowers priority, waits
// for the exports, then restores it.  Returns true if the IR was changed.
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  // NOTE(review): the case labels for the export-free calling conventions
  // are missing here (extraction artifact); verify against the original
  // source before relying on this switch.
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure shader with calls raises priority at entry.
    // This ensures correct priority if exports exist in callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise minimum priority unless in workaround.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    // A priority-0 setprio directly after an export is part of the
    // workaround sequence emitted below; leave it alone.
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    // Remaining handling applies only to export instructions.
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Check entry priority at each export (as there will only be a few).
  // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
  bool Changed = false;
  // NOTE(review): a guarding condition line (presumably checking the calling
  // convention) is missing here — extraction artifact; verify upstream.
  Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only need WA at end of sequence of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // Assume appropriate S_SETPRIO after export means WA already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower priority.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }

  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
        .addImm(NormalPriority);
  }

  return true;
}
3618
// For s_getreg of certain hardware registers, inserts an s_waitcnt_depctr 0
// (wait for all dependency counters) before the read.  Returns true if the
// wait was inserted.
bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
  if (!isSGetReg(MI->getOpcode()))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  switch (getHWReg(TII, *MI)) {
  default:
    return false;
  // NOTE(review): the case labels naming the affected hardware registers are
  // missing here (extraction artifact); verify against the original source.
    break;
  }

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0);
  return true;
}
3639
3640bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3641 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3642 return false;
3643
3644 const SIInstrInfo *TII = ST.getInstrInfo();
3645 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3646 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3647 .addImm(0xFFE3);
3648 BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
3649 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3650 .addImm(0xFFE3);
3651
3652 return true;
3653}
3654
// Detects a forwarding hazard between a write of the SGPRs backing the flat
// scratch base (SGPR102/SGPR103) and an instruction that reads
// SRC_FLAT_SCRATCH_BASE_{LO,HI}; mitigates it with an s_waitcnt_depctr.
// Returns true if a wait was inserted.
bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  // No reason to check this in pre-RA scheduling, SGPRs have to be allocated
  // for hazard to trigger.
  if (!IsHazardRecognizerMode)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
  const int FlatScrBaseWaitStates = 10;

  bool ReadsFlatScrLo =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
  bool ReadsFlatScrHi =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
  if (isSGetReg(MI->getOpcode())) {
    // s_getreg of the corresponding hardware registers also counts as a read.
    switch (getHWReg(TII, *MI)) {
    default:
      break;
    // NOTE(review): the case labels for this and the following arm are
    // missing (extraction artifact); verify against the original source.
      ReadsFlatScrLo = true;
      break;
      ReadsFlatScrHi = true;
      break;
    }
  }

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // Returns true when Reg was modified within the last FlatScrBaseWaitStates
  // SGPR writes (scanning backwards from MI).
  auto IsRegDefHazard = [&](Register Reg) -> bool {
    DenseSet<const MachineBasicBlock *> Visited;
    auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
      return MI.modifiesRegister(Reg, TRI);
    };

    // This literally abuses the idea of waitstates. Instead of waitstates it
    // returns 1 for SGPR written and 0 otherwise.
    auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
      if (!TII->isSALU(MI) && !TII->isVALU(MI))
        return 0;
      for (const MachineOperand &MO : MI.all_defs()) {
        if (TRI->isSGPRReg(MRI, MO.getReg()))
          return 1;
      }
      return 0;
    };

    auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
      if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned Wait = MI.getOperand(0).getImm();
        // NOTE(review): the condition examining 'Wait' (presumably decoding
        // the sa_sdst/va_sdst fields) is missing here — extraction artifact.
        return true;
      }
      return SgprWrites >= FlatScrBaseWaitStates;
    };

    return ::getWaitStatesSince(
        IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
        0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
  };

  // No hazard if the respective base half is not read, its SGPR is constant,
  // or no recent write to it was found.
  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
       !IsRegDefHazard(AMDGPU::SGPR102)) &&
      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
       !IsRegDefHazard(AMDGPU::SGPR103)))
    return false;

  // NOTE(review): the .addImm(...) operand lines of this BuildMI are missing
  // (extraction artifact); verify the encoded wait value upstream.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  return true;
}
3730
3731bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3732 if (!isSSetReg(MI->getOpcode()) ||
3733 MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
3734 return false;
3735
3736 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3737 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3738 return true;
3739}
unsigned const MachineRegisterInfo * MRI
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel fowarding issue Dst .
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static bool IsWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, unsigned Latency, unsigned Category)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static llvm::Error parse(DataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
Definition LineTable.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
BitVector & set()
Definition BitVector.h:370
A debug info location.
Definition DebugLoc.h:124
std::pair< iterator, bool > insert_as(std::pair< KeyT, ValueT > &&KV, const LookupKeyT &Val)
Alternate version of insert() which allows a different, and possibly less expensive,...
Definition DenseMap.h:274
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be schedule.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
unsigned PreEmitNoopsCommon(MachineInstr *)
bool ShouldPreferAnother(SUnit *SU) override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
GCNHazardRecognizer(const MachineFunction &MF)
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
Instructions::const_reverse_iterator const_reverse_instr_iterator
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isMAI(const MCInstrDesc &Desc)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:101
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:149
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:337
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:472
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
@ Entry
Definition COFF.h:862
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
initializer< Ty > init(const Ty &Val)
constexpr double e
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Wait
Definition Threading.h:60
Op::Description Desc
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
DWARFExpression::Operation Op
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
An information struct used to provide DenseMap with the various necessary components for a given valu...