LLVM 23.0.0git
GCNHazardRecognizer.cpp
Go to the documentation of this file.
1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
17#include "llvm/ADT/Statistic.h"
22#include "llvm/Support/Debug.h"
24
25using namespace llvm;
26
27#define DEBUG_TYPE "gcn-hazard-recognizer"
28
// Debug statistics: how often WMMA hazard V_NOPs were hoisted out of loops,
// and how often hoisting had to bail (see the amdgpu-wmma-vnop-hoisting
// option below).
STATISTIC(NumWMMANopsHoisted,
          "Number of WMMA hazard V_NOPs hoisted from loops");
STATISTIC(NumWMMAHoistingBailed,
          "Number of WMMA hazards where V_NOP hoisting was not possible");
33
34namespace {
35
36struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
37 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
38
39 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
40 if (Arg.getAsInteger(0, Value))
41 return O.error("'" + Arg + "' value invalid for uint argument!");
42
43 if (Value > 100)
44 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
45
46 return false;
47 }
48};
49
50} // end anonymous namespace
51
// Percentage of inter-MFMA latency to fill with s_nops.
// NOTE(review): the cl::opt declarator line was lost in extraction —
// presumably "static cl::opt<unsigned, false, MFMAPaddingRatioParser>";
// confirm against upstream.
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

// This is intended for debugging purposes only.
// NOTE(review): declarator line lost in extraction (presumably
// "static cl::opt<unsigned>").
    NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
               cl::desc("Insert a s_nop x before every instruction"));

// Toggle for hoisting WMMA hazard V_NOPs out of loops (defaults to on).
// NOTE(review): the cl::opt declarator and option-name line was lost in
// extraction.
    "amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden,
    cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
65
66//===----------------------------------------------------------------------===//
67// Hazard Recognizer Implementation
68//===----------------------------------------------------------------------===//
69
// Forward declaration used by the constructor below.
// NOTE(review): the first signature line was lost in extraction — presumably
// "static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &,".
                                                 const GCNSubtarget &ST);

// Constructor: caches subtarget/instr/register info and sizes the clause
// def/use bitvectors by register-unit count.
// NOTE(review): the constructor's signature line was lost in extraction.
                                         MachineLoopInfo *MLI)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  // Functions that use AGPRs need a deeper lookahead window (19 vs 5).
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}
82
// Reset(): drop all tracked emitted instructions.
// NOTE(review): the Reset() signature line was lost in extraction.
  EmittedInstrs.clear();
}

// NOTE(review): the EmitInstruction overload signature lines were lost in
// extraction; only this body (recording the instruction issued in the
// current cycle) survives.

  CurrCycleInstr = MI;
}
94
95static bool isDivFMas(unsigned Opcode) {
96 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
97}
98
99static bool isSGetReg(unsigned Opcode) {
100 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
101}
102
103static bool isSSetReg(unsigned Opcode) {
104 switch (Opcode) {
105 case AMDGPU::S_SETREG_B32:
106 case AMDGPU::S_SETREG_B32_mode:
107 case AMDGPU::S_SETREG_IMM32_B32:
108 case AMDGPU::S_SETREG_IMM32_B32_mode:
109 return true;
110 }
111 return false;
112}
113
114static bool isRWLane(unsigned Opcode) {
115 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
116}
117
118static bool isRFE(unsigned Opcode) {
119 return Opcode == AMDGPU::S_RFE_B64;
120}
121
122static bool isSMovRel(unsigned Opcode) {
123 switch (Opcode) {
124 case AMDGPU::S_MOVRELS_B32:
125 case AMDGPU::S_MOVRELS_B64:
126 case AMDGPU::S_MOVRELD_B32:
127 case AMDGPU::S_MOVRELD_B64:
128 return true;
129 default:
130 return false;
131 }
132}
133
// True when MI is s_sendmsg/s_sendmsghalt/s_ttracedata, an always-GDS
// instruction, or a DS instruction whose gds operand bit is set.
// NOTE(review): the signature's first line was lost in extraction —
// presumably "static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,".
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    // Any other DS instruction is GDS iff its gds operand is a non-zero
    // immediate.
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}
159
160static bool isPermlane(const MachineInstr &MI) {
161 unsigned Opcode = MI.getOpcode();
162 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
163 Opcode == AMDGPU::V_PERMLANE64_B32 ||
164 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
165 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
168 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
170 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
171 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
173 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
176}
177
// True when MI is an LDS DMA operation (a VALU combined with a memory class).
// NOTE(review): the second operand of the && was lost in extraction; the
// visible half only requires isVALU — confirm the full predicate upstream.
static bool isLdsDma(const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) &&
}
182
183static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
184 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
185 AMDGPU::OpName::simm16);
186 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
187}
188
// ScheduleHazardRecognizer query: classify the hazard state of SU's
// instruction for the current cycle without mutating any state.
// NOTE(review): the getHazardType signature line was lost in extraction.
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  // Bundles are examined member-by-member elsewhere (see processBundle).
  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  // Hazards which cannot be mitigated with S_NOPs.
  if (!IsHazardRecognizerMode) {
    if (checkWMMACoexecutionHazards(MI) > 0)
      return Hazard;
  }

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  // NOTE(review): the guard condition lines for this MAI/VALU check were
  // lost in extraction.
      checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  // Instructions that read M0 shortly after it is written: v_interp, scalar
  // relative moves, ADDTID DS ops, send-msg/trace/GDS, LDS DMA, lds_direct.
  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  // NOTE(review): the guard condition line for this MAI load/store check was
  // lost in extraction.
      checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}
269
// Emit Quantity wait states as s_nop instructions before MI; each s_nop
// encodes up to 8 wait states (the immediate is count - 1).
// NOTE(review): the signature's first line was lost in extraction —
// presumably "static void insertNoopsInBundle(MachineInstr *MI,
// const SIInstrInfo &TII,".
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}
279
280unsigned
281GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
282 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
283 assert(TSchedModel.getWriteProcResBegin(SC) !=
284 TSchedModel.getWriteProcResEnd(SC));
285 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
286}
287
// Walk the members of the bundle headed by CurrCycleInstr, checking each for
// hazards, inserting noops in hazard-recognizer mode, and maintaining the
// EmittedInstrs lookahead window as each member "issues".
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}
313
// Hazard-recognizer-mode driver: insert the required noops before MI (inside
// a bundle via insertNoopsInBundle, otherwise via TII.insertNoops) and then
// advance the cycle state.
void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  // NOTE(review): one line was lost in extraction between the insertNoops
  // call and AdvanceCycle() — presumably "CurrCycleInstr = MI;". Confirm
  // against upstream.
  AdvanceCycle();
}
327
// Compute the number of noops to emit before MI, fixing hazards as a side
// effect, and applying the debugging-only NopPadding floor.
// NOTE(review): the PreEmitNoops(MachineInstr *) signature line was lost in
// extraction.
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return std::max(W, NopPadding.getValue());
}
336
// Compute the number of wait states MI requires; shared by the scheduler
// query path and the hazard-fixing path.
// NOTE(review): the PreEmitNoopsCommon signature line and several one-line
// guard conditions below were lost in extraction; each lost guard is marked.

  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  // NOTE(review): lost guard line (SMRD check).
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  // NOTE(review): lost guard line (VMEM check).
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  // NOTE(review): lost guard line (VALU check).
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  // NOTE(review): lost guard line (DPP check).
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  // NOTE(review): lost guard condition lines for the MAI/VALU check.
      checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  // Instructions that read M0 shortly after it is written (see the matching
  // condition in getHazardType).
  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  // NOTE(review): lost guard line (MAI check).
    return std::max(WaitStates, checkMAIHazards(MI));

  // NOTE(review): lost guard line (MAI load/store check).
    return std::max(WaitStates, checkMAILdStHazards(MI));

  if (ST.hasGFX950Insts() && isPermlane(*MI))
    return std::max(WaitStates, checkPermlaneHazards(MI));

  return WaitStates;
}
411
// A scheduler noop occupies one slot of the lookahead window.
// NOTE(review): the EmitNoop() signature line was lost in extraction.
  EmittedInstrs.push_front(nullptr);
}
415
// Advance the cycle state: push CurrCycleInstr (and one nullptr per extra
// wait state it consumes) onto the lookahead window, truncated to
// getMaxLookAhead() entries.
// NOTE(review): the AdvanceCycle() signature line was lost in extraction.
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  // Bundles are expanded and tracked member-by-member.
  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  // Instructions that consume no wait states are not tracked at all.
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}
453
// Bottom-up scheduling is not supported by this recognizer.
// NOTE(review): the RecedeCycle() signature line was lost in extraction.
  assert(!IsHazardRecognizerMode &&
         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
}
458
459//===----------------------------------------------------------------------===//
460// Helper Functions
461//===----------------------------------------------------------------------===//
462
464
// Search for a hazard in a block and its predecessors.
// Walks InitialMBB backwards from the given start point, calling IsHazard on
// every instruction (HazardFound => true, HazardExpired => stop this path)
// and UpdateState otherwise; unexpired paths enqueue all predecessors with
// the current state, deduplicated via a map of cached states.
// NOTE(review): several declaration lines inside this template (remaining
// parameters, the StateMapKey::States member, the empty/tombstone key
// bodies, and the StateMap/States/Worklist/I locals) were lost in
// extraction; lost spots are visible as truncated statements below.
template <typename StateT>
static bool
hasHazard(StateT InitialState,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *InitialMBB,
  // Key into the cached-states vector, so map lookups avoid copying StateT.
  struct StateMapKey {
    unsigned Idx;
    static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
      return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
    }
  };
  struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
    static inline StateMapKey getEmptyKey() {
      return {static_cast<SmallVectorImpl<StateT> *>(
    }
    static inline StateMapKey getTombstoneKey() {
      return {static_cast<SmallVectorImpl<StateT> *>(
    }
    static unsigned getHashValue(const StateMapKey &Key) {
      return StateT::getHashValue((*Key.States)[Key.Idx]);
    }
    static unsigned getHashValue(const StateT &State) {
      return StateT::getHashValue(State);
    }
    // Sentinel keys must be compared structurally, not through StateT.
    static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
      const auto EKey = getEmptyKey();
      const auto TKey = getTombstoneKey();
      if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
          StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
        return StateMapKey::isEqual(LHS, RHS);
      return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
    }
    static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
      if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
          StateMapKey::isEqual(RHS, getTombstoneKey()))
        return false;
      return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
    }
  };

  const MachineBasicBlock *MBB = InitialMBB;
  StateT State = InitialState;

  unsigned WorkIdx = 0;
  for (;;) {
    bool Expired = false;
    for (auto E = MBB->instr_rend(); I != E; ++I) {
      // No need to look at parent BUNDLE instructions.
      if (I->isBundle())
        continue;

      auto Result = IsHazard(State, *I);
      if (Result == HazardFound)
        return true;
      if (Result == HazardExpired) {
        Expired = true;
        break;
      }

      // Inline asm and meta instructions do not contribute to the state.
      if (I->isInlineAsm() || I->isMetaInstruction())
        continue;

      UpdateState(State, *I);
    }

    if (!Expired) {
      // Cache the state; only enqueue predecessors for states not yet seen.
      unsigned StateIdx = States.size();
      StateMapKey Key = {&States, StateIdx};
      auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
      if (Insertion.second) {
        States.emplace_back(State);
      } else {
        StateIdx = Insertion.first->second;
      }
      for (MachineBasicBlock *Pred : MBB->predecessors())
        Worklist.insert(std::pair(Pred, StateIdx));
    }

    if (WorkIdx == Worklist.size())
      break;

    // Resume with the next queued (block, state) pair.
    unsigned StateIdx;
    std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
    State = States[StateIdx];
    I = MBB->instr_rbegin();
  }

  return false;
}
567
// Returns a minimum wait states since \p I walking all predecessors.
// Only scans until \p IsExpired does not return true.
// Can only be run in a hazard recognizer mode.
// NOTE(review): the parameter lines declaring IsHazard, the start iterator
// I, the Visited set and GetNumWaitStates were lost in extraction.
static int
                  const MachineBasicBlock *MBB,
                  int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    // Inline asm contributes no wait states.
    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    // Beyond the expiry horizon no hazard can matter on this path.
    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  // Recurse into unvisited predecessors and keep the minimum distance.
  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}
609
// Entry-point overload: start the backwards scan at the instruction just
// before MI in its block.
// NOTE(review): the parameter lines (IsHazard, IsExpired, the Visited set,
// GetNumWaitStates) were lost in extraction.
static int
                  const MachineInstr *MI,
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()), 0, IsExpired,
                            Visited, GetNumWaitStates);
}
621
622int GCNHazardRecognizer::getWaitStatesSince(
623 IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) const {
624 if (IsHazardRecognizerMode) {
625 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
626 return WaitStates >= Limit;
627 };
628 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
629 GetNumWaitStates);
630 }
631
632 int WaitStates = 0;
633 for (MachineInstr *MI : EmittedInstrs) {
634 if (MI) {
635 if (IsHazard(*MI))
636 return WaitStates;
637
638 if (MI->isInlineAsm())
639 continue;
640 }
641 WaitStates += MI ? GetNumWaitStates(*MI) : 1;
642
643 if (WaitStates >= Limit)
644 break;
645 }
646 return std::numeric_limits<int>::max();
647}
648
// Convenience overload using the default SIInstrInfo::getNumWaitStates
// metric.
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
                                            int Limit) const {
  return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
}
653
654int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
655 IsHazardFn IsHazardDef,
656 int Limit) const {
657 const SIRegisterInfo *TRI = ST.getRegisterInfo();
658
659 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
660 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
661 };
662
663 return getWaitStatesSince(IsHazardFn, Limit);
664}
665
666int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
667 int Limit) const {
668 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
669 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
670 };
671
672 return getWaitStatesSince(IsHazardFn, Limit);
673}
674
675//===----------------------------------------------------------------------===//
676// No-op Hazard Detection
677//===----------------------------------------------------------------------===//
678
679static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
680 MCRegister Reg) {
681 for (MCRegUnit Unit : TRI.regunits(Reg))
682 BV.set(static_cast<unsigned>(Unit));
683}
684
// Distribute the register operands of Ops into DefSet (definitions) and
// UseSet (uses), recorded as register units.
// NOTE(review): the operand-range parameter line (declaring Ops) was lost in
// extraction.
static void addRegsToSet(const SIRegisterInfo &TRI,
                         BitVector &DefSet, BitVector &UseSet) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
  }
}
693
// Record MI's operands into the current soft-clause def/use unit sets.
void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) const {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}
697
// NOTE(review): the signature lines of these two predicates were lost in
// extraction; the bodies indicate "does this instruction break an SMEM /
// VMEM soft clause" — any non-SMRD (resp. non-VMEM) instruction does.
  return !SIInstrInfo::isSMRD(*MI);
}

  return !SIInstrInfo::isVMEM(*MI);
}
705
// Returns 1 when adding MEM to the in-flight soft clause would create a
// hazard (def/use overlap within the clause, or a store), otherwise 0.
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) const {
  // SMEM soft clause are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    // NOTE(review): the clause-breaking guard line (presumably selecting
    // between the SMEM and VMEM break predicates via IsSMRD) was lost in
    // extraction.
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
753
754int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) const {
755 int WaitStatesNeeded = 0;
756
757 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
758
759 // This SMRD hazard only affects SI.
760 if (!ST.hasSMRDReadVALUDefHazard())
761 return WaitStatesNeeded;
762
763 // A read of an SGPR by SMRD instruction requires 4 wait states when the
764 // SGPR was written by a VALU instruction.
765 int SmrdSgprWaitStates = 4;
766 auto IsHazardDefFn = [this](const MachineInstr &MI) {
767 return TII.isVALU(MI);
768 };
769 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
770 return TII.isSALU(MI);
771 };
772
773 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
774
775 for (const MachineOperand &Use : SMRD->uses()) {
776 if (!Use.isReg())
777 continue;
778 int WaitStatesNeededForUse =
779 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
780 SmrdSgprWaitStates);
781 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
782
783 // This fixes what appears to be undocumented hardware behavior in SI where
784 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
785 // needs some number of nops in between. We don't know how many we need, but
786 // let's use 4. This wasn't discovered before probably because the only
787 // case when this happens is when we expand a 64-bit pointer into a full
788 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
789 // probably never encountered in the closed-source land.
790 if (IsBufferSMRD) {
791 int WaitStatesNeededForUse =
792 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
793 IsBufferHazardDefFn,
794 SmrdSgprWaitStates);
795 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
796 }
797 }
798
799 return WaitStatesNeeded;
800}
801
802int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) const {
803 if (!ST.hasVMEMReadSGPRVALUDefHazard())
804 return 0;
805
806 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
807
808 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
809 // SGPR was written by a VALU Instruction.
810 const int VmemSgprWaitStates = 5;
811 auto IsHazardDefFn = [this](const MachineInstr &MI) {
812 return TII.isVALU(MI);
813 };
814 for (const MachineOperand &Use : VMEM->uses()) {
815 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
816 continue;
817
818 int WaitStatesNeededForUse =
819 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
820 VmemSgprWaitStates);
821 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
822 }
823 return WaitStatesNeeded;
824}
825
826int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) const {
827 const SIRegisterInfo *TRI = ST.getRegisterInfo();
828 const SIInstrInfo *TII = ST.getInstrInfo();
829
830 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
831 int DppVgprWaitStates = 2;
832 int DppExecWaitStates = 5;
833 int WaitStatesNeeded = 0;
834 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
835 return TII->isVALU(MI);
836 };
837
838 for (const MachineOperand &Use : DPP->uses()) {
839 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
840 continue;
841 int WaitStatesNeededForUse =
842 DppVgprWaitStates - getWaitStatesSinceDef(
843 Use.getReg(),
844 [](const MachineInstr &) { return true; },
845 DppVgprWaitStates);
846 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
847 }
848
849 WaitStatesNeeded = std::max(
850 WaitStatesNeeded,
851 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
852 DppExecWaitStates));
853
854 return WaitStatesNeeded;
855}
856
857int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) const {
858 const SIInstrInfo *TII = ST.getInstrInfo();
859
860 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
861 // instruction.
862 const int DivFMasWaitStates = 4;
863 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
864 return TII->isVALU(MI);
865 };
866 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
867 DivFMasWaitStates);
868
869 return DivFMasWaitStates - WaitStatesNeeded;
870}
871
872int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) const {
873 const SIInstrInfo *TII = ST.getInstrInfo();
874 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
875
876 const int GetRegWaitStates = 2;
877 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
878 return GetRegHWReg == getHWReg(TII, MI);
879 };
880 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
881
882 return GetRegWaitStates - WaitStatesNeeded;
883}
884
885int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) const {
886 const SIInstrInfo *TII = ST.getInstrInfo();
887 unsigned HWReg = getHWReg(TII, *SetRegInstr);
888
889 const int SetRegWaitStates = ST.getSetRegWaitStates();
890 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
891 return HWReg == getHWReg(TII, MI);
892 };
893 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
894 return SetRegWaitStates - WaitStatesNeeded;
895}
896
897int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
898 if (!MI.mayStore())
899 return -1;
900
901 const SIInstrInfo *TII = ST.getInstrInfo();
902 unsigned Opcode = MI.getOpcode();
903 const MCInstrDesc &Desc = MI.getDesc();
904
905 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
906 int VDataRCID = -1;
907 if (VDataIdx != -1)
908 VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);
909
910 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
911 // There is no hazard if the instruction does not use vector regs
912 // (like wbinvl1)
913 if (VDataIdx == -1)
914 return -1;
915 // For MUBUF/MTBUF instructions this hazard only exists if the
916 // instruction is not using a register in the soffset field.
917 const MachineOperand *SOffset =
918 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
919 // If we have no soffset operand, then assume this field has been
920 // hardcoded to zero.
921 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
922 (!SOffset || !SOffset->isReg()))
923 return VDataIdx;
924 }
925
926 // MIMG instructions create a hazard if they don't use a 256-bit T# and
927 // the store size is greater than 8 bytes and they have more than two bits
928 // of their dmask set.
929 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
930 if (TII->isMIMG(MI)) {
931 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
932 assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
933 Desc.operands()[SRsrcIdx])) == 256);
934 (void)SRsrcIdx;
935 }
936
937 if (TII->isFLAT(MI)) {
938 // There is no hazard if the instruction does not use vector regs
939 if (VDataIdx == -1)
940 return -1;
941
942 if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
943 return VDataIdx;
944 }
945
946 return -1;
947}
948
int GCNHazardRecognizer::checkVALUHazardsHelper(
    const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // GFX940 requires one extra wait state for this hazard.
  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  // Only vector-register defs participate in this hazard.
  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  // Hazard: a recent store whose vdata operand overlaps the def.
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}
973
/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
/// pack the computed value into correct bit position of the dest register. This
/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
/// dst_sel that is not aligned to the register. This function analyzes the \p
/// MI and \returns an operand with dst forwarding issue, or nullptr if
/// none exists.
static const MachineOperand *
// NOTE(review): the parameter line (presumably "getDstSelForwardingOperand(
// const MachineInstr &MI, const GCNSubtarget &ST) {") and its initial
// early-out guard were lost in extraction; the dangling "return nullptr"
// below is the tail of that guard.
  return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three different types of instructions
  // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
  // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
  // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
  // op_sel[3:2]
  // != 0
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
        return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
    // Type 2: VOP3 which write the hi bits
    // NOTE(review): the modifier-mask constant on the next line was lost in
    // extraction.
    if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
    // NOTE(review): the modifier-mask constant closing this condition was
    // lost in extraction.
    if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
        (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  // Special case: nop is required for all the opsel values for fp4 sr variant
  // cvt scale instructions
  if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  return nullptr;
}
1023
/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
/// forwarding issue \p Dst . We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
// NOTE(review): the first signature line (declaring the consuming VALU
// instruction parameter) was lost in extraction.
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
  // and we must account for that hazard.
  // We also must account for WAW hazards. In particular, WAW with dest
  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
  // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
  // check for ECC. Without accounting for this hazard, the ECC will be
  // wrong.
  // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
  // complete zeroesHigh16BitsOfDest)
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
      return true;
    }
  }
  return false;
}
1047
// Compute the number of wait states required before \p VALU to cover the
// various VALU-producer hazards (TRANS forwarding, dst-sel forwarding,
// VALU-writes-SGPR reads, and the 12-dword store-data hazard).
// NOTE(review): doc-site extraction — the isTRANS guard inside IsTransDefFn
// (doc 1055) and the declaration of UseReg (doc 1110) are missing here.
1048int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) const {
1049 int WaitStatesNeeded = 0;
1050
1051 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
1052 const int TransDefWaitstates = 1;
1053
1054 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
1056 return false;
1057 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1058 const SIInstrInfo *TII = ST.getInstrInfo();
1059 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
1060
1061 for (const MachineOperand &Use : VALU->explicit_uses()) {
1062 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
1063 return true;
1064 }
1065
1066 return false;
1067 };
1068
1069 int WaitStatesNeededForDef =
1070 TransDefWaitstates -
1071 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1072 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1073 }
1074
1075 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1076 const int Shift16DefWaitstates = 1;
1077
1078 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1079 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1080 const MachineOperand *ForwardedDst =
1081 getDstSelForwardingOperand(ProducerMI, ST);
1082 if (ForwardedDst) {
1083 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1084 }
1085
1086 if (ProducerMI.isInlineAsm()) {
1087 // Assume inline asm has dst forwarding hazard
1088 for (auto &Def : ProducerMI.all_defs()) {
1089 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1090 return true;
1091 }
1092 }
1093
1094 return false;
1095 };
1096
1097 int WaitStatesNeededForDef =
1098 Shift16DefWaitstates -
1099 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1100 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1101 }
1102
1103 if (ST.hasVDecCoExecHazard()) {
1104 const int VALUWriteSGPRVALUReadWaitstates = 2;
1105 const int VALUWriteEXECRWLane = 4;
1106 const int VALUWriteVGPRReadlaneRead = 1;
1107
1108 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1109 const MachineRegisterInfo &MRI = MF.getRegInfo();
// UseReg (declared on the missing doc line 1110) is captured by reference so
// the same lambda can be reused for each register checked below.
1111 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1112 if (!SIInstrInfo::isVALU(MI))
1113 return false;
1114 return MI.modifiesRegister(UseReg, TRI);
1115 };
1116
1117 for (const MachineOperand &Use : VALU->explicit_uses()) {
1118 if (!Use.isReg())
1119 continue;
1120
1121 UseReg = Use.getReg();
1122 if (TRI->isSGPRReg(MRI, UseReg)) {
1123 int WaitStatesNeededForDef =
1124 VALUWriteSGPRVALUReadWaitstates -
1125 getWaitStatesSince(IsVALUDefSGPRFn,
1126 VALUWriteSGPRVALUReadWaitstates);
1127 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1128 }
1129 }
1130
1131 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1132 UseReg = AMDGPU::VCC;
1133 int WaitStatesNeededForDef =
1134 VALUWriteSGPRVALUReadWaitstates -
1135 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1136 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1137 }
1138
1139 switch (VALU->getOpcode()) {
1140 case AMDGPU::V_READLANE_B32:
1141 case AMDGPU::V_READFIRSTLANE_B32: {
1142 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1143 UseReg = Src->getReg();
1144 int WaitStatesNeededForDef =
1145 VALUWriteVGPRReadlaneRead -
1146 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1147 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1148 }
// readlane also reads EXEC, so fall through to the EXEC check below.
1149 [[fallthrough]];
1150 case AMDGPU::V_WRITELANE_B32: {
1151 UseReg = AMDGPU::EXEC;
1152 int WaitStatesNeededForDef =
1153 VALUWriteEXECRWLane -
1154 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1155 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1156 break;
1157 }
1158 default:
1159 break;
1160 }
1161 }
1162
1163 // This checks for the hazard where VMEM instructions that store more than
1164 // 8 bytes can have their store data overwritten by the next instruction.
1165 if (!ST.has12DWordStoreHazard())
1166 return WaitStatesNeeded;
1167
1168 const MachineRegisterInfo &MRI = MF.getRegInfo();
1169
1170 for (const MachineOperand &Def : VALU->defs()) {
1171 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1172 }
1173
1174 return WaitStatesNeeded;
1175}
1176
1177int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) const {
1178 // This checks for hazards associated with inline asm statements.
1179 // Since inline asms can contain just about anything, we use this
1180 // to call/leverage other check*Hazard routines. Note that
1181 // this function doesn't attempt to address all possible inline asm
1182 // hazards (good luck), but is a collection of what has been
1183 // problematic thus far.
1184
1185 // see checkVALUHazards()
1186 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1187 !ST.hasCvtScaleForwardingHazard())
1188 return 0;
1189
1190 const MachineRegisterInfo &MRI = MF.getRegInfo();
1191 int WaitStatesNeeded = 0;
1192
// NOTE(review): the operand-range expression of this loop (doc line 1194) is
// missing from this extracted view; the loop visits IA's operands and filters
// to register defs below.
1193 for (const MachineOperand &Op :
1195 if (Op.isReg() && Op.isDef()) {
1196 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1197 continue;
1198
1199 if (ST.has12DWordStoreHazard()) {
1200 WaitStatesNeeded =
1201 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1202 }
1203 }
1204 }
1205
1206 if (ST.hasDstSelForwardingHazard()) {
1207 const int Shift16DefWaitstates = 1;
1208
1209 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1210 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1211 // Assume inline asm reads the dst
1212 if (Dst)
1213 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1214 IA->readsRegister(Dst->getReg(), &TRI);
1215
1216 if (ProducerMI.isInlineAsm()) {
1217 // If MI is inline asm, assume it has dst forwarding hazard
1218 for (auto &Def : ProducerMI.all_defs()) {
1219 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1220 IA->readsRegister(Def.getReg(), &TRI)) {
1221 return true;
1222 }
1223 }
1224 }
1225
1226 return false;
1227 };
1228
1229 int WaitStatesNeededForDef =
1230 Shift16DefWaitstates -
1231 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1232 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1233 }
1234
1235 return WaitStatesNeeded;
1236}
1237
1238int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) const {
1239 const SIInstrInfo *TII = ST.getInstrInfo();
1240 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1241 const MachineRegisterInfo &MRI = MF.getRegInfo();
1242
1243 const MachineOperand *LaneSelectOp =
1244 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1245
1246 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1247 return 0;
1248
1249 Register LaneSelectReg = LaneSelectOp->getReg();
1250 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1251
1252 const int RWLaneWaitStates = 4;
1253 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1254 RWLaneWaitStates);
1255 return RWLaneWaitStates - WaitStatesSince;
1256}
1257
1258int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) const {
1259 if (!ST.hasRFEHazards())
1260 return 0;
1261
1262 const SIInstrInfo *TII = ST.getInstrInfo();
1263
1264 const int RFEWaitStates = 1;
1265
1266 auto IsHazardFn = [TII](const MachineInstr &MI) {
1267 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1268 };
1269 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1270 return RFEWaitStates - WaitStatesNeeded;
1271}
1272
1273int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) const {
1274 const SIInstrInfo *TII = ST.getInstrInfo();
1275 const int ReadM0WaitStates = 1;
1276 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1277 return ReadM0WaitStates -
1278 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1279}
1280
// Emit \p WaitStatesNeeded V_NOPs at the insertion point. When hoisting
// (IsHoisting) an empty DebugLoc is used, since the nops do not correspond
// to the source location at the insertion point.
// NOTE(review): the second parameter line (doc 1282, the InsertPt iterator)
// is missing from this extracted view.
1281void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
1283 int WaitStatesNeeded, bool IsHoisting) {
1284 const DebugLoc &DL = IsHoisting ? DebugLoc() : InsertPt->getDebugLoc();
1285 for (int I = 0; I < WaitStatesNeeded; ++I)
1286 BuildMI(MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
1287}
1288
// Driver for hazards that are handled by rewriting or inserting instructions
// rather than by returning a wait-state count. Each fix* routine tests its
// own subtarget predicate internally; the explicit guards below exist only
// where the fix has no cheap internal predicate. The call order matters
// (e.g. fixWMMAHazards falls through to fixWMMACoexecutionHazards when
// co-execution is enabled).
1289void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1290 fixVMEMtoScalarWriteHazards(MI);
1291 fixVcmpxPermlaneHazards(MI);
1292 fixSMEMtoVectorWriteHazards(MI);
1293 fixVcmpxExecWARHazard(MI);
1294 fixLdsBranchVmemWARHazard(MI);
1295 if (ST.hasLdsDirect()) {
1296 fixLdsDirectVALUHazard(MI);
1297 fixLdsDirectVMEMHazard(MI);
1298 }
1299 fixVALUPartialForwardingHazard(MI);
1300 fixVALUTransUseHazard(MI);
1301 fixVALUTransCoexecutionHazards(MI);
1302 fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1303 fixWMMACoexecutionHazards(MI);
1304 fixShift64HighRegBug(MI);
1305 fixVALUMaskWriteHazard(MI);
1306 fixRequiredExportPriority(MI);
1307 if (ST.requiresWaitIdleBeforeGetReg())
1308 fixGetRegWaitIdle(MI);
1309 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1310 fixDsAtomicAsyncBarrierArriveB64(MI);
1311 if (ST.hasScratchBaseForwardingHazard())
1312 fixScratchBaseForwardingHazard(MI);
1313 if (ST.setRegModeNeedsVNOPs())
1314 fixSetRegMode(MI);
1315}
1316
// Returns true when \p MI is a compare that writes EXEC: either a VOPC
// instruction, or a compare encoded as VOP3/SDWA, which modifies EXEC.
// NOTE(review): the return-type/function-name line (doc 1317) is missing
// from this extracted view.
1318 const MachineInstr &MI) {
1319 return (TII.isVOPC(MI) ||
1320 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1321 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1322}
1323
// Fix the v_cmpx -> v_permlane* hazard by inserting a VALU (a self-move of
// src0) between them; a plain V_NOP is insufficient because SQ discards it.
// NOTE(review): the operand lines of the BuildMI (doc 1352-1353, the addReg
// of dst/src with the undef flag) are missing from this extracted view.
1324bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1325 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1326 return false;
1327
1328 const SIInstrInfo *TII = ST.getInstrInfo();
1329 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1330 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1331 return isVCmpXWritesExec(*TII, *TRI, MI);
1332 };
1333
// Any VALU other than a V_NOP variant breaks the hazard window.
1334 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1335 unsigned Opc = MI.getOpcode();
1336 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1337 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1338 };
1339
1340 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1341 std::numeric_limits<int>::max())
1342 return false;
1343
1344 // V_NOP will be discarded by SQ.
1345 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1346 // which is always a VGPR and available.
1347 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1348 Register Reg = Src0->getReg();
1349 bool IsUndef = Src0->isUndef();
1350 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1351 TII->get(AMDGPU::V_MOV_B32_e32))
1354
1355 return true;
1356}
1357
// Fix the VMEM -> scalar-register-write hazard by inserting an
// S_WAITCNT_DEPCTR that drains vm_vsrc before the scalar write executes.
// NOTE(review): three lines are missing from this extracted view — the guard
// on MI's instruction class (doc 1363), the memory-instruction test at the
// top of IsHazardFn (doc 1372), and the immediate operand of the emitted
// S_WAITCNT_DEPCTR (doc 1400).
1358bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1359 if (!ST.hasVMEMtoScalarWriteHazard())
1360 return false;
1361 assert(!ST.hasExtendedWaitCounts());
1362
1364 return false;
1365
1366 if (MI->getNumDefs() == 0)
1367 return false;
1368
1369 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1370
// Hazard: an earlier memory instruction reads a register that MI defines.
1371 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1373 return false;
1374
1375 for (const MachineOperand &Def : MI->defs()) {
1376 const MachineOperand *Op =
1377 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1378 if (!Op)
1379 continue;
1380 return true;
1381 }
1382 return false;
1383 };
1384
// Any VALU, a zero s_waitcnt, or a depctr wait with vm_vsrc==0 clears it.
1385 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1386 return SIInstrInfo::isVALU(MI) ||
1387 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1388 !MI.getOperand(0).getImm()) ||
1389 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1390 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1391 };
1392
1393 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1394 std::numeric_limits<int>::max())
1395 return false;
1396
1397 const SIInstrInfo *TII = ST.getInstrInfo();
1398 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1399 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1401 return true;
1402}
1403
1404bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1405 if (!ST.hasSMEMtoVectorWriteHazard())
1406 return false;
1407 assert(!ST.hasExtendedWaitCounts());
1408
1409 if (!SIInstrInfo::isVALU(*MI))
1410 return false;
1411
1412 AMDGPU::OpName SDSTName;
1413 switch (MI->getOpcode()) {
1414 case AMDGPU::V_READLANE_B32:
1415 case AMDGPU::V_READFIRSTLANE_B32:
1416 SDSTName = AMDGPU::OpName::vdst;
1417 break;
1418 default:
1419 SDSTName = AMDGPU::OpName::sdst;
1420 break;
1421 }
1422
1423 const SIInstrInfo *TII = ST.getInstrInfo();
1424 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1425 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1426 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1427 if (!SDST) {
1428 for (const auto &MO : MI->implicit_operands()) {
1429 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1430 SDST = &MO;
1431 break;
1432 }
1433 }
1434 }
1435
1436 if (!SDST)
1437 return false;
1438
1439 const Register SDSTReg = SDST->getReg();
1440 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1441 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1442 };
1443
1444 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1445 if (TII->isSALU(MI)) {
1446 switch (MI.getOpcode()) {
1447 case AMDGPU::S_SETVSKIP:
1448 case AMDGPU::S_VERSION:
1449 case AMDGPU::S_WAITCNT_VSCNT:
1450 case AMDGPU::S_WAITCNT_VMCNT:
1451 case AMDGPU::S_WAITCNT_EXPCNT:
1452 // These instructions cannot not mitigate the hazard.
1453 return false;
1454 case AMDGPU::S_WAITCNT_LGKMCNT:
1455 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1456 return (MI.getOperand(1).getImm() == 0) &&
1457 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1458 case AMDGPU::S_WAITCNT: {
1459 const int64_t Imm = MI.getOperand(0).getImm();
1460 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1461 // DsCnt corresponds to LGKMCnt here.
1462 return Decoded.get(AMDGPU::DS_CNT) == 0;
1463 }
1464 default:
1465 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1466 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1467 "unexpected wait count instruction");
1468 // SOPP instructions cannot mitigate the hazard.
1469 if (TII->isSOPP(MI))
1470 return false;
1471 // At this point the SALU can be assumed to mitigate the hazard
1472 // because either:
1473 // (a) it is independent of the at risk SMEM (breaking chain),
1474 // or
1475 // (b) it is dependent on the SMEM, in which case an appropriate
1476 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1477 // SMEM instruction.
1478 return true;
1479 }
1480 }
1481 return false;
1482 };
1483
1484 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1485 std::numeric_limits<int>::max())
1486 return false;
1487
1488 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1489 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1490 .addImm(0);
1491 return true;
1492}
1493
// Fix the v_cmpx EXEC write-after-read hazard by inserting an
// S_WAITCNT_DEPCTR before the EXEC-writing VALU.
// NOTE(review): two lines are missing from this extracted view — the
// instruction-class test at the top of IsHazardFn (doc 1507) and the
// immediate operand of the emitted S_WAITCNT_DEPCTR (doc 1533).
1494bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1495 if (!ST.hasVcmpxExecWARHazard())
1496 return false;
1497 assert(!ST.hasExtendedWaitCounts());
1498
1499 if (!SIInstrInfo::isVALU(*MI))
1500 return false;
1501
1502 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1503 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1504 return false;
1505
1506 auto IsHazardFn = [TRI](const MachineInstr &I) {
1508 return false;
1509 return I.readsRegister(AMDGPU::EXEC, TRI);
1510 };
1511
1512 const SIInstrInfo *TII = ST.getInstrInfo();
// Expired if a VALU writes any SGPR (explicit sdst or implicit def), or a
// depctr wait with sa_sdst==0 has been seen.
1513 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1514 if (SIInstrInfo::isVALU(MI)) {
1515 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1516 return true;
1517 for (auto MO : MI.implicit_operands())
1518 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1519 return true;
1520 }
1521 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1522 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1523 return true;
1524 return false;
1525 };
1526
1527 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1528 std::numeric_limits<int>::max())
1529 return false;
1530
1531 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1532 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1534 return true;
1535}
1536
1538 const GCNSubtarget &ST) {
1539 if (!ST.hasLdsBranchVmemWARHazard())
1540 return false;
1541
1542 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1543 // instructions need to appear in the same function.
1544 bool HasLds = false;
1545 bool HasVmem = false;
1546 for (auto &MBB : MF) {
1547 for (auto &MI : MBB) {
1549 HasVmem |= SIInstrInfo::isVMEM(MI);
1550 if (HasLds && HasVmem)
1551 return true;
1552 }
1553 }
1554 return false;
1555}
1556
// Returns true when \p I is "s_waitcnt_vscnt null, 0", i.e. a wait that
// drains the store counter to zero.
// NOTE(review): the function signature line (doc 1557) is missing from this
// extracted view.
1558 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1559 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1560 !I.getOperand(1).getImm();
1561}
1562
// Fix the LDS-branch-VMEM WAR hazard: an LDS (resp. VMEM) access that, via a
// branch, is followed by the other kind of access without an intervening
// "s_waitcnt_vscnt null, 0" needs one inserted before \p MI.
// NOTE(review): the two classification returns inside IsHazardInst (doc
// 1571/1573, distinguishing DS from VMEM) are missing from this view; the
// surviving "return 1" / "return 2" encode the two instruction kinds.
1563bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1564 if (!RunLdsBranchVmemWARHazardFixup)
1565 return false;
1566
1567 assert(ST.hasLdsBranchVmemWARHazard());
1568 assert(!ST.hasExtendedWaitCounts());
1569
1570 auto IsHazardInst = [](const MachineInstr &MI) {
1572 return 1;
1574 return 2;
1575 return 0;
1576 };
1577
1578 auto InstType = IsHazardInst(*MI);
1579 if (!InstType)
1580 return false;
1581
1582 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1583 return IsHazardInst(I) || isStoreCountWaitZero(I);
1584 };
1585
// Outer search: find a branch; inner search: from that branch, find an
// access of the *other* kind with no intervening store-count wait.
1586 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1587 if (!I.isBranch())
1588 return false;
1589
1590 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1591 auto InstType2 = IsHazardInst(I);
1592 return InstType2 && InstType != InstType2;
1593 };
1594
1595 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1596 auto InstType2 = IsHazardInst(I);
1597 if (InstType == InstType2)
1598 return true;
1599
1600 return isStoreCountWaitZero(I);
1601 };
1602
1603 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1604 std::numeric_limits<int>::max();
1605 };
1606
1607 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1608 std::numeric_limits<int>::max())
1609 return false;
1610
1611 const SIInstrInfo *TII = ST.getInstrInfo();
1612 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1613 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1614 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1615 .addImm(0);
1616
1617 return true;
1618}
1619
// Fix the LDSDIR -> VALU hazard by setting the waitvdst field on the LDSDIR
// instruction to the number of VALUs observed since the conflicting access
// (clamped to NoHazardWaitStates).
// NOTE(review): the guard on MI's instruction class (doc 1621) and the
// va_vdst==0 expiry condition (doc 1640-1641) are missing from this view.
1620bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1622 return false;
1623
1624 const int NoHazardWaitStates = 15;
1625 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1626 const Register VDSTReg = VDST->getReg();
1627
1628 bool VisitedTrans = false;
1629 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1630 if (!SIInstrInfo::isVALU(I))
1631 return false;
1632 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1633 // Cover both WAR and WAW
1634 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1635 };
1636 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1637 if (WaitStates >= NoHazardWaitStates)
1638 return true;
1639 // Instructions which cause va_vdst==0 expire hazard
1642 };
// Only VALUs count toward the wait-state distance.
1643 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1644 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1645 };
1646
1647 DenseSet<const MachineBasicBlock *> Visited;
1648 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1649 std::next(MI->getReverseIterator()), 0,
1650 IsExpiredFn, Visited, GetWaitStatesFn);
1651
1652 // Transcendentals can execute in parallel to other VALUs.
1653 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1654 if (VisitedTrans)
1655 Count = 0;
1656
1657 MachineOperand *WaitVdstOp =
1658 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1659 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1660
1661 return true;
1662}
1663
// Fix the LDSDIR -> VMEM hazard: either set the waitvsrc field on the LDSDIR
// instruction (when supported) or insert an S_WAITCNT_DEPCTR.
// NOTE(review): four lines are missing from this view — the guard on MI's
// class (doc 1665), the memory-instruction test in IsHazardFn (doc 1672),
// the first term of IsExpiredFn (doc 1680), and the depctr immediate of the
// emitted wait (doc 1697).
1664bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1666 return false;
1667
1668 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1669 const Register VDSTReg = VDST->getReg();
1670
1671 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1673 return false;
1674 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1675 };
1676 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1677 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1678 // according to the type of VMEM instruction.
1679 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1681 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1682 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1683 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1684 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1685 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1686 };
1687
1688 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1689 std::numeric_limits<int>::max())
1690 return false;
1691
1692 if (LdsdirCanWait) {
1693 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1694 } else {
1695 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1696 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1698 }
1699
1700 return true;
1701}
1702
// Fix the wave64 VALU partial-forwarding hazard by inserting an
// S_WAITCNT_DEPCTR when the VALU/SALU-EXEC/VALU pattern described below is
// detected within the allowed wait-state windows.
// NOTE(review): missing from this extracted view — the start of the
// va_vdst==0 expiry condition (doc 1766-1767), the isVALU test in
// UpdateStateFn (doc 1845), and the immediate of the emitted depctr wait
// (doc 1855).
1703bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1704 if (!ST.hasVALUPartialForwardingHazard())
1705 return false;
1706 assert(!ST.hasExtendedWaitCounts());
1707
1708 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1709 return false;
1710
1711 SmallSetVector<Register, 4> SrcVGPRs;
1712
1713 for (const MachineOperand &Use : MI->explicit_uses()) {
1714 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1715 SrcVGPRs.insert(Use.getReg());
1716 }
1717
1718 // Only applies with >= 2 unique VGPR sources
1719 if (SrcVGPRs.size() <= 1)
1720 return false;
1721
1722 // Look for the following pattern:
1723 // Va <- VALU [PreExecPos]
1724 // intv1
1725 // Exec <- SALU [ExecPos]
1726 // intv2
1727 // Vb <- VALU [PostExecPos]
1728 // intv3
1729 // MI Va, Vb (WaitState = 0)
1730 //
1731 // Where:
1732 // intv1 + intv2 <= 2 VALUs
1733 // intv3 <= 4 VALUs
1734 //
1735 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1736
1737 const int Intv1plus2MaxVALUs = 2;
1738 const int Intv3MaxVALUs = 4;
1739 const int IntvMaxVALUs = 6;
1740 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1741
// Search state threaded through the backwards walk; hashed/compared so that
// already-visited block states are not re-explored.
1742 struct StateType {
1743 SmallDenseMap<Register, int, 4> DefPos;
1744 int ExecPos = std::numeric_limits<int>::max();
1745 int VALUs = 0;
1746
1747 static unsigned getHashValue(const StateType &State) {
1748 return hash_combine(State.ExecPos, State.VALUs,
1749 hash_combine_range(State.DefPos));
1750 }
1751 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1752 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1753 LHS.VALUs == RHS.VALUs;
1754 }
1755 };
1756
1757 StateType State;
1758
1759 // This overloads expiry testing with all the hazard detection
1760 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1761 // Too many VALU states have passed
1762 if (State.VALUs > NoHazardVALUWaitStates)
1763 return HazardExpired;
1764
1765 // Instructions which cause va_vdst==0 expire hazard
1768 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1769 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1770 return HazardExpired;
1771
1772 // Track registers writes
1773 bool Changed = false;
1774 if (SIInstrInfo::isVALU(I)) {
1775 for (Register Src : SrcVGPRs) {
1776 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1777 State.DefPos[Src] = State.VALUs;
1778 Changed = true;
1779 }
1780 }
1781 } else if (SIInstrInfo::isSALU(I)) {
1782 if (State.ExecPos == std::numeric_limits<int>::max()) {
1783 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1784 State.ExecPos = State.VALUs;
1785 Changed = true;
1786 }
1787 }
1788 }
1789
1790 // Early expiration: too many VALUs in intv3
1791 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1792 return HazardExpired;
1793
1794 // Only evaluate state if something changed
1795 if (!Changed)
1796 return NoHazardFound;
1797
1798 // Determine positions of VALUs pre/post exec change
1799 if (State.ExecPos == std::numeric_limits<int>::max())
1800 return NoHazardFound;
1801
1802 int PreExecPos = std::numeric_limits<int>::max();
1803 int PostExecPos = std::numeric_limits<int>::max();
1804
1805 for (auto Entry : State.DefPos) {
1806 int DefVALUs = Entry.second;
1807 if (DefVALUs != std::numeric_limits<int>::max()) {
1808 if (DefVALUs >= State.ExecPos)
1809 PreExecPos = std::min(PreExecPos, DefVALUs);
1810 else
1811 PostExecPos = std::min(PostExecPos, DefVALUs);
1812 }
1813 }
1814
1815 // Need a VALUs post exec change
1816 if (PostExecPos == std::numeric_limits<int>::max())
1817 return NoHazardFound;
1818
1819 // Too many VALUs in intv3?
1820 int Intv3VALUs = PostExecPos;
1821 if (Intv3VALUs > Intv3MaxVALUs)
1822 return HazardExpired;
1823
1824 // Too many VALUs in intv2?
1825 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1826 if (Intv2VALUs > Intv1plus2MaxVALUs)
1827 return HazardExpired;
1828
1829 // Need a VALUs pre exec change
1830 if (PreExecPos == std::numeric_limits<int>::max())
1831 return NoHazardFound;
1832
1833 // Too many VALUs in intv1?
1834 int Intv1VALUs = PreExecPos - State.ExecPos;
1835 if (Intv1VALUs > Intv1plus2MaxVALUs)
1836 return HazardExpired;
1837
1838 // Too many VALUs in intv1 + intv2
1839 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1840 return HazardExpired;
1841
1842 return HazardFound;
1843 };
1844 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1846 State.VALUs += 1;
1847 };
1848
1849 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1850 std::next(MI->getReverseIterator())))
1851 return false;
1852
1853 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1854 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1856
1857 return true;
1858}
1859
// Fix the TRANS -> VALU use hazard by inserting an S_WAITCNT_DEPCTR when a
// TRANS result is consumed within the disallowed window.
// NOTE(review): missing from this extracted view — the start of the
// va_vdst==0 expiry condition (doc 1909-1910), the isVALU/isTRANS tests in
// UpdateStateFn (doc 1927/1929), and the depctr immediate (doc 1941).
1860bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1861 if (!ST.hasVALUTransUseHazard())
1862 return false;
1863 assert(!ST.hasExtendedWaitCounts());
1864
1865 if (!SIInstrInfo::isVALU(*MI))
1866 return false;
1867
1868 SmallSet<Register, 4> SrcVGPRs;
1869
1870 for (const MachineOperand &Use : MI->explicit_uses()) {
1871 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1872 SrcVGPRs.insert(Use.getReg());
1873 }
1874
1875 // Look for the following pattern:
1876 // Va <- TRANS VALU
1877 // intv
1878 // MI Va (WaitState = 0)
1879 //
1880 // Where:
1881 // intv <= 5 VALUs / 1 TRANS
1882 //
1883 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1884
1885 const int IntvMaxVALUs = 5;
1886 const int IntvMaxTRANS = 1;
1887
1888 struct StateType {
1889 int VALUs = 0;
1890 int TRANS = 0;
1891
1892 static unsigned getHashValue(const StateType &State) {
1893 return hash_combine(State.VALUs, State.TRANS);
1894 }
1895 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1896 return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1897 }
1898 };
1899
1900 StateType State;
1901
1902 // This overloads expiry testing with all the hazard detection
1903 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1904 // Too many VALU states have passed
1905 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1906 return HazardExpired;
1907
1908 // Instructions which cause va_vdst==0 expire hazard
1911 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1912 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1913 return HazardExpired;
1914
1915 // Track registers writes
1916 if (SIInstrInfo::isTRANS(I)) {
1917 for (Register Src : SrcVGPRs) {
1918 if (I.modifiesRegister(Src, &TRI)) {
1919 return HazardFound;
1920 }
1921 }
1922 }
1923
1924 return NoHazardFound;
1925 };
1926 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1928 State.VALUs += 1;
1930 State.TRANS += 1;
1931 };
1932
1933 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1934 std::next(MI->getReverseIterator())))
1935 return false;
1936
1937 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1938 // avoided.
1939 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1940 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1942
1943 return true;
1944}
1945
// Fix the GFX1250 TRANS/VALU co-execution hazard: insert a V_NOP before the
// VALU when it has a RAW or WAR register conflict with a TRANS instruction
// that would co-execute with it.
// NOTE(review): the second half of the guard condition (doc 1948, presumably
// a check on MI's instruction class) is missing from this extracted view.
1946bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1947 if (!ST.hasGFX1250Insts() || // Coexecution disabled.
1949 return false;
1950
1951 const SIInstrInfo *TII = ST.getInstrInfo();
1952 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1953
1954 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1955 if (!SIInstrInfo::isTRANS(I))
1956 return false;
1957
1958 // RAW: Trans(I) writes, VALU(MI) reads.
1959 Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1960 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1961 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1962 return true;
1963 }
1964
1965 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
1966 if (!ValuDst || !ValuDst->isReg())
1967 return false;
1968
1969 // WAR: Trans(I) reads, VALU(MI) writes.
1970 Register ValuDef = ValuDst->getReg();
1971 for (const MachineOperand &TransUse : I.explicit_uses()) {
1972 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
1973 return true;
1974 }
1975
1976 return false;
1977 };
1978
// Any intervening VALU ends the co-execution window.
1979 auto IsExpiredFn = [](const MachineInstr &I, int) {
1980 return SIInstrInfo::isVALU(I);
1981 };
1982
1983 const int HasVALU = std::numeric_limits<int>::max();
1984 if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
1985 return false;
1986
1987 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1988 return true;
1989}
1990
// Fix WMMA back-to-back hazards: insert a V_NOP when the current WMMA's
// source matrices (or, on GFX12+, the SWMMAC index) overlap the previous
// WMMA's destination.
// NOTE(review): missing from this extracted view — the guard on MI's class
// (doc 1992) and the matching class test on I inside IsHazardFn (doc 1999).
1991bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1993 return false;
1994
1995 const SIInstrInfo *TII = ST.getInstrInfo();
1996 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1997
1998 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
2000 return false;
2001
2002 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
2003 // with the dest(matrix D) of the previous wmma.
2004 const Register CurSrc0Reg =
2005 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
2006 const Register CurSrc1Reg =
2007 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
2008
2009 const Register PrevDstReg =
2010 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2011
2012 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2013 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2014 return true;
2015 }
2016
2017 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2018 // but Index can't overlap with PrevDstReg.
2019 if (AMDGPU::isGFX12Plus(ST)) {
2020 if (SIInstrInfo::isSWMMAC(*MI)) {
2021 const Register CurIndex =
2022 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2023 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2024 return true;
2025 }
2026 return false;
2027 }
2028
2029 return false;
2030 };
2031
2032 auto IsExpiredFn = [](const MachineInstr &I, int) {
2033 return SIInstrInfo::isVALU(I);
2034 };
2035
2036 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2037 std::numeric_limits<int>::max())
2038 return false;
2039
2040 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2041
2042 return true;
2043}
2044
2049
2051 const SIInstrInfo *TII, unsigned Latency,
2052 unsigned Category) {
2053 assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
2054 "Handle me if the xdl wmma instruction latency changes");
2055
2056 switch (Category) {
2057 case 0: // Dense WMMA Instructions:
2058 // WMMA_*F16, WMMA_*BF16
2059 // WMMA_*FP8FP8
2060 // WMMA_*FP8BF8
2061 // WMMA_*BF8FP8
2062 // WMMA_*BF8BF8
2063 // WMMA_*F8F6F4 if SRCA & SRCB != F8
2064 return Latency == 8 && SIInstrInfo::isWMMA(MI);
2065
2066 case 1: // Dense WMMA Instructions:
2067 // WMMA_IU8
2068 // WMMA_IU4
2069 // WMMA_*F8F6F4 if SRCA OR SRCB == F8
2070 return Latency == 16 && SIInstrInfo::isWMMA(MI);
2071
2072 case 2: // Dense SWMMAC Instructions
2073 // SWMMAC_*F16, SWMMAC_*BF16,
2074 // SWMMAC_*FP8FP8
2075 // SWMMAC_*BF8FP8
2076 // SWMMAC_*FP8BF8
2077 // SWMMAC_*BF8BF8
2078 return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
2079
2080 case 3: // Sparse WMMA Instructions:
2081 // SWMMAC_IU8
2082 // SWMMAC_IU4
2083 return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
2084 default:
2085 break;
2086 } // end switch.
2087
2088 return false;
2089}
2090
// Count the V_NOPs still required between the nearest preceding XDL WMMA and
// \p MI (another XDL WMMA or a co-executable VALU) on gfx1250. Returns 0 when
// the subtarget/instruction is unaffected, a negative value when no hazard
// source is in range, and the positive number of padding slots otherwise.
int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
  if (!ST.hasGFX1250Insts())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  // Only a following XDL WMMA or co-executable VALU can be the hazard sink.
  if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
    return 0;

  // WaitStates here is the number of V_NOPs or unrelated VALU instructions must
  // be in between the first WMMA and the second instruction to cover the hazard
  // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
  // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for
  // numbers, which depends on the category of the first WMMA.
  const int WMMAWaitStates[] = {5, 9, 3, 5};
  const int VALUWaitStates[] = {4, 8, 2, 4};
  unsigned Category = 0;

  // Hazard source: an earlier XDL WMMA in the current latency category whose
  // registers overlap MI's (WMMA-to-WMMA flavor of the overlap check).
  auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))
      return false;

    unsigned Latency = TSchedModel.computeInstrLatency(&I);
    if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
      return false;

    return hasWMMAToWMMARegOverlap(I, *MI);
  };

  // Same, but with the WMMA-to-VALU flavor of the overlap check.
  auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))
      return false;

    unsigned Latency = TSchedModel.computeInstrLatency(&I);
    if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
      return false;

    return hasWMMAToVALURegOverlap(I, *MI);
  };

  int Limit = 0;

  // Only VALU instructions count as elapsed wait states.
  auto GetWaitStatesFn = [](const MachineInstr &I) {
    return SIInstrInfo::isVALU(I) ? 1 : 0;
  };

  int WaitStatesNeeded = -1;
  if (TII->isXDLWMMA(*MI)) {
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
      // 'getWaitStatesSince' returns the number of VALUs in between if hazard
      // exists, and INT_MAX if there is no hazard. As a result, a negative
      // WaitStatesNeeded here means no hazard, and we will continue to search
      // for other categories.
      WaitStatesNeeded =
          Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
    }
  } else { // Must be a co-executable VALU.
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = VALUWaitStates[Category]; // for IsExpiredFn.
      // 'getWaitStatesSince' returns the number of VALUs in between if hazard
      // exists, and INT_MAX if there is no hazard. As a result, a negative
      // WaitStatesNeeded here means no hazard, and we will continue to search
      // for other categories.
      WaitStatesNeeded =
          Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
    }
  }

  return WaitStatesNeeded;
}
2161
2162bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2163 const MachineInstr &WMMA, const MachineInstr &MI) const {
2164 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2165 Register A1 = TII.getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
2166 Register B1 = TII.getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
2167
2168 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2169 if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
2170 return true;
2171
2173 Register Idx1 = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2174 if (TRI.regsOverlap(D0, Idx1))
2175 return true;
2176 }
2177 return false;
2178}
2179
2180bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2181 const MachineInstr &WMMA, const MachineInstr &MI) const {
2182 // WMMA writes, VALU reads.
2183 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2184 for (const MachineOperand &ValuUse : MI.explicit_uses()) {
2185 if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
2186 return true;
2187 }
2188
2189 // WMMA reads or writes, VALU writes.
2190 Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
2191 Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
2192 SmallVector<Register, 4> WMMARegs({D0, A0, B0});
2193
2194 if (SIInstrInfo::isSWMMAC(WMMA)) {
2195 Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
2196 WMMARegs.push_back(Idx0);
2197 }
2198
2199 for (const MachineOperand &ValuDef : MI.defs()) {
2200 Register VDstReg = ValuDef.getReg();
2201 for (Register WMMAReg : WMMARegs) {
2202 if (TRI.regsOverlap(VDstReg, WMMAReg))
2203 return true;
2204 }
2205 }
2206 return false;
2207}
2208
2209bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
2210 const MachineInstr &MI) const {
2211 // I is the potential WMMA hazard source, MI is the instruction being checked
2212 // for hazard.
2213 if (!TII.isXDLWMMA(I))
2214 return false;
2215
2216 // Dispatch based on MI type
2217 if (TII.isXDLWMMA(MI))
2218 return hasWMMAToWMMARegOverlap(I, MI);
2220 return hasWMMAToVALURegOverlap(I, MI);
2221
2222 return false;
2223}
2224
2225bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
2226 bool IncludeSubloops) {
2227 // Scan loop for any WMMA that hazards MI.
2228 // TODO: Avoid full loop scan when WMMA is beyond VALU distance.
2229 for (MachineBasicBlock *MBB : L->getBlocks()) {
2230 if (!IncludeSubloops && MLI->getLoopFor(MBB) != L)
2231 continue;
2232 for (MachineInstr &I : *MBB) {
2233 if (&I == MI)
2234 continue;
2235 if (isCoexecutionHazardFor(I, *MI))
2236 return true;
2237 }
2238 }
2239 return false;
2240}
2241
2242bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
2243 int WaitStatesNeeded) {
2244 if (!MLI)
2245 return false;
2246
2247 MachineLoop *L = MLI->getLoopFor(MI->getParent());
2248 if (!L) {
2249 ++NumWMMAHoistingBailed;
2250 return false;
2251 }
2252
2253 // If innermost loop has WMMA hazard, we can't hoist at all
2254 if (hasWMMAHazardInLoop(L, MI)) {
2255 ++NumWMMAHoistingBailed;
2256 return false;
2257 }
2258
2259 // Find outermost loop with no internal hazard
2260 MachineLoop *TargetLoop = L;
2261 while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
2262 if (hasWMMAHazardInLoop(Parent, MI, false))
2263 break; // Parent has hazard in its own blocks, stop here
2264 TargetLoop = Parent; // Safe to hoist further out
2265 }
2266
2267 // Need valid preheader to insert V_NOPs
2268 MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
2269 if (!Preheader) {
2270 ++NumWMMAHoistingBailed;
2271 return false;
2272 }
2273
2274 LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2275 << " V_NOPs from loop to " << printMBBReference(*Preheader)
2276 << "\n");
2277
2278 emitVNops(*Preheader, Preheader->getFirstTerminator(), WaitStatesNeeded,
2279 /*IsHoisting=*/true);
2280 NumWMMANopsHoisted += WaitStatesNeeded;
2281 return true;
2282}
2283
2284bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
2285 int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
2286 if (WaitStatesNeeded <= 0)
2287 return false;
2288
2289 if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
2290 return true;
2291
2292 emitVNops(*MI->getParent(), MI->getIterator(), WaitStatesNeeded);
2293 return true;
2294}
2295
// Work around the subtarget's shift64-high-register bug (see
// GCNSubtarget::hasShift64HighRegBug): a 64-bit shift whose shift-amount
// VGPR is the last register of an 8-register allocation granule, with the
// following VGPR unused, must not be executed as-is. Rewrites MI so the
// amount is read from a safe register; returns true if MI was changed.
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  // Only the 64-bit reverse shifts are affected.
  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if this is a last VGPR in the allocation block.
  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  // The bug only triggers when the next VGPR is unused by the function.
  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
    return false;

  assert(ST.needsAlignedVGPRs());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  const DebugLoc &DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);

  // In:
  //
  // Dst = shiftrev64 Amt, Src1
  //
  // if Dst!=Src1 then avoid the bug with:
  //
  // Dst.sub0 = Amt
  // Dst = shift64 Dst.sub0, Src1

  Register DstReg = MI->getOperand(0).getReg();
  if (!Src1->isReg() || Src1->getReg() != DstReg) {
    // Cheap case: stash the amount in Dst.sub0 (dead until MI writes it).
    Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt));
    Amt->setReg(DstLo);
    Amt->setIsKill(true);
    return true;
  }

  // Dst == Src1: find a scratch VGPR (or an aligned VGPR pair when MI also
  // writes AmtReg) that MI neither reads nor writes, to swap through.
  bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);
  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  // Insert a full wait count because found register might be pending a wait.
  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
      .addImm(0);

  // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
  if (Overlapped)
    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
            .addDef(AmtReg - 1)
            .addReg(AmtReg - 1, RegState::Undef)
            .addReg(NewAmtLo, RegState::Undef));
  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
                       .addDef(AmtReg)
                       .addReg(AmtReg, RegState::Undef)
                       .addReg(NewAmt, RegState::Undef));

  // Instructions emitted after the current instruction will be processed by the
  // parent loop of the hazard recognizer in a natural way.
  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
          AmtReg)
      .addDef(NewAmt)
      .addReg(NewAmt)
      .addReg(AmtReg);
  if (Overlapped)
    // Swap the low half back as well after MI executes.
    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
            AmtReg - 1)
        .addDef(NewAmtLo)
        .addReg(NewAmtLo)
        .addReg(AmtReg - 1);

  // Re-running hazard recognizer on the modified instruction is not necessary,
  // inserted V_SWAP_B32 has already both read and write new registers so
  // hazards related to these register has already been handled.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  // We do not update liveness, so verifier may see it as undef.
  Amt->setIsUndef();
  if (Overlapped) {
    MI->getOperand(0).setReg(NewReg);
    Src1->setReg(NewReg);
    Src1->setIsKill(false);
    Src1->setIsUndef();
  }

  return true;
}
2412
2413int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) const {
2414 int NSAtoVMEMWaitStates = 1;
2415
2416 if (!ST.hasNSAtoVMEMBug())
2417 return 0;
2418
2420 return 0;
2421
2422 const SIInstrInfo *TII = ST.getInstrInfo();
2423 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2424 if (!Offset || (Offset->getImm() & 6) == 0)
2425 return 0;
2426
2427 auto IsHazardFn = [TII](const MachineInstr &I) {
2428 if (!SIInstrInfo::isMIMG(I))
2429 return false;
2430 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2431 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2432 TII->getInstSizeInBytes(I) >= 16;
2433 };
2434
2435 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2436}
2437
2438int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2439 MachineInstr *MI) const {
2440 int FPAtomicToDenormModeWaitStates = 3;
2441
2442 if (!ST.hasFPAtomicToDenormModeHazard())
2443 return 0;
2444 assert(!ST.hasExtendedWaitCounts());
2445
2446 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2447 return 0;
2448
2449 auto IsHazardFn = [](const MachineInstr &I) {
2450 if (!SIInstrInfo::isVMEM(I))
2451 return false;
2452 return SIInstrInfo::isFPAtomic(I);
2453 };
2454
2455 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2456 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2457 return true;
2458
2459 return SIInstrInfo::isWaitcnt(MI.getOpcode());
2460 };
2461
2462 return FPAtomicToDenormModeWaitStates -
2463 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2464}
2465
2466int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) const {
2468
2469 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2470}
2471
2472int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) const {
2473 // Early exit if no padding is requested.
2474 if (MFMAPaddingRatio == 0)
2475 return 0;
2476
2477 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2478 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2479 return 0;
2480
2481 int NeighborMFMALatency = 0;
2482 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2483 this](const MachineInstr &MI) {
2484 if (!SIInstrInfo::isMFMA(MI))
2485 return false;
2486
2487 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2488 return true;
2489 };
2490
2491 const int MaxMFMAPipelineWaitStates = 16;
2492 int WaitStatesSinceNeighborMFMA =
2493 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2494
2495 int NeighborMFMAPaddingNeeded =
2496 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2497 WaitStatesSinceNeighborMFMA;
2498
2499 return std::max(0, NeighborMFMAPaddingNeeded);
2500}
2501
// gfx908 MAI hazards: count the wait states required before \p MI (an MFMA,
// v_accvgpr_write, or v_accvgpr_read) due to (1) earlier VALU writes to EXEC
// or MI's VGPR sources, (2) earlier MFMA writes overlapping MI's AGPR
// operands (table keyed by the writer's latency), and (3) earlier
// v_accvgpr_write defs MI reads. Also folds in optional MFMA padding.
int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  // Inline asm is conservatively treated as VALU here.
  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      // VALU writes to MI's VGPR sources need 2 wait states.
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  // AGPR operand hazards against earlier MFMA / v_accvgpr_write defs.
  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    // Matches an earlier MFMA whose dst overlaps (but does not equal) Reg;
    // records the largest writer latency seen to index the tables above.
    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      // Latency 2/8/16 corresponds to 4x4/16x16/32x32 MFMA shapes.
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
        break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
        break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
        break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
        break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
        break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
        break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  // v_accvgpr_write after an MFMA that reads the same register as SrcC.
  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
      break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
      break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
2665
// Wait states for an XDL MFMA of \p NumPasses passes writing a VGPR that
// overlaps the SrcC of a following XDL consumer (gfx940/gfx950).
// NOTE(review): the name line was truncated in the rendering this was
// recovered from; reconstructed from the checkMAIHazards90A call site —
// confirm against upstream.
static int
GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx940 | gfx950
  // 2 pass         |  3       4
  // 4 pass         |  5       6
  // 8 pass         |  9       10
  // 16 pass        |  17      18
  return NumPasses + 1 + IsGFX950;
}
2676
// Wait states for an XDL MFMA of \p NumPasses passes writing a VGPR that
// overlaps the SrcC of a following non-XDL (S/DGEMM) consumer
// (gfx940/gfx950).
// NOTE(review): the name line was truncated in the rendering this was
// recovered from; reconstructed from the checkMAIHazards90A call site —
// confirm against upstream.
static int
GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx940 | gfx950
  // 2 pass         |  3       3
  // 4 pass         |  5       6
  // 8 pass         |  9       10
  // 16 pass        |  17      18
  return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
}
2687
// Wait states for a non-XDL (SMFMA) writer of \p NumPasses passes whose VGPR
// result overlaps the SrcC of a following SMFMA (gfx940).
// NOTE(review): the name line was truncated in the rendering this was
// recovered from; reconstructed from the checkMAIHazards90A call site —
// confirm against upstream.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // 2 pass -> 2
  // 4 pass -> 4
  // 8 pass -> 8
  // 16 pass -> 16
  return NumPasses;
}
2696
// Wait states for a non-XDL (SMFMA) writer of \p NumPasses passes whose VGPR
// result overlaps the SrcA/SrcB of a following MFMA (gfx940).
// NOTE(review): the name line was truncated in the rendering this was
// recovered from; reconstructed from the checkMAIHazards90A call site —
// confirm against upstream.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
2705
// Wait states for an XDL writer of \p NumPasses passes whose VGPR result
// overlaps the SrcA/SrcB of a following MFMA (gfx942/gfx950).
// NOTE(review): the name line was truncated in the rendering this was
// recovered from; reconstructed from the checkMAIHazards90A call site —
// confirm against upstream.
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
                                                                bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |  5       5
  // 4 pass         |  7       8
  // 8 pass         |  11      12
  // 16 pass        |  19      20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
2715
// gfx90a+ MAI hazards: wait states needed before an MFMA \p MI due to earlier
// VALU writes (EXEC / VGPR sources) and earlier MFMA writes that overlap its
// A/B/C operands. The tables are keyed by the writer's latency ("passes"),
// DGEMM vs S/HGEMM, XDL-ness, and the gfx940/gfx950 variants.
// NOTE(review): several single lines of this function (both lambda bodies
// and some helper-call / condition lines) appear truncated in the rendering
// this was recovered from; each spot is flagged inline below and must be
// restored from upstream llvm-project before this compiles.
int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsLegacyVALUFn = [](const MachineInstr &MI) {
    // NOTE(review): lambda body missing in this rendering — presumably
    // "isVALU && !isMFMA"; confirm against upstream.
  };

  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
    // NOTE(review): lambda body missing in this rendering — presumably the
    // above additionally excluding isDOT; confirm against upstream.
  };

  if (!SIInstrInfo::isMFMA(*MI))
    return WaitStatesNeeded;

  // A VALU writing EXEC needs 4 wait states before an MFMA.
  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  // Loop for both DGEMM and S/HGEMM 2nd instruction.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    Register Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

    // Matches an earlier MFMA whose dst overlaps Reg; records the writer
    // (MI1) and whether the overlap is an exact register match (FullReg).
    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(DstReg, Reg);
    };

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates =
        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = Use.getOperandNo();
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      if (!SIInstrInfo::isDGEMM(Opc) &&
          (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
        NeedWaitStates = 0;
      } else if (FullReg) {
        // Exact-register accumulator chaining has cheaper requirements.
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
        else if (ST.hasGFX940Insts() &&
                 TSchedModel.computeInstrLatency(MI1) == 2)
          NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
          if (!TII.isXDL(*MI))
            NeedWaitStates =
                ST.hasGFX950Insts()
                    ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
                    : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!TII.isXDL(*MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default:
          int NumPasses = TSchedModel.computeInstrLatency(MI1);
          if (ST.hasGFX940Insts()) {
            if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
              break;

            // NOTE(review): the three GFX940_*SrcCWaitStates helper-call
            // lines of this ternary are missing in this rendering.
            NeedWaitStates =
                TII.isXDL(*MI1)
                    ? (TII.isXDL(*MI)
                           NumPasses, ST.hasGFX950Insts())
                           NumPasses, ST.hasGFX950Insts()))
                           NumPasses);
            break;
          }

          switch (NumPasses) {
          case 2:
            // NOTE(review): the condition line of this ternary (likely an
            // isDGEMM check) is missing in this rendering.
            NeedWaitStates =
                ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            // NOTE(review): condition line missing here as well.
            NeedWaitStates =
                ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16:
            // NOTE(review): condition line missing here as well.
            NeedWaitStates =
                ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          default:
            llvm_unreachable("unexpected number of passes");
          }
        }
      }
    } else {
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates =
            ST.hasGFX950Insts()
                ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
                : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default:
        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        if (ST.hasGFX940Insts()) {
          // NOTE(review): the two GFX940_*SrcABWaitStates helper-call lines
          // of this ternary are missing in this rendering.
          NeedWaitStates =
              TII.isXDL(*MI1)
                     NumPasses, ST.hasGFX950Insts())
                     NumPasses);
          break;
        }

        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 4:
          llvm_unreachable("unexpected number of passes for mfma");
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16:
        default:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
        }
      }
    }
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
2920
2921int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) const {
2922 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2923 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2924 return 0;
2925
2926 int WaitStatesNeeded = 0;
2927
2928 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2929 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2930 };
2931
2932 for (const MachineOperand &Op : MI->explicit_uses()) {
2933 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2934 continue;
2935
2936 Register Reg = Op.getReg();
2937
2938 const int AccVgprReadLdStWaitStates = 2;
2939 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2940 const int MaxWaitStates = 2;
2941
2942 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2943 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2944 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2945
2946 if (WaitStatesNeeded == MaxWaitStates)
2947 return WaitStatesNeeded; // Early exit.
2948
2949 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2950 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2951 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2952 return false;
2953 auto IsVALUFn = [](const MachineInstr &MI) {
2955 };
2956 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2957 std::numeric_limits<int>::max();
2958 };
2959
2960 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2961 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2962 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2963 }
2964
2965 return WaitStatesNeeded;
2966}
2967
2968int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) const {
2969 assert(!ST.hasVcmpxPermlaneHazard() &&
2970 "this is a different vcmpx+permlane hazard");
2971 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2972 const SIInstrInfo *TII = ST.getInstrInfo();
2973
2974 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2975 return isVCmpXWritesExec(*TII, *TRI, MI);
2976 };
2977
2978 auto IsVALUFn = [](const MachineInstr &MI) {
2979 return SIInstrInfo::isVALU(MI);
2980 };
2981
2982 const int VCmpXWritesExecWaitStates = 4;
2983 const int VALUWritesVDstWaitStates = 2;
2984 int WaitStatesNeeded = 0;
2985
2986 for (const MachineOperand &Op : MI->explicit_uses()) {
2987 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2988 continue;
2989 Register Reg = Op.getReg();
2990
2991 int WaitStatesSinceDef =
2992 VALUWritesVDstWaitStates -
2993 getWaitStatesSinceDef(Reg, IsVALUFn,
2994 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2995 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2996 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2997 break;
2998 }
2999
3000 int VCmpXHazardWaits =
3001 VCmpXWritesExecWaitStates -
3002 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
3003
3004 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
3005 return WaitStatesNeeded;
3006}
3007
// Wait states between an N-pass SMFMA VGPR write and a VALU / memory /
// export read of that VGPR on GFX940. (Signature reconstructed: the
// declaration line was lost in extraction; the name and parameter list match
// this file's forward-declaration index.)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
3015
// Wait states between an N-pass XDL VGPR write and a VALU / memory / export
// read of that VGPR. GFX950 needs one extra wait state for everything except
// the 2-pass case. (Signature reconstructed from this file's
// forward-declaration index; the declaration line was lost in extraction.)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |   5        5
  // 4 pass         |   7        8
  // 8 pass         |  11       12
  // 16 pass        |  19       20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
3025
// Wait states between an N-pass XDL VGPR write and a subsequent VALU write of
// the same VGPR (write-after-write). Same table as the read case above.
// (Signature reconstructed from this file's forward-declaration index; the
// declaration line was lost in extraction.)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
                                                       bool IsGFX950) {
  // xdl def cycles | gfx942 | gfx950
  // 2 pass         |   5        5
  // 4 pass         |   7        8
  // 8 pass         |  11       12
  // 16 pass        |  19       20
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
3035
// Wait states between an N-pass SMFMA VGPR write and a subsequent VALU write
// of the same VGPR (write-after-write). (Signature reconstructed from this
// file's forward-declaration index; the declaration line was lost in
// extraction.)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // 2 pass -> 4
  // 4 pass -> 6
  // 8 pass -> 10
  // 16 pass -> 18
  return NumPasses + 2;
}
3043
3044int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
3045 if (!ST.hasGFX90AInsts())
3046 return 0;
3047
3048 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
3049 return SIInstrInfo::isDGEMM(MI.getOpcode());
3050 };
3051
3052 // This is checked in checkMAIHazards90A()
3053 if (SIInstrInfo::isMFMA(*MI))
3054 return 0;
3055
3056 const MachineRegisterInfo &MRI = MF.getRegInfo();
3057
3058 int WaitStatesNeeded = 0;
3059
3060 bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
3061 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
3062 bool IsVALU = SIInstrInfo::isVALU(*MI);
3063
3064 const MachineInstr *MFMA = nullptr;
3065 unsigned Reg;
3066 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3067 if (!SIInstrInfo::isMFMA(MI) ||
3068 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
3069 return false;
3070 MFMA = &MI;
3071 return true;
3072 };
3073
3074 const MachineInstr *DOT = nullptr;
3075 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
3076 if (!SIInstrInfo::isDOT(MI) ||
3077 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
3078 return false;
3079 DOT = &MI;
3080 return true;
3081 };
3082
3083 bool DGEMMAfterVALUWrite = false;
3084 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
3085 // Found DGEMM on reverse traversal to def.
3086 if (SIInstrInfo::isDGEMM(MI.getOpcode()))
3087 DGEMMAfterVALUWrite = true;
3088
3089 // Only hazard if register is defined by a VALU and a DGEMM is found after
3090 // after the def.
3091 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
3092 return false;
3093
3094 return true;
3095 };
3096
3097 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
3098 AMDGPU::OpName::src2);
3099
3100 if (IsMemOrExport || IsVALU) {
3101 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
3102 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3103 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3104 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3105 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3106 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3107 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3108 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3109 const int DotWriteSameDotReadSrcAB = 3;
3110 const int DotWriteDifferentVALURead = 3;
3111 const int DMFMABetweenVALUWriteVMEMRead = 2;
3112 const int MaxWaitStates = 19;
3113
3114 for (const MachineOperand &Use : MI->explicit_uses()) {
3115 if (!Use.isReg())
3116 continue;
3117 Reg = Use.getReg();
3118
3119 DOT = nullptr;
3120 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3121 MaxWaitStates);
3122 if (DOT) {
3123 int NeedWaitStates = 0;
3124 if (DOT->getOpcode() == MI->getOpcode()) {
3125 if (&Use - &MI->getOperand(0) != SrcCIdx)
3126 NeedWaitStates = DotWriteSameDotReadSrcAB;
3127 } else {
3128 NeedWaitStates = DotWriteDifferentVALURead;
3129 }
3130
3131 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3132 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3133 }
3134
3135 // Workaround for HW data hazard bug observed only in GFX90A. When there
3136 // is a DGEMM instruction in-between a VALU and a VMEM instruction it
3137 // causes the SQ to incorrectly not insert two wait states between the two
3138 // instructions needed to avoid data hazard.
3139 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3140 DGEMMAfterVALUWrite = false;
3141 if (TRI.isVectorRegister(MRI, Reg)) {
3142 int WaitStatesNeededForUse =
3143 DMFMABetweenVALUWriteVMEMRead -
3144 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
3145 DMFMABetweenVALUWriteVMEMRead);
3146
3147 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3148 }
3149 }
3150
3151 MFMA = nullptr;
3152 WaitStatesSinceDef =
3153 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3154 if (!MFMA)
3155 continue;
3156
3157 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3158 int NumPasses = HazardDefLatency;
3159 int NeedWaitStates = MaxWaitStates;
3160
3161 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3162 switch (HazardDefLatency) {
3163 case 4:
3164 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3165 : DMFMA4x4WriteVgprVALUReadWaitStates;
3166 break;
3167 case 8:
3168 case 16:
3169 NeedWaitStates =
3170 IsMemOrExport
3171 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3172 : (ST.hasGFX950Insts()
3173 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3174 : DMFMA16x16WriteVgprVALUReadWaitStates);
3175 break;
3176 default:
3177 llvm_unreachable("unexpected dgemm");
3178 }
3179 } else if (ST.hasGFX940Insts()) {
3180 NeedWaitStates =
3181 TII.isXDL(*MFMA)
3183 NumPasses, ST.hasGFX950Insts())
3185 NumPasses);
3186 } else {
3187 switch (HazardDefLatency) {
3188 case 2:
3189 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3190 break;
3191 case 8:
3192 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3193 break;
3194 case 16:
3195 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3196 break;
3197 default:
3198 llvm_unreachable("unexpected number of passes for mfma");
3199 }
3200 }
3201
3202 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3203 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3204
3205 if (WaitStatesNeeded == MaxWaitStates)
3206 break;
3207 }
3208 }
3209
3210 unsigned Opc = MI->getOpcode();
3211 const int DMFMAToFMA64WaitStates = 2;
3212 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3213 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3214 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3215 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3216 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3217 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3218 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3219 }
3220
3221 if (!IsVALU && !IsMemOrExport)
3222 return WaitStatesNeeded;
3223
3224 for (const MachineOperand &Def : MI->defs()) {
3225 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3226 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3227 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3228 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3229 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3230 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3231 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3232 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3233 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3234 const int DotWriteDifferentVALUWrite = 3;
3235 const int MaxWaitStates = 19;
3236 const int MaxWarWaitStates = 15;
3237
3238 Reg = Def.getReg();
3239
3240 DOT = nullptr;
3241 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3242 MaxWaitStates);
3243 if (DOT && DOT->getOpcode() != MI->getOpcode())
3244 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3245 WaitStatesSinceDef);
3246
3247 MFMA = nullptr;
3248 WaitStatesSinceDef =
3249 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3250 if (MFMA) {
3251 int NeedWaitStates = MaxWaitStates;
3252 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
3253
3254 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3255 switch (NumPasses) {
3256 case 4:
3257 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3258 break;
3259 case 8:
3260 case 16:
3261 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3262 break;
3263 default:
3264 llvm_unreachable("unexpected number of cycles for dgemm");
3265 }
3266 } else if (ST.hasGFX940Insts()) {
3267 NeedWaitStates =
3268 TII.isXDL(*MFMA)
3270 NumPasses, ST.hasGFX950Insts())
3272 } else {
3273 switch (NumPasses) {
3274 case 2:
3275 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3276 break;
3277 case 8:
3278 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3279 break;
3280 case 16:
3281 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3282 break;
3283 default:
3284 llvm_unreachable("Unexpected number of passes for mfma");
3285 }
3286 }
3287
3288 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3289 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3290
3291 if (WaitStatesNeeded == MaxWaitStates)
3292 break;
3293 }
3294
3295 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3296 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
3297 !MI.readsRegister(Reg, &TRI))
3298 return false;
3299
3300 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3301 return false;
3302
3303 const MachineOperand *SrcC =
3304 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
3305 assert(SrcC);
3306 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
3307 return false;
3308
3309 MFMA = &MI;
3310 return true;
3311 };
3312
3313 MFMA = nullptr;
3314 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3315 MaxWarWaitStates);
3316 if (!MFMA)
3317 continue;
3318
3319 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3320 int NeedWaitStates = MaxWaitStates;
3321 switch (HazardDefLatency) {
3322 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3323 break;
3324 case 4: assert(ST.hasGFX940Insts());
3325 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3326 break;
3327 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3328 break;
3329 case 16: [[fallthrough]];
3330 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3331 break;
3332 }
3333
3334 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3335 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3336 }
3337
3338 return WaitStatesNeeded;
3339}
3340
3342 if (!SU->isInstr())
3343 return false;
3344
3345 const MachineInstr *MAI = nullptr;
3346
3347 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3348 MAI = nullptr;
3350 MAI = &MI;
3351 return MAI != nullptr;
3352 };
3353
3354 MachineInstr *MI = SU->getInstr();
3355 if (IsMFMAFn(*MI)) {
3356 int W = getWaitStatesSince(IsMFMAFn, 16);
3357 if (MAI)
3358 return W < (int)TSchedModel.computeInstrLatency(MAI);
3359 }
3360
3361 return false;
3362}
3363
3364// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3365// insertion of a new instruction.
3366static void updateGetPCBundle(MachineInstr *NewMI) {
3367 if (!NewMI->isBundled())
3368 return;
3369
3370 // Find start of bundle.
3371 auto I = NewMI->getIterator();
3372 while (I->isBundledWithPred())
3373 I--;
3374 if (I->isBundle())
3375 I++;
3376
3377 // Bail if this is not an S_GETPC bundle.
3378 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3379 return;
3380
3381 // Update offsets of any references in the bundle.
3382 const unsigned NewBytes = 4;
3383 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3384 "Unexpected instruction insertion in bundle");
3385 auto NextMI = std::next(NewMI->getIterator());
3386 auto End = NewMI->getParent()->end();
3387 while (NextMI != End && NextMI->isBundledWithPred()) {
3388 for (auto &Operand : NextMI->operands()) {
3389 if (Operand.isGlobal())
3390 Operand.setOffset(Operand.getOffset() + NewBytes);
3391 }
3392 NextMI++;
3393 }
3394}
3395
3396bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
3397 if (!ST.hasVALUMaskWriteHazard())
3398 return false;
3399 assert(!ST.hasExtendedWaitCounts());
3400
3401 if (!ST.isWave64())
3402 return false;
3403
3404 const bool IsSALU = SIInstrInfo::isSALU(*MI);
3405 const bool IsVALU = SIInstrInfo::isVALU(*MI);
3406 if (!IsSALU && !IsVALU)
3407 return false;
3408
3409 // The hazard sequence is three instructions:
3410 // 1. VALU reads SGPR as mask
3411 // 2. VALU/SALU writes SGPR
3412 // 3. VALU/SALU reads SGPR
3413 // The hazard can expire if the distance between 2 and 3 is sufficient,
3414 // or (2) is VALU and (3) is SALU.
3415 // In practice this happens <10% of the time, hence always assume the hazard
3416 // exists if (1) and (2) are present to avoid searching all SGPR reads.
3417
3418 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3419 const MachineRegisterInfo &MRI = MF.getRegInfo();
3420
3421 auto IgnoreableSGPR = [](const Register Reg) {
3422 switch (Reg) {
3423 case AMDGPU::EXEC:
3424 case AMDGPU::EXEC_LO:
3425 case AMDGPU::EXEC_HI:
3426 case AMDGPU::M0:
3427 case AMDGPU::SGPR_NULL:
3428 case AMDGPU::SGPR_NULL64:
3429 case AMDGPU::SCC:
3430 return true;
3431 default:
3432 return false;
3433 }
3434 };
3435 auto IsVCC = [](const Register Reg) {
3436 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3437 };
3438
3439 struct StateType {
3440 SmallSet<Register, 2> HazardSGPRs;
3441
3442 static unsigned getHashValue(const StateType &State) {
3443 return hash_combine_range(State.HazardSGPRs);
3444 }
3445 static bool isEqual(const StateType &LHS, const StateType &RHS) {
3446 return LHS.HazardSGPRs == RHS.HazardSGPRs;
3447 }
3448 };
3449
3450 SmallVector<const MachineInstr *> WaitInstrs;
3451 bool HasSGPRRead = false;
3452 StateType InitialState;
3453
3454 // Look for SGPR write.
3455 MachineOperand *HazardDef = nullptr;
3456 for (MachineOperand &Op : MI->operands()) {
3457 if (!Op.isReg())
3458 continue;
3459 if (Op.isDef() && HazardDef)
3460 continue;
3461
3462 Register Reg = Op.getReg();
3463 if (IgnoreableSGPR(Reg))
3464 continue;
3465 if (!IsVCC(Reg)) {
3466 if (Op.isImplicit())
3467 continue;
3468 if (!TRI->isSGPRReg(MRI, Reg))
3469 continue;
3470 }
3471 // Also check for SGPR reads.
3472 if (Op.isUse()) {
3473 HasSGPRRead = true;
3474 continue;
3475 }
3476
3477 assert(!HazardDef);
3478 HazardDef = &Op;
3479 }
3480
3481 if (!HazardDef)
3482 return false;
3483
3484 // Setup to track writes to individual SGPRs
3485 const Register HazardReg = HazardDef->getReg();
3486 if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
3487 InitialState.HazardSGPRs.insert(HazardReg);
3488 } else {
3489 assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
3490 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
3491 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
3492 }
3493
3494 auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
3495 if (State.HazardSGPRs.empty())
3496 return HazardExpired;
3497
3498 switch (I.getOpcode()) {
3499 case AMDGPU::V_ADDC_U32_e32:
3500 case AMDGPU::V_ADDC_U32_dpp:
3501 case AMDGPU::V_CNDMASK_B16_t16_e32:
3502 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3503 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3504 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3505 case AMDGPU::V_CNDMASK_B32_e32:
3506 case AMDGPU::V_CNDMASK_B32_dpp:
3507 case AMDGPU::V_DIV_FMAS_F32_e64:
3508 case AMDGPU::V_DIV_FMAS_F64_e64:
3509 case AMDGPU::V_SUBB_U32_e32:
3510 case AMDGPU::V_SUBB_U32_dpp:
3511 case AMDGPU::V_SUBBREV_U32_e32:
3512 case AMDGPU::V_SUBBREV_U32_dpp: {
3513 // These implicitly read VCC as mask source.
3514 return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
3515 }
3516 case AMDGPU::V_ADDC_U32_e64:
3517 case AMDGPU::V_ADDC_U32_e64_dpp:
3518 case AMDGPU::V_CNDMASK_B16_t16_e64:
3519 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3520 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3521 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3522 case AMDGPU::V_CNDMASK_B32_e64:
3523 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3524 case AMDGPU::V_SUBB_U32_e64:
3525 case AMDGPU::V_SUBB_U32_e64_dpp:
3526 case AMDGPU::V_SUBBREV_U32_e64:
3527 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3528 // Only check mask register overlaps.
3529 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3530 assert(SSRCOp);
3531 bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
3532 return Result ? HazardFound : NoHazardFound;
3533 }
3534 default:
3535 return NoHazardFound;
3536 }
3537 };
3538
3539 const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
3541 0),
3542 0);
3543 auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
3544 switch (I.getOpcode()) {
3545 case AMDGPU::S_WAITCNT_DEPCTR:
3546 // Record mergable waits within region of instructions free of SGPR reads.
3547 if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
3548 (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3549 WaitInstrs.push_back(&I);
3550 break;
3551 default:
3552 // Update tracking of SGPR reads and writes.
3553 for (auto &Op : I.operands()) {
3554 if (!Op.isReg())
3555 continue;
3556
3557 Register Reg = Op.getReg();
3558 if (IgnoreableSGPR(Reg))
3559 continue;
3560 if (!IsVCC(Reg)) {
3561 if (Op.isImplicit())
3562 continue;
3563 if (!TRI->isSGPRReg(MRI, Reg))
3564 continue;
3565 }
3566 if (Op.isUse()) {
3567 HasSGPRRead = true;
3568 continue;
3569 }
3570
3571 // Stop tracking any SGPRs with writes on the basis that they will
3572 // already have an appropriate wait inserted afterwards.
3574 for (Register SGPR : State.HazardSGPRs) {
3575 if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
3576 Found.push_back(SGPR);
3577 }
3578 for (Register SGPR : Found)
3579 State.HazardSGPRs.erase(SGPR);
3580 }
3581 break;
3582 }
3583 };
3584
3585 // Check for hazard
3586 if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
3587 MI->getParent(),
3588 std::next(MI->getReverseIterator())))
3589 return false;
3590
3591 // Compute counter mask
3592 unsigned DepCtr =
3593 IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
3594 : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
3595 : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);
3596
3597 // Try to merge previous waits into this one for regions with no SGPR reads.
3598 if (!WaitInstrs.empty()) {
3599 // Note: WaitInstrs contains const pointers, so walk backward from MI to
3600 // obtain a mutable pointer to each instruction to be merged.
3601 // This is expected to be a very short walk within the same block.
3602 SmallVector<MachineInstr *> ToErase;
3603 unsigned Found = 0;
3604 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3605 End = MI->getParent()->rend();
3606 Found < WaitInstrs.size() && It != End; ++It) {
3607 MachineInstr *WaitMI = &*It;
3608 // Find next wait instruction.
3609 if (std::as_const(WaitMI) != WaitInstrs[Found])
3610 continue;
3611 Found++;
3612 unsigned WaitMask = WaitMI->getOperand(0).getImm();
3613 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3614 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3615 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
3616 AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
3617 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3618 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
3619 AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
3620 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3621 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
3622 AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
3623 ToErase.push_back(WaitMI);
3624 }
3625 assert(Found == WaitInstrs.size());
3626 for (MachineInstr *WaitMI : ToErase)
3627 WaitMI->eraseFromParent();
3628 }
3629
3630 // Add s_waitcnt_depctr after SGPR write.
3631 auto NextMI = std::next(MI->getIterator());
3632 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3633 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3634 .addImm(DepCtr);
3635
3636 // SALU write may be s_getpc in a bundle.
3637 updateGetPCBundle(NewMI);
3638
3639 return true;
3640}
3641
3642static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3643 const SIInstrInfo &TII) {
3644 MachineBasicBlock &EntryMBB = MF->front();
3645 if (EntryMBB.begin() != EntryMBB.end()) {
3646 auto &EntryMI = *EntryMBB.begin();
3647 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3648 EntryMI.getOperand(0).getImm() >= Priority)
3649 return false;
3650 }
3651
3652 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3653 .addImm(Priority);
3654 return true;
3655}
3656
3657bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3658 if (!ST.hasRequiredExportPriority())
3659 return false;
3660
3661 // Assume the following shader types will never have exports,
3662 // and avoid adding or adjusting S_SETPRIO.
3663 MachineBasicBlock *MBB = MI->getParent();
3664 MachineFunction *MF = MBB->getParent();
3665 auto CC = MF->getFunction().getCallingConv();
3666 switch (CC) {
3671 return false;
3672 default:
3673 break;
3674 }
3675
3676 const int MaxPriority = 3;
3677 const int NormalPriority = 2;
3678 const int PostExportPriority = 0;
3679
3680 auto It = MI->getIterator();
3681 switch (MI->getOpcode()) {
3682 case AMDGPU::S_ENDPGM:
3683 case AMDGPU::S_ENDPGM_SAVED:
3684 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3685 case AMDGPU::SI_RETURN_TO_EPILOG:
3686 // Ensure shader with calls raises priority at entry.
3687 // This ensures correct priority if exports exist in callee.
3688 if (MF->getFrameInfo().hasCalls())
3689 return ensureEntrySetPrio(MF, NormalPriority, TII);
3690 return false;
3691 case AMDGPU::S_SETPRIO: {
3692 // Raise minimum priority unless in workaround.
3693 auto &PrioOp = MI->getOperand(0);
3694 int Prio = PrioOp.getImm();
3695 bool InWA = (Prio == PostExportPriority) &&
3696 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3697 if (InWA || Prio >= NormalPriority)
3698 return false;
3699 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3700 return true;
3701 }
3702 default:
3703 if (!TII.isEXP(*MI))
3704 return false;
3705 break;
3706 }
3707
3708 // Check entry priority at each export (as there will only be a few).
3709 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3710 bool Changed = false;
3712 Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3713
3714 auto NextMI = std::next(It);
3715 bool EndOfShader = false;
3716 if (NextMI != MBB->end()) {
3717 // Only need WA at end of sequence of exports.
3718 if (TII.isEXP(*NextMI))
3719 return Changed;
3720 // Assume appropriate S_SETPRIO after export means WA already applied.
3721 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3722 NextMI->getOperand(0).getImm() == PostExportPriority)
3723 return Changed;
3724 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3725 }
3726
3727 const DebugLoc &DL = MI->getDebugLoc();
3728
3729 // Lower priority.
3730 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3731 .addImm(PostExportPriority);
3732
3733 if (!EndOfShader) {
3734 // Wait for exports to complete.
3735 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3736 .addReg(AMDGPU::SGPR_NULL)
3737 .addImm(0);
3738 }
3739
3740 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3741 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3742
3743 if (!EndOfShader) {
3744 // Return to normal (higher) priority.
3745 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3746 .addImm(NormalPriority);
3747 }
3748
3749 return true;
3750}
3751
3752bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
  // Insert an s_waitcnt_depctr with immediate 0 (wait on all depctr fields)
  // before an s_getreg of particular hardware registers.
3753  if (!isSGetReg(MI->getOpcode()))
3754    return false;
3755
3756  const SIInstrInfo *TII = ST.getInstrInfo();
3757  switch (getHWReg(TII, *MI)) {
3758  default:
3759    return false;
  // NOTE(review): the case labels for the affected hardware registers
  // (original lines 3760-3763) were lost in extraction — restore from
  // upstream before relying on this listing.
3764    break;
3765  }
3766
  // Full wait: immediate 0 zeroes every depctr counter field.
3767  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3768          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3769      .addImm(0);
3770  return true;
3771}
3772
3773bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
  // Bracket DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 with s_waitcnt_depctr
  // instructions, one inserted before and one after it.
3774  if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3775    return false;
3776
3777  const SIInstrInfo *TII = ST.getInstrInfo();
  // Wait inserted immediately before the barrier-arrive.
3778  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3779          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  // NOTE(review): the .addImm(...) carrying the encoded depctr mask (original
  // line 3780) was lost in extraction — confirm the immediate upstream.
  // Wait inserted immediately after the barrier-arrive.
3781  BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
3782          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  // NOTE(review): the trailing .addImm(...) (original line 3783) was also
  // lost in extraction.
3784
3785  return true;
3786}
3787
3788bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  // Insert an s_waitcnt_depctr before reads of SRC_FLAT_SCRATCH_BASE_LO/HI
  // when the aliased SGPRs (102/103) were recently written by SALU/VALU.
3789  // No reason to check this in pre-RA scheduling, SGPRs have to be allocated
3790  // for hazard to trigger.
3791  if (!IsHazardRecognizerMode)
3792    return false;
3793
3794  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3795  const SIInstrInfo *TII = ST.getInstrInfo();
3796  // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
3797  const int FlatScrBaseWaitStates = 10;
3798
  // Direct register reads of the flat-scratch base halves.
3799  bool ReadsFlatScrLo =
3800      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3801  bool ReadsFlatScrHi =
3802      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
  // s_getreg of the corresponding HW registers also counts as a read.
3803  if (isSGetReg(MI->getOpcode())) {
3804    switch (getHWReg(TII, *MI)) {
3805    default:
3806      break;
  // NOTE(review): the case labels for the LO/HI HW-register encodings
  // (original lines 3807 and 3810) were lost in extraction.
3808      ReadsFlatScrLo = true;
3809      break;
3811      ReadsFlatScrHi = true;
3812      break;
3813    }
3814  }
3815
3816  const MachineRegisterInfo &MRI = MF.getRegInfo();
3817
  // Returns true if Reg was modified within the hazard window.
3818  auto IsRegDefHazard = [&](Register Reg) -> bool {
3819    DenseSet<const MachineBasicBlock *> Visited;
3820    auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
3821      return MI.modifiesRegister(Reg, TRI);
3822    };
3823
3824    // This literally abuses the idea of waitstates. Instead of waitstates it
3825    // returns 1 for SGPR written and 0 otherwise.
3826    auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
3827      if (!TII->isSALU(MI) && !TII->isVALU(MI))
3828        return 0;
3829      for (const MachineOperand &MO : MI.all_defs()) {
3830        if (TRI->isSGPRReg(MRI, MO.getReg()))
3831          return 1;
3832      }
3833      return 0;
3834    };
3835
  // Search terminates at a suitable s_waitcnt_depctr or once enough SGPR
  // writes have been seen.
3836    auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
3837      if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3838        unsigned Wait = MI.getOperand(0).getImm();
  // NOTE(review): the condition testing the decoded depctr fields of Wait
  // (original lines 3839-3840) was lost in extraction.
3841          return true;
3842      }
3843      return SgprWrites >= FlatScrBaseWaitStates;
3844    };
3845
3846    return ::getWaitStatesSince(
3847        IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
3848        0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3849  };
3850
  // No hazard if neither half is read, or the backing SGPR is constant, or
  // it was not recently redefined.
3851  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
3852       !IsRegDefHazard(AMDGPU::SGPR102)) &&
3853      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
3854       !IsRegDefHazard(AMDGPU::SGPR103)))
3855    return false;
3856
  // Insert the mitigating wait before MI.
3857  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3858          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  // NOTE(review): the .addImm(...) with the encoded depctr immediate
  // (original lines 3859-3860) was lost in extraction.
3861  return true;
3862}
3863
3864bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3865 if (!isSSetReg(MI->getOpcode()) ||
3866 MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
3867 return false;
3868
3869 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3870 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3871 return true;
3872}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static cl::opt< bool > EnableWMMAVnopHoisting("amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden, cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"))
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel fowarding issue Dst .
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static bool IsWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, unsigned Latency, unsigned Category)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static llvm::Error parse(DataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
Definition LineTable.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
unsigned get(InstCounterType T) const
BitVector & set()
Definition BitVector.h:370
A debug info location.
Definition DebugLoc.h:123
std::pair< iterator, bool > insert_as(std::pair< KeyT, ValueT > &&KV, const LookupKeyT &Val)
Alternate version of insert() which allows a different, and possibly less expensive,...
Definition DenseMap.h:274
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
unsigned getHazardWaitStates(MachineInstr *MI) const
Returns the number of wait states until all hazards for MI are resolved.
unsigned PreEmitNoopsCommon(MachineInstr *) const
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be schedule.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
bool ShouldPreferAnother(SUnit *SU) const override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
GCNHazardRecognizer(const MachineFunction &MF, MachineLoopInfo *MLI=nullptr)
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
Instructions::const_reverse_iterator const_reverse_instr_iterator
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isMAI(const MCInstrDesc &Desc)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:490
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
@ Entry
Definition COFF.h:862
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
This namespace contains all of the command line option processing machinery.
Definition CommandLine.h:52
initializer< Ty > init(const Ty &Val)
constexpr double e
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Wait
Definition Threading.h:60
constexpr RegState getDeadRegState(bool B)
Op::Description Desc
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
DWARFExpression::Operation Op
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
An information struct used to provide DenseMap with the various necessary components for a given valu...