1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
22
23using namespace llvm;
24
25namespace {
26
27struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
28 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
29
30 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
31 if (Arg.getAsInteger(0, Value))
32 return O.error("'" + Arg + "' value invalid for uint argument!");
33
34 if (Value > 100)
35 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
36
37 return false;
38 }
39};
40
41} // end anonymous namespace
42
43static cl::opt<unsigned, false, MFMAPaddingRatioParser>
44 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
45 cl::desc("Fill a percentage of the latency between "
46 "neighboring MFMA with s_nops."));
47
48// This is intended for debugging purposes only.
49static cl::opt<unsigned>
50 NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
51 cl::desc("Insert a s_nop x before every instruction"));
52
53//===----------------------------------------------------------------------===//
54// Hazard Recognizer Implementation
55//===----------------------------------------------------------------------===//
56
57static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
58 const GCNSubtarget &ST);
59
60GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
61 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
62 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
63 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
64 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
65 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
66 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
67}
68
69void GCNHazardRecognizer::Reset() {
70 EmittedInstrs.clear();
71}
72
73void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
74 EmitInstruction(SU->getInstr());
75}
76
77void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
78 CurrCycleInstr = MI;
79}
80
81static bool isDivFMas(unsigned Opcode) {
82 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
83}
84
85static bool isSGetReg(unsigned Opcode) {
86 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
87}
88
89static bool isSSetReg(unsigned Opcode) {
90 switch (Opcode) {
91 case AMDGPU::S_SETREG_B32:
92 case AMDGPU::S_SETREG_B32_mode:
93 case AMDGPU::S_SETREG_IMM32_B32:
94 case AMDGPU::S_SETREG_IMM32_B32_mode:
95 return true;
96 }
97 return false;
98}
99
100static bool isRWLane(unsigned Opcode) {
101 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
102}
103
104static bool isRFE(unsigned Opcode) {
105 return Opcode == AMDGPU::S_RFE_B64;
106}
107
108static bool isSMovRel(unsigned Opcode) {
109 switch (Opcode) {
110 case AMDGPU::S_MOVRELS_B32:
111 case AMDGPU::S_MOVRELS_B64:
112 case AMDGPU::S_MOVRELD_B32:
113 case AMDGPU::S_MOVRELD_B64:
114 return true;
115 default:
116 return false;
117 }
118}
119
120static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
121 const MachineInstr &MI) {
122 if (TII.isAlwaysGDS(MI.getOpcode()))
123 return true;
124
125 switch (MI.getOpcode()) {
126 case AMDGPU::S_SENDMSG:
127 case AMDGPU::S_SENDMSGHALT:
128 case AMDGPU::S_TTRACEDATA:
129 return true;
130 // These DS opcodes don't support GDS.
131 case AMDGPU::DS_NOP:
132 case AMDGPU::DS_PERMUTE_B32:
133 case AMDGPU::DS_BPERMUTE_B32:
134 return false;
135 default:
136 if (TII.isDS(MI.getOpcode())) {
137 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
138 AMDGPU::OpName::gds);
139 if (MI.getOperand(GDS).getImm())
140 return true;
141 }
142 return false;
143 }
144}
145
146static bool isPermlane(const MachineInstr &MI) {
147 unsigned Opcode = MI.getOpcode();
148 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
149 Opcode == AMDGPU::V_PERMLANE64_B32 ||
150 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
151 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
152 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
153 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
154 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
155 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
156 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
157 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
158 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
159 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
160 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
161 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
162}
163
164static bool isLdsDma(const MachineInstr &MI) {
165 return SIInstrInfo::isVALU(MI) &&
166 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
167}
168
169static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
170 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
171 AMDGPU::OpName::simm16);
172 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
173}
174
175ScheduleHazardRecognizer::HazardType
176GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
177 MachineInstr *MI = SU->getInstr();
178 // If we are not in "HazardRecognizerMode" and therefore not being run from
179 // the scheduler, track possible stalls from hazards but don't insert noops.
180 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
181
182 if (MI->isBundle())
183 return NoHazard;
184
185 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
186 return HazardType;
187
188 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
189 return HazardType;
190
191 if (checkFPAtomicToDenormModeHazard(MI) > 0)
192 return HazardType;
193
194 // Hazards which cannot be mitigated with S_NOPs.
195 if (!IsHazardRecognizerMode) {
196 if (checkWMMACoexecutionHazards(MI) > 0)
197 return Hazard;
198 }
199
200 if (ST.hasNoDataDepHazard())
201 return NoHazard;
202
203 if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
204 return HazardType;
205
206 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
207 return HazardType;
208
209 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
210 return HazardType;
211
212 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
213 return HazardType;
214
215 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
216 return HazardType;
217
218 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
219 SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
220 checkMAIVALUHazards(MI) > 0)
221 return HazardType;
222
223 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
224 return HazardType;
225
226 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
227 return HazardType;
228
229 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
230 return HazardType;
231
232 if (((ST.hasReadM0MovRelInterpHazard() &&
233 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
234 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
235 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
236 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
237 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
238 (ST.hasReadM0LdsDirectHazard() &&
239 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
240 checkReadM0Hazards(MI) > 0)
241 return HazardType;
242
243 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
244 return HazardType;
245
246 if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI)) &&
247 checkMAILdStHazards(MI) > 0)
248 return HazardType;
249
250 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
251 return HazardType;
252
253 return NoHazard;
254}
255
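// Helper used in hazard recognizer mode for instructions inside bundles: it
// materializes \p Quantity wait states as s_nop instructions immediately
// before \p MI. s_nop N waits N+1 cycles and the count is emitted in chunks
// of at most 8, so e.g. a quantity of 10 becomes "s_nop 7" followed by
// "s_nop 1".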
256static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
257 unsigned Quantity) {
258 while (Quantity > 0) {
259 unsigned Arg = std::min(Quantity, 8u);
260 Quantity -= Arg;
261 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
262 .addImm(Arg - 1);
263 }
264}
265
266unsigned
267GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
268 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
269 assert(TSchedModel.getWriteProcResBegin(SC) !=
270 TSchedModel.getWriteProcResEnd(SC));
271 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
272}
273
274void GCNHazardRecognizer::processBundle() {
275 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
276 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
277 // Check bundled MachineInstr's for hazards.
278 for (; MI != E && MI->isInsideBundle(); ++MI) {
279 CurrCycleInstr = &*MI;
280 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
281
282 if (IsHazardRecognizerMode) {
283 fixHazards(CurrCycleInstr);
284
285 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
286 }
287
288 // It's unnecessary to track more than MaxLookAhead instructions. Since we
289 // include the bundled MI directly after, only add a maximum of
290 // (MaxLookAhead - 1) noops to EmittedInstrs.
291 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
292 EmittedInstrs.push_front(nullptr);
293
294 EmittedInstrs.push_front(CurrCycleInstr);
295 EmittedInstrs.resize(MaxLookAhead);
296 }
297 CurrCycleInstr = nullptr;
298}
299
300void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
301 assert(IsHazardRecognizerMode);
302
303 unsigned NumPreNoops = PreEmitNoops(MI);
304 EmitNoops(NumPreNoops);
305 if (MI->isInsideBundle())
306 insertNoopsInBundle(MI, TII, NumPreNoops);
307 else
308 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
309 NumPreNoops);
311 AdvanceCycle();
312}
313
314unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
315 IsHazardRecognizerMode = true;
316 CurrCycleInstr = MI;
317 unsigned W = PreEmitNoopsCommon(MI);
318 fixHazards(MI);
319 CurrCycleInstr = nullptr;
320 return std::max(W, NopPadding.getValue());
321}
322
323unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
324 if (MI->isBundle())
325 return 0;
326
327 int WaitStates = 0;
328
329 if (SIInstrInfo::isSMRD(*MI))
330 return std::max(WaitStates, checkSMRDHazards(MI));
331
332 if (ST.hasNSAtoVMEMBug())
333 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
334
335 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
336
337 if (ST.hasNoDataDepHazard())
338 return WaitStates;
339
340 if (SIInstrInfo::isVMEM(*MI))
341 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
342
343 if (SIInstrInfo::isVALU(*MI))
344 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
345
346 if (SIInstrInfo::isDPP(*MI))
347 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
348
349 if (isDivFMas(MI->getOpcode()))
350 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
351
352 if (isRWLane(MI->getOpcode()))
353 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
354
355 if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
356 SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
357 checkMAIVALUHazards(MI) > 0)
358 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
359
360 if (MI->isInlineAsm())
361 return std::max(WaitStates, checkInlineAsmHazards(MI));
362
363 if (isSGetReg(MI->getOpcode()))
364 return std::max(WaitStates, checkGetRegHazards(MI));
365
366 if (isSSetReg(MI->getOpcode()))
367 return std::max(WaitStates, checkSetRegHazards(MI));
368
369 if (isRFE(MI->getOpcode()))
370 return std::max(WaitStates, checkRFEHazards(MI));
371
372 if ((ST.hasReadM0MovRelInterpHazard() &&
373 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
374 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
375 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
376 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
377 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
378 (ST.hasReadM0LdsDirectHazard() &&
379 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
380 return std::max(WaitStates, checkReadM0Hazards(MI));
381
382 if (SIInstrInfo::isMAI(*MI))
383 return std::max(WaitStates, checkMAIHazards(MI));
384
385 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI))
386 return std::max(WaitStates, checkMAILdStHazards(MI));
387
388 if (ST.hasGFX950Insts() && isPermlane(*MI))
389 return std::max(WaitStates, checkPermlaneHazards(MI));
390
391 return WaitStates;
392}
393
394void GCNHazardRecognizer::EmitNoop() {
395 EmittedInstrs.push_front(nullptr);
396}
397
398void GCNHazardRecognizer::AdvanceCycle() {
399 // When the scheduler detects a stall, it will call AdvanceCycle() without
400 // emitting any instructions.
401 if (!CurrCycleInstr) {
402 EmittedInstrs.push_front(nullptr);
403 return;
404 }
405
406 if (CurrCycleInstr->isBundle()) {
407 processBundle();
408 return;
409 }
410
411 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
412 if (!NumWaitStates) {
413 CurrCycleInstr = nullptr;
414 return;
415 }
416
417 // Keep track of emitted instructions
418 EmittedInstrs.push_front(CurrCycleInstr);
419
420 // Add a nullptr for each additional wait state after the first. Make sure
421 // not to add more than getMaxLookAhead() items to the list, since we
422 // truncate the list to that size right after this loop.
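 // For example, an instruction with NumWaitStates == 3 is recorded as the
 // instruction itself followed by two nullptr entries.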
423 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
424 i < e; ++i) {
425 EmittedInstrs.push_front(nullptr);
426 }
427
428 // getMaxLookahead() is the largest number of wait states we will ever need
429 // to insert, so there is no point in keeping track of more than that many
430 // wait states.
431 EmittedInstrs.resize(getMaxLookAhead());
432
433 CurrCycleInstr = nullptr;
434}
435
436void GCNHazardRecognizer::RecedeCycle() {
437 assert(!IsHazardRecognizerMode &&
438 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
439}
440
441//===----------------------------------------------------------------------===//
442// Helper Functions
443//===----------------------------------------------------------------------===//
444
445using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
446
447// Search for a hazard in a block and its predecessors.
448template <typename StateT>
449static bool
450hasHazard(StateT InitialState,
451 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
452 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
453 const MachineBasicBlock *InitialMBB,
454 MachineBasicBlock::reverse_instr_iterator I) {
455 struct StateMapKey {
456 SmallVectorImpl<StateT> *States;
457 unsigned Idx;
458 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
459 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
460 }
461 };
462 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
463 static inline StateMapKey getEmptyKey() {
464 return {static_cast<SmallVectorImpl<StateT> *>(
465 DenseMapInfo<void *>::getEmptyKey()),
466 DenseMapInfo<unsigned>::getEmptyKey()};
467 }
468 static inline StateMapKey getTombstoneKey() {
469 return {static_cast<SmallVectorImpl<StateT> *>(
470 DenseMapInfo<void *>::getTombstoneKey()),
471 DenseMapInfo<unsigned>::getTombstoneKey()};
472 }
473 static unsigned getHashValue(const StateMapKey &Key) {
474 return StateT::getHashValue((*Key.States)[Key.Idx]);
475 }
476 static unsigned getHashValue(const StateT &State) {
477 return StateT::getHashValue(State);
478 }
479 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
480 const auto EKey = getEmptyKey();
481 const auto TKey = getTombstoneKey();
482 if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
483 StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
484 return StateMapKey::isEqual(LHS, RHS);
485 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
486 }
487 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
488 if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
489 StateMapKey::isEqual(RHS, getTombstoneKey()))
490 return false;
491 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
492 }
493 };
494
497
499 const MachineBasicBlock *MBB = InitialMBB;
500 StateT State = InitialState;
501
503 unsigned WorkIdx = 0;
504 for (;;) {
505 bool Expired = false;
506 for (auto E = MBB->instr_rend(); I != E; ++I) {
507 // No need to look at parent BUNDLE instructions.
508 if (I->isBundle())
509 continue;
510
511 auto Result = IsHazard(State, *I);
512 if (Result == HazardFound)
513 return true;
514 if (Result == HazardExpired) {
515 Expired = true;
516 break;
517 }
518
519 if (I->isInlineAsm() || I->isMetaInstruction())
520 continue;
521
522 UpdateState(State, *I);
523 }
524
525 if (!Expired) {
526 unsigned StateIdx = States.size();
527 StateMapKey Key = {&States, StateIdx};
528 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
529 if (Insertion.second) {
530 States.emplace_back(State);
531 } else {
532 StateIdx = Insertion.first->second;
533 }
534 for (MachineBasicBlock *Pred : MBB->predecessors())
535 Worklist.insert(std::pair(Pred, StateIdx));
536 }
537
538 if (WorkIdx == Worklist.size())
539 break;
540
541 unsigned StateIdx;
542 std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
543 State = States[StateIdx];
544 I = MBB->instr_rbegin();
545 }
546
547 return false;
548}
549
550// Returns a minimum wait states since \p I walking all predecessors.
551// Only scans until \p IsExpired does not return true.
552// Can only be run in a hazard recognizer mode.
553static int
554getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
555 const MachineBasicBlock *MBB,
556 MachineBasicBlock::reverse_instr_iterator I,
557 int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
558 DenseSet<const MachineBasicBlock *> &Visited,
559 GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates) {
560
561 for (auto E = MBB->instr_rend(); I != E; ++I) {
562 // Don't add WaitStates for parent BUNDLE instructions.
563 if (I->isBundle())
564 continue;
565
566 if (IsHazard(*I))
567 return WaitStates;
568
569 if (I->isInlineAsm())
570 continue;
571
572 WaitStates += GetNumWaitStates(*I);
573
574 if (IsExpired(*I, WaitStates))
575 return std::numeric_limits<int>::max();
576 }
577
578 int MinWaitStates = std::numeric_limits<int>::max();
579 for (MachineBasicBlock *Pred : MBB->predecessors()) {
580 if (!Visited.insert(Pred).second)
581 continue;
582
583 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
584 IsExpired, Visited, GetNumWaitStates);
585
586 MinWaitStates = std::min(MinWaitStates, W);
587 }
588
589 return MinWaitStates;
590}
591
592static int
593getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
594 const MachineInstr *MI,
595 GCNHazardRecognizer::IsExpiredFn IsExpired,
596 GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
597 SIInstrInfo::getNumWaitStates) {
598 DenseSet<const MachineBasicBlock *> Visited;
599 return getWaitStatesSince(IsHazard, MI->getParent(),
600 std::next(MI->getReverseIterator()), 0, IsExpired,
601 Visited, GetNumWaitStates);
602}
603
604int GCNHazardRecognizer::getWaitStatesSince(
605 IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) {
606 if (IsHazardRecognizerMode) {
607 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
608 return WaitStates >= Limit;
609 };
610 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
611 GetNumWaitStates);
612 }
613
614 int WaitStates = 0;
615 for (MachineInstr *MI : EmittedInstrs) {
616 if (MI) {
617 if (IsHazard(*MI))
618 return WaitStates;
619
620 if (MI->isInlineAsm())
621 continue;
622 }
623 WaitStates += MI ? GetNumWaitStates(*MI) : 1;
624
625 if (WaitStates >= Limit)
626 break;
627 }
628 return std::numeric_limits<int>::max();
629}
630
631int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
632 return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
633}
634
635int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
636 IsHazardFn IsHazardDef,
637 int Limit) {
638 const SIRegisterInfo *TRI = ST.getRegisterInfo();
639
640 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
641 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
642 };
643
644 return getWaitStatesSince(IsHazardFn, Limit);
645}
646
647int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
648 int Limit) {
649 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
650 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
651 };
652
653 return getWaitStatesSince(IsHazardFn, Limit);
654}
655
656//===----------------------------------------------------------------------===//
657// No-op Hazard Detection
658//===----------------------------------------------------------------------===//
659
660static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
661 MCRegister Reg) {
662 for (MCRegUnit Unit : TRI.regunits(Reg))
663 BV.set(static_cast<unsigned>(Unit));
664}
665
666static void addRegsToSet(const SIRegisterInfo &TRI,
667 iterator_range<MachineInstr::const_mop_iterator> Ops,
668 BitVector &DefSet, BitVector &UseSet) {
669 for (const MachineOperand &Op : Ops) {
670 if (Op.isReg())
671 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
672 }
673}
674
675void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
676 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
677}
678
679static bool breaksSMEMSoftClause(MachineInstr *MI) {
680 return !SIInstrInfo::isSMRD(*MI);
681}
682
683static bool breaksVMEMSoftClause(MachineInstr *MI) {
684 return !SIInstrInfo::isVMEM(*MI);
685}
686
687int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
688 // SMEM soft clauses are only present on VI+, and only matter if xnack is
689 // enabled.
690 if (!ST.isXNACKEnabled())
691 return 0;
692
693 bool IsSMRD = TII.isSMRD(*MEM);
694
695 resetClause();
696
697 // A soft-clause is any group of consecutive SMEM instructions. The
698 // instructions in this group may return out of order and/or may be
699 // replayed (i.e. the same instruction issued more than once).
700 //
701 // In order to handle these situations correctly we need to make sure that
702 // when a clause has more than one instruction, no instruction in the clause
703 // writes to a register that is read by another instruction in the clause
704 // (including itself). If we encounter this situation, we need to break the
705 // clause by inserting a non SMEM instruction.
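 //
 // As an illustration (hypothetical sequence), the following two loads form a
 // clause whose defs overlap its uses, so a hazard is reported and the clause
 // is broken by inserting a non-SMEM instruction (an s_nop) between them:
 //   s_load_dwordx2 s[0:1], s[4:5], 0x0
 //   s_load_dword   s2,     s[0:1], 0x0   ; reads s[0:1] written by the clause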
706
707 for (MachineInstr *MI : EmittedInstrs) {
708 // When we hit a non-SMEM instruction then we have passed the start of the
709 // clause and we can stop.
710 if (!MI)
711 break;
712
713 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
714 break;
715
716 addClauseInst(*MI);
717 }
718
719 if (ClauseDefs.none())
720 return 0;
721
722 // We need to make sure not to put loads and stores in the same clause if they
723 // use the same address. For now, just start a new clause whenever we see a
724 // store.
725 if (MEM->mayStore())
726 return 1;
727
728 addClauseInst(*MEM);
729
730 // If the set of defs and uses intersect then we cannot add this instruction
731 // to the clause, so we have a hazard.
732 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
733}
734
735int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
736 int WaitStatesNeeded = 0;
737
738 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
739
740 // This SMRD hazard only affects SI.
741 if (!ST.hasSMRDReadVALUDefHazard())
742 return WaitStatesNeeded;
743
744 // A read of an SGPR by SMRD instruction requires 4 wait states when the
745 // SGPR was written by a VALU instruction.
746 int SmrdSgprWaitStates = 4;
747 auto IsHazardDefFn = [this](const MachineInstr &MI) {
748 return TII.isVALU(MI);
749 };
750 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
751 return TII.isSALU(MI);
752 };
753
754 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
755
756 for (const MachineOperand &Use : SMRD->uses()) {
757 if (!Use.isReg())
758 continue;
759 int WaitStatesNeededForUse =
760 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
761 SmrdSgprWaitStates);
762 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
763
764 // This fixes what appears to be undocumented hardware behavior in SI where
765 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
766 // needs some number of nops in between. We don't know how many we need, but
767 // let's use 4. This wasn't discovered before probably because the only
768 // case when this happens is when we expand a 64-bit pointer into a full
769 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
770 // probably never encountered in the closed-source land.
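 // Illustrative (hypothetical) shape of the problematic sequence:
 //   s_mov_b32           s7, 0x27000      ; SALU writes part of the descriptor
 //   s_buffer_load_dword s0, s[4:7], 0x0  ; buffer SMRD reads it too soon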
771 if (IsBufferSMRD) {
772 int WaitStatesNeededForUse =
773 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
774 IsBufferHazardDefFn,
775 SmrdSgprWaitStates);
776 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
777 }
778 }
779
780 return WaitStatesNeeded;
781}
782
783int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
784 if (!ST.hasVMEMReadSGPRVALUDefHazard())
785 return 0;
786
787 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
788
789 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
790 // SGPR was written by a VALU Instruction.
791 const int VmemSgprWaitStates = 5;
792 auto IsHazardDefFn = [this](const MachineInstr &MI) {
793 return TII.isVALU(MI);
794 };
795 for (const MachineOperand &Use : VMEM->uses()) {
796 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
797 continue;
798
799 int WaitStatesNeededForUse =
800 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
801 VmemSgprWaitStates);
802 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
803 }
804 return WaitStatesNeeded;
805}
806
807int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
808 const SIRegisterInfo *TRI = ST.getRegisterInfo();
809 const SIInstrInfo *TII = ST.getInstrInfo();
810
811 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
812 int DppVgprWaitStates = 2;
813 int DppExecWaitStates = 5;
814 int WaitStatesNeeded = 0;
815 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
816 return TII->isVALU(MI);
817 };
818
819 for (const MachineOperand &Use : DPP->uses()) {
820 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
821 continue;
822 int WaitStatesNeededForUse =
823 DppVgprWaitStates - getWaitStatesSinceDef(
824 Use.getReg(),
825 [](const MachineInstr &) { return true; },
826 DppVgprWaitStates);
827 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
828 }
829
830 WaitStatesNeeded = std::max(
831 WaitStatesNeeded,
832 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
833 DppExecWaitStates));
834
835 return WaitStatesNeeded;
836}
837
838int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
839 const SIInstrInfo *TII = ST.getInstrInfo();
840
841 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
842 // instruction.
843 const int DivFMasWaitStates = 4;
844 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
845 return TII->isVALU(MI);
846 };
847 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
848 DivFMasWaitStates);
849
850 return DivFMasWaitStates - WaitStatesNeeded;
851}
852
853int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
854 const SIInstrInfo *TII = ST.getInstrInfo();
855 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
856
857 const int GetRegWaitStates = 2;
858 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
859 return GetRegHWReg == getHWReg(TII, MI);
860 };
861 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
862
863 return GetRegWaitStates - WaitStatesNeeded;
864}
865
866int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
867 const SIInstrInfo *TII = ST.getInstrInfo();
868 unsigned HWReg = getHWReg(TII, *SetRegInstr);
869
870 const int SetRegWaitStates = ST.getSetRegWaitStates();
871 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
872 return HWReg == getHWReg(TII, MI);
873 };
874 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
875 return SetRegWaitStates - WaitStatesNeeded;
876}
877
878int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
879 if (!MI.mayStore())
880 return -1;
881
882 const SIInstrInfo *TII = ST.getInstrInfo();
883 unsigned Opcode = MI.getOpcode();
884 const MCInstrDesc &Desc = MI.getDesc();
885
886 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
887 int VDataRCID = -1;
888 if (VDataIdx != -1)
889 VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);
890
891 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
892 // There is no hazard if the instruction does not use vector regs
893 // (like wbinvl1)
894 if (VDataIdx == -1)
895 return -1;
896 // For MUBUF/MTBUF instructions this hazard only exists if the
897 // instruction is not using a register in the soffset field.
898 const MachineOperand *SOffset =
899 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
900 // If we have no soffset operand, then assume this field has been
901 // hardcoded to zero.
902 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
903 (!SOffset || !SOffset->isReg()))
904 return VDataIdx;
905 }
906
907 // MIMG instructions create a hazard if they don't use a 256-bit T# and
908 // the store size is greater than 8 bytes and they have more than two bits
909 // of their dmask set.
910 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
911 if (TII->isMIMG(MI)) {
912 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
913 assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
914 Desc.operands()[SRsrcIdx])) == 256);
915 (void)SRsrcIdx;
916 }
917
918 if (TII->isFLAT(MI)) {
919 // There is no hazard if the instruction does not use vector regs
920 if (VDataIdx == -1)
921 return -1;
922
923 if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
924 return VDataIdx;
925 }
926
927 return -1;
928}
929
930int
931GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
932 const MachineRegisterInfo &MRI) {
933 // Helper to check for the hazard where VMEM instructions that store more than
934 // 8 bytes can have their store data overwritten by the next instruction.
935 const SIRegisterInfo *TRI = ST.getRegisterInfo();
936
937 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
938 int WaitStatesNeeded = 0;
939
940 if (!TRI->isVectorRegister(MRI, Def.getReg()))
941 return WaitStatesNeeded;
942 Register Reg = Def.getReg();
943 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
944 int DataIdx = createsVALUHazard(MI);
945 return DataIdx >= 0 &&
946 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
947 };
948
949 int WaitStatesNeededForDef =
950 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
951 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
952
953 return WaitStatesNeeded;
954}
955
956/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
957/// pack the computed value into correct bit position of the dest register. This
958/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
959/// dst_sel that is not aligned to the register. This function analyzes the \p
960/// MI and \returns an operand with dst forwarding issue, or nullptr if
961/// none exists.
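///
/// For example (illustrative), an SDWA instruction such as
///   v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE
/// writes only the high 16 bits of v0, so its vdst operand (v0) is returned as
/// the operand with a dst forwarding issue.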
962static const MachineOperand *
963getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
964 if (!SIInstrInfo::isVALU(MI))
965 return nullptr;
966
967 const SIInstrInfo *TII = ST.getInstrInfo();
968
969 unsigned Opcode = MI.getOpcode();
970
971 // There are three different types of instructions
972 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
973 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
974 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) with
975 // op_sel[3:2] != 0
977 if (SIInstrInfo::isSDWA(MI)) {
978 // Type 1: SDWA with dst_sel != DWORD
979 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
980 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
981 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
982 }
983
984 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
985 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
986 // Type 2: VOP3 which write the hi bits
987 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
988 SISrcMods::DST_OP_SEL)
989 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
990
991 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
992 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
993 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
994 SISrcMods::OP_SEL_0))
995 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
996 }
997
998 // Special case: nop is required for all the opsel values for fp4 sr variant
999 // cvt scale instructions
1000 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
1001 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1002
1003 return nullptr;
1004}
1005
1006/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
1007/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
1008/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
1009static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
1010 const MachineOperand *Dst,
1011 const SIRegisterInfo *TRI) {
1012 // We must consider implicit reads of the VALU. SDWA with dst_sel and
1013 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
1014 // and we must account for that hazard.
1015 // We also must account for WAW hazards. In particular, WAW with dest
1016 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1017 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1018 // check for ECC. Without accounting for this hazard, the ECC will be
1019 // wrong.
1020 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1021 // complete zeroesHigh16BitsOfDest)
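 // For instance (illustrative), if the forwarded dst is v0, both a later
 // "v_add_f16 v1, v0, v2" (explicit RAW) and a later write that merges into
 // v0 with dest-preserve semantics (WAW) overlap v0 and are treated as
 // consumers by the check below.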
1022 for (auto &Operand : VALU->operands()) {
1023 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1024 return true;
1025 }
1026 }
1027 return false;
1028}
1029
1030int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
1031 int WaitStatesNeeded = 0;
1032
1033 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
1034 const int TransDefWaitstates = 1;
1035
1036 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
1037 if (!SIInstrInfo::isTRANS(MI))
1038 return false;
1039 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1040 const SIInstrInfo *TII = ST.getInstrInfo();
1041 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
1042
1043 for (const MachineOperand &Use : VALU->explicit_uses()) {
1044 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
1045 return true;
1046 }
1047
1048 return false;
1049 };
1050
1051 int WaitStatesNeededForDef =
1052 TransDefWaitstates -
1053 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1054 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1055 }
1056
1057 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1058 const int Shift16DefWaitstates = 1;
1059
1060 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1061 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1062 const MachineOperand *ForwardedDst =
1063 getDstSelForwardingOperand(ProducerMI, ST);
1064 if (ForwardedDst) {
1065 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1066 }
1067
1068 if (ProducerMI.isInlineAsm()) {
1069 // Assume inline asm has dst forwarding hazard
1070 for (auto &Def : ProducerMI.all_defs()) {
1071 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1072 return true;
1073 }
1074 }
1075
1076 return false;
1077 };
1078
1079 int WaitStatesNeededForDef =
1080 Shift16DefWaitstates -
1081 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1082 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1083 }
1084
1085 if (ST.hasVDecCoExecHazard()) {
1086 const int VALUWriteSGPRVALUReadWaitstates = 2;
1087 const int VALUWriteEXECRWLane = 4;
1088 const int VALUWriteVGPRReadlaneRead = 1;
1089
1090 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1091 const MachineRegisterInfo &MRI = MF.getRegInfo();
1092 Register UseReg;
1093 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1094 if (!SIInstrInfo::isVALU(MI))
1095 return false;
1096 return MI.modifiesRegister(UseReg, TRI);
1097 };
1098
1099 for (const MachineOperand &Use : VALU->explicit_uses()) {
1100 if (!Use.isReg())
1101 continue;
1102
1103 UseReg = Use.getReg();
1104 if (TRI->isSGPRReg(MRI, UseReg)) {
1105 int WaitStatesNeededForDef =
1106 VALUWriteSGPRVALUReadWaitstates -
1107 getWaitStatesSince(IsVALUDefSGPRFn,
1108 VALUWriteSGPRVALUReadWaitstates);
1109 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1110 }
1111 }
1112
1113 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1114 UseReg = AMDGPU::VCC;
1115 int WaitStatesNeededForDef =
1116 VALUWriteSGPRVALUReadWaitstates -
1117 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1118 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1119 }
1120
1121 switch (VALU->getOpcode()) {
1122 case AMDGPU::V_READLANE_B32:
1123 case AMDGPU::V_READFIRSTLANE_B32: {
1124 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1125 UseReg = Src->getReg();
1126 int WaitStatesNeededForDef =
1127 VALUWriteVGPRReadlaneRead -
1128 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1129 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1130 }
1131 [[fallthrough]];
1132 case AMDGPU::V_WRITELANE_B32: {
1133 UseReg = AMDGPU::EXEC;
1134 int WaitStatesNeededForDef =
1135 VALUWriteEXECRWLane -
1136 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1137 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1138 break;
1139 }
1140 default:
1141 break;
1142 }
1143 }
1144
1145 // This checks for the hazard where VMEM instructions that store more than
1146 // 8 bytes can have their store data overwritten by the next instruction.
1147 if (!ST.has12DWordStoreHazard())
1148 return WaitStatesNeeded;
1149
1150 const MachineRegisterInfo &MRI = MF.getRegInfo();
1151
1152 for (const MachineOperand &Def : VALU->defs()) {
1153 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1154 }
1155
1156 return WaitStatesNeeded;
1157}
1158
1159int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1160 // This checks for hazards associated with inline asm statements.
1161 // Since inline asms can contain just about anything, we use this
1162 // to call/leverage other check*Hazard routines. Note that
1163 // this function doesn't attempt to address all possible inline asm
1164 // hazards (good luck), but is a collection of what has been
1165 // problematic thus far.
1166
1167 // see checkVALUHazards()
1168 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1169 !ST.hasCvtScaleForwardingHazard())
1170 return 0;
1171
1172 const MachineRegisterInfo &MRI = MF.getRegInfo();
1173 int WaitStatesNeeded = 0;
1174
1175 for (const MachineOperand &Op :
1176 llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1177 if (Op.isReg() && Op.isDef()) {
1178 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1179 continue;
1180
1181 if (ST.has12DWordStoreHazard()) {
1182 WaitStatesNeeded =
1183 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1184 }
1185 }
1186 }
1187
1188 if (ST.hasDstSelForwardingHazard()) {
1189 const int Shift16DefWaitstates = 1;
1190
1191 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1192 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1193 // Assume inline asm reads the dst
1194 if (Dst)
1195 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1196 IA->readsRegister(Dst->getReg(), &TRI);
1197
1198 if (ProducerMI.isInlineAsm()) {
1199 // If MI is inline asm, assume it has dst forwarding hazard
1200 for (auto &Def : ProducerMI.all_defs()) {
1201 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1202 IA->readsRegister(Def.getReg(), &TRI)) {
1203 return true;
1204 }
1205 }
1206 }
1207
1208 return false;
1209 };
1210
1211 int WaitStatesNeededForDef =
1212 Shift16DefWaitstates -
1213 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1214 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1215 }
1216
1217 return WaitStatesNeeded;
1218}
1219
1220int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1221 const SIInstrInfo *TII = ST.getInstrInfo();
1222 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1223 const MachineRegisterInfo &MRI = MF.getRegInfo();
1224
1225 const MachineOperand *LaneSelectOp =
1226 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1227
1228 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1229 return 0;
1230
1231 Register LaneSelectReg = LaneSelectOp->getReg();
1232 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1233
1234 const int RWLaneWaitStates = 4;
1235 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1236 RWLaneWaitStates);
1237 return RWLaneWaitStates - WaitStatesSince;
1238}
1239
1240int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1241 if (!ST.hasRFEHazards())
1242 return 0;
1243
1244 const SIInstrInfo *TII = ST.getInstrInfo();
1245
1246 const int RFEWaitStates = 1;
1247
1248 auto IsHazardFn = [TII](const MachineInstr &MI) {
1249 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1250 };
1251 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1252 return RFEWaitStates - WaitStatesNeeded;
1253}
1254
1255int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1256 const SIInstrInfo *TII = ST.getInstrInfo();
1257 const int ReadM0WaitStates = 1;
1258 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1259 return ReadM0WaitStates -
1260 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1261}
1262
1263// emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we need
1264// to insert, negative means not needed.
1265bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) {
1266 if (WaitStatesNeeded <= 0)
1267 return false;
1268
1269 const SIInstrInfo *TII = ST.getInstrInfo();
1270 for (int I = 0; I < WaitStatesNeeded; ++I)
1271 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1272 TII->get(AMDGPU::V_NOP_e32));
1273
1274 return true;
1275}
1276
1277void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1278 fixVMEMtoScalarWriteHazards(MI);
1279 fixVcmpxPermlaneHazards(MI);
1280 fixSMEMtoVectorWriteHazards(MI);
1281 fixVcmpxExecWARHazard(MI);
1282 fixLdsBranchVmemWARHazard(MI);
1283 if (ST.hasLdsDirect()) {
1284 fixLdsDirectVALUHazard(MI);
1285 fixLdsDirectVMEMHazard(MI);
1286 }
1287 fixVALUPartialForwardingHazard(MI);
1288 fixVALUTransUseHazard(MI);
1289 fixVALUTransCoexecutionHazards(MI);
1290 fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1291 emitVNops(MI, checkWMMACoexecutionHazards(MI));
1292 fixShift64HighRegBug(MI);
1293 fixVALUMaskWriteHazard(MI);
1294 fixRequiredExportPriority(MI);
1295 if (ST.requiresWaitIdleBeforeGetReg())
1296 fixGetRegWaitIdle(MI);
1297 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1298 fixDsAtomicAsyncBarrierArriveB64(MI);
1299 if (ST.hasScratchBaseForwardingHazard())
1300 fixScratchBaseForwardingHazard(MI);
1301 if (ST.setRegModeNeedsVNOPs())
1302 fixSetRegMode(MI);
1303}
1304
1305static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1306 const MachineInstr &MI) {
1307 return (TII.isVOPC(MI) ||
1308 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1309 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1310}
1311
1312bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1313 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1314 return false;
1315
1316 const SIInstrInfo *TII = ST.getInstrInfo();
1317 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1318 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1319 return isVCmpXWritesExec(*TII, *TRI, MI);
1320 };
1321
1322 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1323 unsigned Opc = MI.getOpcode();
1324 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1325 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1326 };
1327
1328 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1329 std::numeric_limits<int>::max())
1330 return false;
1331
1332 // V_NOP will be discarded by SQ.
1333 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1334 // which is always a VGPR and available.
1335 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1336 Register Reg = Src0->getReg();
1337 bool IsUndef = Src0->isUndef();
1338 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1339 TII->get(AMDGPU::V_MOV_B32_e32))
1340 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1341 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1342
1343 return true;
1344}
1345
1346bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1347 if (!ST.hasVMEMtoScalarWriteHazard())
1348 return false;
1349 assert(!ST.hasExtendedWaitCounts());
1350
1351 if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1352 return false;
1353
1354 if (MI->getNumDefs() == 0)
1355 return false;
1356
1357 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1358
1359 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1360 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
1361 return false;
1362
1363 for (const MachineOperand &Def : MI->defs()) {
1364 const MachineOperand *Op =
1365 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1366 if (!Op)
1367 continue;
1368 return true;
1369 }
1370 return false;
1371 };
1372
1373 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1374 return SIInstrInfo::isVALU(MI) ||
1375 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1376 !MI.getOperand(0).getImm()) ||
1377 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1378 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1379 };
1380
1381 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1382 std::numeric_limits<int>::max())
1383 return false;
1384
1385 const SIInstrInfo *TII = ST.getInstrInfo();
1386 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1387 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1388 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1389 return true;
1390}
1391
1392bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1393 if (!ST.hasSMEMtoVectorWriteHazard())
1394 return false;
1395 assert(!ST.hasExtendedWaitCounts());
1396
1397 if (!SIInstrInfo::isVALU(*MI))
1398 return false;
1399
1400 AMDGPU::OpName SDSTName;
1401 switch (MI->getOpcode()) {
1402 case AMDGPU::V_READLANE_B32:
1403 case AMDGPU::V_READFIRSTLANE_B32:
1404 SDSTName = AMDGPU::OpName::vdst;
1405 break;
1406 default:
1407 SDSTName = AMDGPU::OpName::sdst;
1408 break;
1409 }
1410
1411 const SIInstrInfo *TII = ST.getInstrInfo();
1412 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1413 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1414 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1415 if (!SDST) {
1416 for (const auto &MO : MI->implicit_operands()) {
1417 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1418 SDST = &MO;
1419 break;
1420 }
1421 }
1422 }
1423
1424 if (!SDST)
1425 return false;
1426
1427 const Register SDSTReg = SDST->getReg();
1428 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1429 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1430 };
1431
1432 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1433 if (TII->isSALU(MI)) {
1434 switch (MI.getOpcode()) {
1435 case AMDGPU::S_SETVSKIP:
1436 case AMDGPU::S_VERSION:
1437 case AMDGPU::S_WAITCNT_VSCNT:
1438 case AMDGPU::S_WAITCNT_VMCNT:
1439 case AMDGPU::S_WAITCNT_EXPCNT:
1440 // These instructions cannot mitigate the hazard.
1441 return false;
1442 case AMDGPU::S_WAITCNT_LGKMCNT:
1443 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1444 return (MI.getOperand(1).getImm() == 0) &&
1445 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1446 case AMDGPU::S_WAITCNT: {
1447 const int64_t Imm = MI.getOperand(0).getImm();
1448 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1449 // DsCnt corresponds to LGKMCnt here.
1450 return (Decoded.DsCnt == 0);
1451 }
1452 default:
1453 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1454 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1455 "unexpected wait count instruction");
1456 // SOPP instructions cannot mitigate the hazard.
1457 if (TII->isSOPP(MI))
1458 return false;
1459 // At this point the SALU can be assumed to mitigate the hazard
1460 // because either:
1461 // (a) it is independent of the at risk SMEM (breaking chain),
1462 // or
1463 // (b) it is dependent on the SMEM, in which case an appropriate
1464 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1465 // SMEM instruction.
1466 return true;
1467 }
1468 }
1469 return false;
1470 };
1471
1472 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1473 std::numeric_limits<int>::max())
1474 return false;
1475
1476 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1477 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1478 .addImm(0);
1479 return true;
1480}
1481
1482bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1483 if (!ST.hasVcmpxExecWARHazard())
1484 return false;
1485 assert(!ST.hasExtendedWaitCounts());
1486
1487 if (!SIInstrInfo::isVALU(*MI))
1488 return false;
1489
1490 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1491 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1492 return false;
1493
1494 auto IsHazardFn = [TRI](const MachineInstr &I) {
1495 if (!SIInstrInfo::isVALU(I))
1496 return false;
1497 return I.readsRegister(AMDGPU::EXEC, TRI);
1498 };
1499
1500 const SIInstrInfo *TII = ST.getInstrInfo();
1501 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1502 if (SIInstrInfo::isVALU(MI)) {
1503 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1504 return true;
1505 for (auto MO : MI.implicit_operands())
1506 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1507 return true;
1508 }
1509 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1510 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1511 return true;
1512 return false;
1513 };
1514
1515 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1516 std::numeric_limits<int>::max())
1517 return false;
1518
1519 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1520 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1521 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1522 return true;
1523}
1524
1526 const GCNSubtarget &ST) {
1527 if (!ST.hasLdsBranchVmemWARHazard())
1528 return false;
1529
1530 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1531 // instructions need to appear in the same function.
1532 bool HasLds = false;
1533 bool HasVmem = false;
1534 for (auto &MBB : MF) {
1535 for (auto &MI : MBB) {
1536 HasLds |= SIInstrInfo::isDS(MI);
1537 HasVmem |= SIInstrInfo::isVMEM(MI);
1538 if (HasLds && HasVmem)
1539 return true;
1540 }
1541 }
1542 return false;
1543}
1544
1545static bool isStoreCountWaitZero(const MachineInstr &I) {
1546 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1547 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1548 !I.getOperand(1).getImm();
1549}
1550
1551bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1552 if (!RunLdsBranchVmemWARHazardFixup)
1553 return false;
1554
1555 assert(ST.hasLdsBranchVmemWARHazard());
1556 assert(!ST.hasExtendedWaitCounts());
1557
1558 auto IsHazardInst = [](const MachineInstr &MI) {
1559 if (SIInstrInfo::isDS(MI))
1560 return 1;
1561 if (SIInstrInfo::isVMEM(MI))
1562 return 2;
1563 return 0;
1564 };
1565
1566 auto InstType = IsHazardInst(*MI);
1567 if (!InstType)
1568 return false;
1569
1570 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1571 return IsHazardInst(I) || isStoreCountWaitZero(I);
1572 };
1573
1574 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1575 if (!I.isBranch())
1576 return false;
1577
1578 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1579 auto InstType2 = IsHazardInst(I);
1580 return InstType2 && InstType != InstType2;
1581 };
1582
1583 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1584 auto InstType2 = IsHazardInst(I);
1585 if (InstType == InstType2)
1586 return true;
1587
1588 return isStoreCountWaitZero(I);
1589 };
1590
1591 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1592 std::numeric_limits<int>::max();
1593 };
1594
1595 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1596 std::numeric_limits<int>::max())
1597 return false;
1598
1599 const SIInstrInfo *TII = ST.getInstrInfo();
1600 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1601 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1602 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1603 .addImm(0);
1604
1605 return true;
1606}
1607
1608bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1609 if (!SIInstrInfo::isLDSDIR(*MI))
1610 return false;
1611
1612 const int NoHazardWaitStates = 15;
1613 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1614 const Register VDSTReg = VDST->getReg();
1615
1616 bool VisitedTrans = false;
1617 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1618 if (!SIInstrInfo::isVALU(I))
1619 return false;
1620 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1621 // Cover both WAR and WAW
1622 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1623 };
1624 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1625 if (WaitStates >= NoHazardWaitStates)
1626 return true;
1627 // Instructions which cause va_vdst==0 expire hazard
1630 };
1631 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1632 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1633 };
1634
1635 DenseSet<const MachineBasicBlock *> Visited;
1636 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1637 std::next(MI->getReverseIterator()), 0,
1638 IsExpiredFn, Visited, GetWaitStatesFn);
1639
1640 // Transcendentals can execute in parallel to other VALUs.
1641 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1642 if (VisitedTrans)
1643 Count = 0;
1644
1645 MachineOperand *WaitVdstOp =
1646 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1647 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1648
1649 return true;
1650}
1651
1652bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1653 if (!SIInstrInfo::isLDSDIR(*MI))
1654 return false;
1655
1656 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1657 const Register VDSTReg = VDST->getReg();
1658
1659 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1660 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
1661 return false;
1662 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1663 };
1664 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1665 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1666 // according to the type of VMEM instruction.
1667 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1668 return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1669 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1670 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1671 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1672 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1673 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1674 };
1675
1676 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1677 std::numeric_limits<int>::max())
1678 return false;
1679
1680 if (LdsdirCanWait) {
1681 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1682 } else {
1683 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1684 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1685 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1686 }
1687
1688 return true;
1689}
1690
1691bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1692 if (!ST.hasVALUPartialForwardingHazard())
1693 return false;
1694 assert(!ST.hasExtendedWaitCounts());
1695
1696 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
1697 return false;
1698
1699 SmallSetVector<Register, 4> SrcVGPRs;
1700
1701 for (const MachineOperand &Use : MI->explicit_uses()) {
1702 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1703 SrcVGPRs.insert(Use.getReg());
1704 }
1705
1706 // Only applies with >= 2 unique VGPR sources
1707 if (SrcVGPRs.size() <= 1)
1708 return false;
1709
1710 // Look for the following pattern:
1711 // Va <- VALU [PreExecPos]
1712 // intv1
1713 // Exec <- SALU [ExecPos]
1714 // intv2
1715 // Vb <- VALU [PostExecPos]
1716 // intv3
1717 // MI Va, Vb (WaitState = 0)
1718 //
1719 // Where:
1720 // intv1 + intv2 <= 2 VALUs
1721 // intv3 <= 4 VALUs
1722 //
1723 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
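 // A concrete (hypothetical) wave64 instance of the pattern:
 //   v_mov_b32 v0, 1          ; Va written before the EXEC change
 //   s_mov_b64 exec, s[0:1]   ; SALU modifies EXEC
 //   v_mov_b32 v1, 2          ; Vb written after the EXEC change
 //   v_add_f32 v2, v0, v1     ; MI reads both Va and Vb within the intervals,
 //                            ; so a wait on va_vdst is inserted before it.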
1724
1725 const int Intv1plus2MaxVALUs = 2;
1726 const int Intv3MaxVALUs = 4;
1727 const int IntvMaxVALUs = 6;
1728 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1729
1730 struct StateType {
1731 SmallDenseMap<Register, int, 4> DefPos;
1732 int ExecPos = std::numeric_limits<int>::max();
1733 int VALUs = 0;
1734
1735 static unsigned getHashValue(const StateType &State) {
1736 return hash_combine(State.ExecPos, State.VALUs,
1737 hash_combine_range(State.DefPos));
1738 }
1739 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1740 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1741 LHS.VALUs == RHS.VALUs;
1742 }
1743 };
1744
1745 StateType State;
1746
1747 // This overloads expiry testing with all the hazard detection
1748 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1749 // Too many VALU states have passed
1750 if (State.VALUs > NoHazardVALUWaitStates)
1751 return HazardExpired;
1752
1753 // Instructions which cause va_vdst==0 expire hazard
1754 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1755 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1756 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1757 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1758 return HazardExpired;
1759
1760 // Track registers writes
1761 bool Changed = false;
1762 if (SIInstrInfo::isVALU(I)) {
1763 for (Register Src : SrcVGPRs) {
1764 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1765 State.DefPos[Src] = State.VALUs;
1766 Changed = true;
1767 }
1768 }
1769 } else if (SIInstrInfo::isSALU(I)) {
1770 if (State.ExecPos == std::numeric_limits<int>::max()) {
1771 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1772 State.ExecPos = State.VALUs;
1773 Changed = true;
1774 }
1775 }
1776 }
1777
1778 // Early expiration: too many VALUs in intv3
1779 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1780 return HazardExpired;
1781
1782 // Only evaluate state if something changed
1783 if (!Changed)
1784 return NoHazardFound;
1785
1786 // Determine positions of VALUs pre/post exec change
1787 if (State.ExecPos == std::numeric_limits<int>::max())
1788 return NoHazardFound;
1789
1790 int PreExecPos = std::numeric_limits<int>::max();
1791 int PostExecPos = std::numeric_limits<int>::max();
1792
1793 for (auto Entry : State.DefPos) {
1794 int DefVALUs = Entry.second;
1795 if (DefVALUs != std::numeric_limits<int>::max()) {
1796 if (DefVALUs >= State.ExecPos)
1797 PreExecPos = std::min(PreExecPos, DefVALUs);
1798 else
1799 PostExecPos = std::min(PostExecPos, DefVALUs);
1800 }
1801 }
1802
1803 // Need a VALU def after the exec change
1804 if (PostExecPos == std::numeric_limits<int>::max())
1805 return NoHazardFound;
1806
1807 // Too many VALUs in intv3?
1808 int Intv3VALUs = PostExecPos;
1809 if (Intv3VALUs > Intv3MaxVALUs)
1810 return HazardExpired;
1811
1812 // Too many VALUs in intv2?
1813 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1814 if (Intv2VALUs > Intv1plus2MaxVALUs)
1815 return HazardExpired;
1816
1817 // Need a VALU def before the exec change
1818 if (PreExecPos == std::numeric_limits<int>::max())
1819 return NoHazardFound;
1820
1821 // Too many VALUs in intv1?
1822 int Intv1VALUs = PreExecPos - State.ExecPos;
1823 if (Intv1VALUs > Intv1plus2MaxVALUs)
1824 return HazardExpired;
1825
1826 // Too many VALUs in intv1 + intv2
1827 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1828 return HazardExpired;
1829
1830 return HazardFound;
1831 };
1832 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1833 if (SIInstrInfo::isVALU(MI))
1834 State.VALUs += 1;
1835 };
1836
1837 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1838 std::next(MI->getReverseIterator())))
1839 return false;
1840
1841 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1842 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1843 .addImm(0x0fff);
1844
1845 return true;
1846}
1847
1848bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1849 if (!ST.hasVALUTransUseHazard())
1850 return false;
1851 assert(!ST.hasExtendedWaitCounts());
1852
1853 if (!SIInstrInfo::isVALU(*MI))
1854 return false;
1855
1856 SmallSet<Register, 4> SrcVGPRs;
1857
1858 for (const MachineOperand &Use : MI->explicit_uses()) {
1859 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1860 SrcVGPRs.insert(Use.getReg());
1861 }
1862
1863 // Look for the following pattern:
1864 // Va <- TRANS VALU
1865 // intv
1866 // MI Va (WaitState = 0)
1867 //
1868 // Where:
1869 // intv <= 5 VALUs / 1 TRANS
1870 //
1871 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
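 // Illustrative sketch (hypothetical registers, not from this file):
 //   v_exp_f32  v4, v0        ; Va <- TRANS VALU
 //   v_mul_f32  v5, v4, v1    ; MI reads Va with fewer than 5 VALUs / 1 TRANS
 //                            ; in between, so a wait would be required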
1872
1873 const int IntvMaxVALUs = 5;
1874 const int IntvMaxTRANS = 1;
1875
1876 struct StateType {
1877 int VALUs = 0;
1878 int TRANS = 0;
1879
1880 static unsigned getHashValue(const StateType &State) {
1881 return hash_combine(State.VALUs, State.TRANS);
1882 }
1883 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1884 return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1885 }
1886 };
1887
1888 StateType State;
1889
1890 // This lambda combines the expiry testing with all of the hazard detection
1891 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1892 // Too many VALU states have passed
1893 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1894 return HazardExpired;
1895
1896 // Instructions which cause va_vdst==0 expire hazard
1897 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
1898 SIInstrInfo::isEXP(I) ||
1899 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1900 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1901 return HazardExpired;
1902
1903 // Track register writes
1904 if (SIInstrInfo::isTRANS(I)) {
1905 for (Register Src : SrcVGPRs) {
1906 if (I.modifiesRegister(Src, &TRI)) {
1907 return HazardFound;
1908 }
1909 }
1910 }
1911
1912 return NoHazardFound;
1913 };
1914 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1915 if (SIInstrInfo::isVALU(MI))
1916 State.VALUs += 1;
1917 if (SIInstrInfo::isTRANS(MI))
1918 State.TRANS += 1;
1919 };
1920
1921 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1922 std::next(MI->getReverseIterator())))
1923 return false;
1924
1925 // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1926 // hazard is avoided.
1927 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1928 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1929 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1930
1931 return true;
1932}
1933
1934bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1935 if (!ST.hasGFX1250Insts() || // Coexecution disabled.
1936 !SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI))
1937 return false;
1938
1939 const SIInstrInfo *TII = ST.getInstrInfo();
1940 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1941
1942 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1943 if (!SIInstrInfo::isTRANS(I))
1944 return false;
1945
1946 // RAW: Trans(I) writes, VALU(MI) reads.
1947 Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1948 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1949 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1950 return true;
1951 }
1952
1953 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
1954 if (!ValuDst || !ValuDst->isReg())
1955 return false;
1956
1957 // WAR: Trans(I) reads, VALU(MI) writes.
1958 Register ValuDef = ValuDst->getReg();
1959 for (const MachineOperand &TransUse : I.explicit_uses()) {
1960 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
1961 return true;
1962 }
1963
1964 return false;
1965 };
1966
1967 auto IsExpiredFn = [](const MachineInstr &I, int) {
1968 return SIInstrInfo::isVALU(I);
1969 };
1970
1971 const int HasVALU = std::numeric_limits<int>::max();
1972 if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
1973 return false;
1974
1975 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1976 return true;
1977}
1978
1979bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1980 if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
1981 return false;
1982
1983 const SIInstrInfo *TII = ST.getInstrInfo();
1984 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1985
1986 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1987 if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
1988 return false;
1989
1990 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1991 // with the dest(matrix D) of the previous wmma.
1992 const Register CurSrc0Reg =
1993 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1994 const Register CurSrc1Reg =
1995 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1996
1997 const Register PrevDstReg =
1998 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1999
2000 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2001 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2002 return true;
2003 }
2004
2005 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2006 // but Index can't overlap with PrevDstReg.
2007 if (AMDGPU::isGFX12Plus(ST)) {
2008 if (SIInstrInfo::isSWMMAC(*MI)) {
2009 const Register CurIndex =
2010 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2011 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2012 return true;
2013 }
2014 return false;
2015 }
2016
2017 return false;
2018 };
2019
2020 auto IsExpiredFn = [](const MachineInstr &I, int) {
2021 return SIInstrInfo::isVALU(I);
2022 };
2023
2024 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2025 std::numeric_limits<int>::max())
2026 return false;
2027
2028 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2029
2030 return true;
2031}
2032
2033static bool isCoexecutableVALUInst(const MachineInstr &MI) {
2034 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isTRANS(MI) &&
2035 !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
2036}
2037
2038static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
2039 const SIInstrInfo *TII, unsigned Latency,
2040 unsigned Category) {
2041 assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
2042 "Handle me if the xdl wmma instruction latency changes");
2043
2044 switch (Category) {
2045 case 0: // Dense WMMA Instructions:
2046 // WMMA_*F16, WMMA_*BF16
2047 // WMMA_*FP8FP8
2048 // WMMA_*FP8BF8
2049 // WMMA_*BF8FP8
2050 // WMMA_*BF8BF8
2051 // WMMA_*F8F6F4 if SRCA & SRCB != F8
2052 return Latency == 8 && SIInstrInfo::isWMMA(MI);
2053
2054 case 1: // Dense WMMA Instructions:
2055 // WMMA_IU8
2056 // WMMA_IU4
2057 // WMMA_*F8F6F4 if SRCA OR SRCB == F8
2058 return Latency == 16 && SIInstrInfo::isWMMA(MI);
2059
2060 case 2: // Dense SWMMAC Instructions
2061 // SWMMAC_*F16, SWMMAC_*BF16,
2062 // SWMMAC_*FP8FP8
2063 // SWMMAC_*BF8FP8
2064 // SWMMAC_*FP8BF8
2065 // SWMMAC_*BF8BF8
2066 return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
2067
2068 case 3: // Sparse WMMA Instructions:
2069 // SWMMAC_IU8
2070 // SWMMAC_IU4
2071 return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
2072 default:
2073 break;
2074 } // end switch.
2075
2076 return false;
2077}
2078
2079int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
2080 if (!ST.hasGFX1250Insts())
2081 return 0;
2082
2083 const SIInstrInfo *TII = ST.getInstrInfo();
2084 if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
2085 return 0;
2086
2087 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2088
2089 // WaitStates here is the number of V_NOPs or unrelated VALU instructions that
2090 // must be in between the first WMMA and the second instruction to cover the
2091 // hazard (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the
2092 // second is a VALU). Refer to SPG 4.6.12.1 "Requirements for WMMA data
2093 // hazards" for the numbers, which depend on the category of the first WMMA.
2094 const int WMMAWaitStates[] = {5, 9, 3, 5};
2095 const int VALUWaitStates[] = {4, 8, 2, 4};
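 // For example, reading the tables above: after a category-1 producer (a
 // 16-cycle dense WMMA such as WMMA_IU8), a dependent XDL WMMA needs 9
 // intervening VALUs/V_NOPs, while a dependent co-executable VALU needs 8.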
2096 unsigned Category = 0;
2097
2098 auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
2099 if (!TII->isXDLWMMA(I))
2100 return false;
2101
2102 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2103 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2104 return false;
2105
2106 Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2107 Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
2108 Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
2109
2110 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2111 if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
2112 return true;
2113
2114 if (SIInstrInfo::isSWMMAC(*MI)) {
2115 Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2116 if (TRI->regsOverlap(D0, Idx1))
2117 return true;
2118 }
2119
2120 return false;
2121 };
2122
2123 auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
2124 if (!TII->isXDLWMMA(I))
2125 return false;
2126
2127 unsigned Latency = TSchedModel.computeInstrLatency(&I);
2128 if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
2129 return false;
2130
2131 // WMMA writes, VALU reads.
2132 Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2133 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
2134 if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
2135 return true;
2136 }
2137
2138 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
2139 if (!ValuDst || !ValuDst->isReg())
2140 return false;
2141 Register D1 = ValuDst->getReg();
2142
2143 // WMMA writes, VALU writes.
2144 if (TRI->regsOverlap(D0, D1))
2145 return true;
2146
2147 // WMMA reads, VALU writes.
2148 Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
2149 Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
2150 if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
2151 return true;
2152
2153 if (SIInstrInfo::isSWMMAC(I)) {
2154 Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
2155 if (TRI->regsOverlap(D1, Idx0))
2156 return true;
2157 }
2158
2159 return false;
2160 };
2161
2162 int Limit = 0;
2163
2164 auto GetWaitStatesFn = [](const MachineInstr &I) {
2165 return SIInstrInfo::isVALU(I) ? 1 : 0;
2166 };
2167
2168 int WaitStatesNeeded = -1;
2169 if (TII->isXDLWMMA(*MI)) {
2170 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2171 Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
2172 // 'getWaitStatesSince' returns the number of VALUs in between if a hazard
2173 // exists, and INT_MAX if there is no hazard. As a result, a negative
2174 // WaitStatesNeeded here means no hazard, and we will continue to search
2175 // for other categories.
2176 WaitStatesNeeded =
2177 Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
2178 }
2179 } else { // Must be a co-executable VALU.
2180 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2181 Limit = VALUWaitStates[Category]; // for IsExpiredFn.
2182 // 'getWaitStatesSince' returns the number of VALUs in between if a hazard
2183 // exists, and INT_MAX if there is no hazard. As a result, a negative
2184 // WaitStatesNeeded here means no hazard, and we will continue to search
2185 // for other categories.
2186 WaitStatesNeeded =
2187 Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
2188 }
2189 }
2190
2191 return WaitStatesNeeded;
2192}
2193
2194bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
2195 if (!ST.hasShift64HighRegBug())
2196 return false;
2197 assert(!ST.hasExtendedWaitCounts());
2198
2199 switch (MI->getOpcode()) {
2200 default:
2201 return false;
2202 case AMDGPU::V_LSHLREV_B64_e64:
2203 case AMDGPU::V_LSHRREV_B64_e64:
2204 case AMDGPU::V_ASHRREV_I64_e64:
2205 break;
2206 }
2207
2208 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
2209 if (!Amt->isReg())
2210 return false;
2211
2212 Register AmtReg = Amt->getReg();
2213 const MachineRegisterInfo &MRI = MF.getRegInfo();
2214 // Check if this is the last VGPR in the allocation block.
2215 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2216 return false;
2217
2218 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
2219 return false;
2220
2221 assert(ST.needsAlignedVGPRs());
2222 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2223
2224 const DebugLoc &DL = MI->getDebugLoc();
2225 MachineBasicBlock *MBB = MI->getParent();
2226 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
2227
2228 // In:
2229 //
2230 // Dst = shiftrev64 Amt, Src1
2231 //
2232 // if Dst!=Src1 then avoid the bug with:
2233 //
2234 // Dst.sub0 = Amt
2235 // Dst = shift64 Dst.sub0, Src1
2236
2237 Register DstReg = MI->getOperand(0).getReg();
2238 if (!Src1->isReg() || Src1->getReg() != DstReg) {
2239 Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
2240 runOnInstruction(
2241 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt));
2242 Amt->setReg(DstLo);
2243 Amt->setIsKill(true);
2244 return true;
2245 }
2246
2247 bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);
2248 Register NewReg;
2249 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2250 : AMDGPU::VGPR_32RegClass) {
2251 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
2252 NewReg = Reg;
2253 break;
2254 }
2255 }
2256
2257 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2258 : NewReg;
2259 Register NewAmtLo;
2260
2261 if (Overlapped)
2262 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2263
2264 // Insert a full wait count because the found register might have a pending wait.
2265 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
2266 .addImm(0);
2267
2268 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
2269 if (Overlapped)
2270 runOnInstruction(
2271 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
2272 .addDef(AmtReg - 1)
2273 .addReg(AmtReg - 1, RegState::Undef)
2274 .addReg(NewAmtLo, RegState::Undef));
2275 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2276 .addDef(AmtReg)
2277 .addReg(AmtReg, RegState::Undef)
2278 .addReg(NewAmt, RegState::Undef));
2279
2280 // Instructions emitted after the current instruction will be processed by the
2281 // parent loop of the hazard recognizer in a natural way.
2282 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2283 AmtReg)
2284 .addDef(NewAmt)
2285 .addReg(NewAmt)
2286 .addReg(AmtReg);
2287 if (Overlapped)
2288 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2289 AmtReg - 1)
2290 .addDef(NewAmtLo)
2291 .addReg(NewAmtLo)
2292 .addReg(AmtReg - 1);
2293
2294 // Re-running the hazard recognizer on the modified instruction is not
2295 // necessary: the inserted V_SWAP_B32 already both reads and writes the new
2296 // registers, so hazards related to these registers have already been handled.
2297 Amt->setReg(NewAmt);
2298 Amt->setIsKill(false);
2299 // We do not update liveness, so the verifier may see it as undef.
2300 Amt->setIsUndef();
2301 if (Overlapped) {
2302 MI->getOperand(0).setReg(NewReg);
2303 Src1->setReg(NewReg);
2304 Src1->setIsKill(false);
2305 Src1->setIsUndef();
2306 }
2307
2308 return true;
2309}
2310
2311int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
2312 int NSAtoVMEMWaitStates = 1;
2313
2314 if (!ST.hasNSAtoVMEMBug())
2315 return 0;
2316
2317 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
2318 return 0;
2319
2320 const SIInstrInfo *TII = ST.getInstrInfo();
2321 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2322 if (!Offset || (Offset->getImm() & 6) == 0)
2323 return 0;
2324
2325 auto IsHazardFn = [TII](const MachineInstr &I) {
2326 if (!SIInstrInfo::isMIMG(I))
2327 return false;
2328 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2329 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2330 TII->getInstSizeInBytes(I) >= 16;
2331 };
2332
2333 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2334}
2335
2336int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
2337 int FPAtomicToDenormModeWaitStates = 3;
2338
2339 if (!ST.hasFPAtomicToDenormModeHazard())
2340 return 0;
2341 assert(!ST.hasExtendedWaitCounts());
2342
2343 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2344 return 0;
2345
2346 auto IsHazardFn = [](const MachineInstr &I) {
2347 if (!SIInstrInfo::isVMEM(I))
2348 return false;
2349 return SIInstrInfo::isFPAtomic(I);
2350 };
2351
2352 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2353 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2354 return true;
2355
2356 return SIInstrInfo::isWaitcnt(MI.getOpcode());
2357 };
2358
2359 return FPAtomicToDenormModeWaitStates -
2360 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2361}
2362
2363int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2365
2366 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2367}
2368
2369int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2370 // Early exit if no padding is requested.
2371 if (MFMAPaddingRatio == 0)
2372 return 0;
2373
2374 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2375 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2376 return 0;
2377
2378 int NeighborMFMALatency = 0;
2379 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2380 this](const MachineInstr &MI) {
2381 if (!SIInstrInfo::isMFMA(MI))
2382 return false;
2383
2384 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2385 return true;
2386 };
2387
2388 const int MaxMFMAPipelineWaitStates = 16;
2389 int WaitStatesSinceNeighborMFMA =
2390 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2391
2392 int NeighborMFMAPaddingNeeded =
2393 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2394 WaitStatesSinceNeighborMFMA;
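 // Worked example (assuming amdgpu-mfma-padding-ratio=50): a neighboring
 // 16-cycle MFMA seen 3 wait states ago gives 16 * 50 / 100 - 3 = 5, so 5
 // states of padding are requested.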
2395
2396 return std::max(0, NeighborMFMAPaddingNeeded);
2397}
2398
2399int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2400 int WaitStatesNeeded = 0;
2401 unsigned Opc = MI->getOpcode();
2402
2403 auto IsVALUFn = [](const MachineInstr &MI) {
2404 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2405 };
2406
2407 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2408 const int LegacyVALUWritesVGPRWaitStates = 2;
2409 const int VALUWritesExecWaitStates = 4;
2410 const int MaxWaitStates = 4;
2411
2412 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2413 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2414 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2415
2416 if (WaitStatesNeeded < MaxWaitStates) {
2417 for (const MachineOperand &Use : MI->explicit_uses()) {
2418 const int MaxWaitStates = 2;
2419
2420 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2421 continue;
2422
2423 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2424 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2425 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2426
2427 if (WaitStatesNeeded == MaxWaitStates)
2428 break;
2429 }
2430 }
2431 }
2432
2433 for (const MachineOperand &Op : MI->explicit_operands()) {
2434 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2435 continue;
2436
2437 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2438 continue;
2439
2440 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2441 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2442 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2443 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2444 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2445 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2446 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2447 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2448 const int MaxWaitStates = 18;
2449 Register Reg = Op.getReg();
2450 unsigned HazardDefLatency = 0;
2451
2452 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2453 this](const MachineInstr &MI) {
2454 if (!SIInstrInfo::isMFMA(MI))
2455 return false;
2456 Register DstReg = MI.getOperand(0).getReg();
2457 if (DstReg == Reg)
2458 return false;
2459 HazardDefLatency =
2460 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2461 return TRI.regsOverlap(DstReg, Reg);
2462 };
2463
2464 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2465 MaxWaitStates);
2466 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2467 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2468 int OpNo = Op.getOperandNo();
2469 if (OpNo == SrcCIdx) {
2470 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2471 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2472 switch (HazardDefLatency) {
2473 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2474 break;
2475 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2476 break;
2477 case 16: [[fallthrough]];
2478 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2479 break;
2480 }
2481 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2482 switch (HazardDefLatency) {
2483 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2484 break;
2485 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2486 break;
2487 case 16: [[fallthrough]];
2488 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2489 break;
2490 }
2491 }
2492
2493 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2494 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2495
2496 if (WaitStatesNeeded == MaxWaitStates)
2497 return WaitStatesNeeded; // Early exit.
2498
2499 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2500 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2501 return false;
2502 Register DstReg = MI.getOperand(0).getReg();
2503 return TRI.regsOverlap(Reg, DstReg);
2504 };
2505
2506 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2507 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2508 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2509 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2510 if (OpNo == SrcCIdx)
2511 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2512 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2513 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2514
2515 WaitStatesNeededForUse = NeedWaitStates -
2516 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2517 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2518
2519 if (WaitStatesNeeded == MaxWaitStates)
2520 return WaitStatesNeeded; // Early exit.
2521 }
2522
2523 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2524 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2525 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2526 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2527 const int MaxWaitStates = 13;
2528 Register DstReg = MI->getOperand(0).getReg();
2529 unsigned HazardDefLatency = 0;
2530
2531 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2532 this](const MachineInstr &MI) {
2533 if (!SIInstrInfo::isMFMA(MI))
2534 return false;
2535 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2536 HazardDefLatency =
2537 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2538 return TRI.regsOverlap(Reg, DstReg);
2539 };
2540
2541 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2542 int NeedWaitStates;
2543 switch (HazardDefLatency) {
2544 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2545 break;
2546 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2547 break;
2548 case 16: [[fallthrough]];
2549 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2550 break;
2551 }
2552
2553 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2554 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2555 }
2556
2557 // Pad neighboring MFMA with noops for better inter-wave performance.
2558 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2559
2560 return WaitStatesNeeded;
2561}
2562
2563static int
2565 bool IsGFX950) {
2566 // xdl def cycles | gfx940 | gfx950
2567 // 2 pass | 3 4
2568 // 4 pass | 5 6
2569 // 8 pass | 9 10
2570 // 16 pass | 17 18
2571 return NumPasses + 1 + IsGFX950;
2572}
2573
2574static int
2576 bool IsGFX950) {
2577 // xdl def cycles | gfx940 | gfx950
2578 // 2 pass | 3 3
2579 // 4 pass | 5 6
2580 // 8 pass | 9 10
2581 // 16 pass | 17 18
2582 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2583}
2584
2585static int
2587 // 2 pass -> 2
2588 // 4 pass -> 4
2589 // 8 pass -> 8
2590 // 16 pass -> 16
2591 return NumPasses;
2592}
2593
2594static int
2596 // 2 pass -> 4
2597 // 4 pass -> 6
2598 // 8 pass -> 10
2599 // 16 pass -> 18
2600 return NumPasses + 2;
2601}
2602
2604 bool IsGFX950) {
2605 // xdl def cycles | gfx942 | gfx950
2606 // 2 pass | 5 5
2607 // 4 pass | 7 8
2608 // 8 pass | 11 12
2609 // 16 pass | 19 20
2610 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2611}
2612
2613int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2614 int WaitStatesNeeded = 0;
2615 unsigned Opc = MI->getOpcode();
2616
2617 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2618 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2619 };
2620
2621 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2622 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2623 !SIInstrInfo::isDOT(MI);
2624 };
2625
2626 if (!SIInstrInfo::isMFMA(*MI))
2627 return WaitStatesNeeded;
2628
2629 const int VALUWritesExecWaitStates = 4;
2630 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2631 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2632 VALUWritesExecWaitStates);
2633 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2634
2635 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2636
2637 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2638 for (const MachineOperand &Use : MI->explicit_uses()) {
2639 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2640 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2641 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2642 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2643 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2644 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2645 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2646 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2647 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2648 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2649 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2650 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2651 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2652 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2653 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2654 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2655 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2656 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2657 const int MaxWaitStates = 19;
2658
2659 if (!Use.isReg())
2660 continue;
2661 Register Reg = Use.getReg();
2662 bool FullReg;
2663 const MachineInstr *MI1;
2664
2665 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2666 this](const MachineInstr &MI) {
2667 if (!SIInstrInfo::isMFMA(MI))
2668 return false;
2669 Register DstReg = MI.getOperand(0).getReg();
2670 FullReg = (DstReg == Reg);
2671 MI1 = &MI;
2672 return TRI.regsOverlap(DstReg, Reg);
2673 };
2674
2675 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2676 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2677 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2678
2679 int NumWaitStates =
2680 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2681 if (NumWaitStates == std::numeric_limits<int>::max())
2682 continue;
2683
2684 int OpNo = Use.getOperandNo();
2685 unsigned Opc1 = MI1->getOpcode();
2686 int NeedWaitStates = 0;
2687 if (OpNo == SrcCIdx) {
2688 if (!SIInstrInfo::isDGEMM(Opc) &&
2689 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
2690 NeedWaitStates = 0;
2691 } else if (FullReg) {
2692 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2693 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2694 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2695 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2696 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2697 else if (ST.hasGFX940Insts() &&
2698 TSchedModel.computeInstrLatency(MI1) == 2)
2699 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2700 } else {
2701 switch (Opc1) {
2702 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2703 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2704 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2705 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2706 if (!TII.isXDL(*MI))
2707 NeedWaitStates =
2708 ST.hasGFX950Insts()
2709 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2710 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2711 break;
2712 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2713 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2714 if (!TII.isXDL(*MI))
2715 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2716 break;
2717 default:
2718 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2719 if (ST.hasGFX940Insts()) {
2720 if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
2721 break;
2722
2723 NeedWaitStates =
2724 TII.isXDL(*MI1)
2725 ? (TII.isXDL(*MI)
2727 NumPasses, ST.hasGFX950Insts())
2729 NumPasses, ST.hasGFX950Insts()))
2731 NumPasses);
2732 break;
2733 }
2734
2735 switch (NumPasses) {
2736 case 2:
2737 NeedWaitStates =
2738 SIInstrInfo::isDGEMM(Opc)
2739 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2740 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2741 break;
2742 case 8:
2743 NeedWaitStates =
2744 SIInstrInfo::isDGEMM(Opc)
2745 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2746 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2747 break;
2748 case 16:
2749 NeedWaitStates =
2751 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2752 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2753 break;
2754 default:
2755 llvm_unreachable("unexpected number of passes");
2756 }
2757 }
2758 }
2759 } else {
2760 switch (Opc1) {
2761 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2762 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2763 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2764 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2765 NeedWaitStates =
2766 ST.hasGFX950Insts()
2767 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2768 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2769 break;
2770 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2771 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2772 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2773 break;
2774 default:
2775 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2776
2777 if (ST.hasGFX940Insts()) {
2778 NeedWaitStates =
2779 TII.isXDL(*MI1)
2781 NumPasses, ST.hasGFX950Insts())
2783 NumPasses);
2784 break;
2785 }
2786
2787 switch (NumPasses) {
2788 case 2:
2789 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2790 break;
2791 case 4:
2792 llvm_unreachable("unexpected number of passes for mfma");
2793 case 8:
2794 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2795 break;
2796 case 16:
2797 default:
2798 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2799 }
2800 }
2801 }
2802 if (WaitStatesNeeded >= NeedWaitStates)
2803 continue;
2804
2805 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2806 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2807
2808 if (WaitStatesNeeded == MaxWaitStates)
2809 break;
2810 }
2811
2812 // Pad neighboring MFMA with noops for better inter-wave performance.
2813 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2814
2815 return WaitStatesNeeded;
2816}
2817
2818int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2819 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2820 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2821 return 0;
2822
2823 int WaitStatesNeeded = 0;
2824
2825 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2826 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2827 };
2828
2829 for (const MachineOperand &Op : MI->explicit_uses()) {
2830 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2831 continue;
2832
2833 Register Reg = Op.getReg();
2834
2835 const int AccVgprReadLdStWaitStates = 2;
2836 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2837 const int MaxWaitStates = 2;
2838
2839 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2840 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2841 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2842
2843 if (WaitStatesNeeded == MaxWaitStates)
2844 return WaitStatesNeeded; // Early exit.
2845
2846 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2847 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2848 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2849 return false;
2850 auto IsVALUFn = [](const MachineInstr &MI) {
2852 };
2853 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2854 std::numeric_limits<int>::max();
2855 };
2856
2857 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2858 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2859 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2860 }
2861
2862 return WaitStatesNeeded;
2863}
2864
2865int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2866 assert(!ST.hasVcmpxPermlaneHazard() &&
2867 "this is a different vcmpx+permlane hazard");
2868 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2869 const SIInstrInfo *TII = ST.getInstrInfo();
2870
2871 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2872 return isVCmpXWritesExec(*TII, *TRI, MI);
2873 };
2874
2875 auto IsVALUFn = [](const MachineInstr &MI) {
2876 return SIInstrInfo::isVALU(MI);
2877 };
2878
2879 const int VCmpXWritesExecWaitStates = 4;
2880 const int VALUWritesVDstWaitStates = 2;
2881 int WaitStatesNeeded = 0;
2882
2883 for (const MachineOperand &Op : MI->explicit_uses()) {
2884 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2885 continue;
2886 Register Reg = Op.getReg();
2887
2888 int WaitStatesSinceDef =
2889 VALUWritesVDstWaitStates -
2890 getWaitStatesSinceDef(Reg, IsVALUFn,
2891 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
2892 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2893 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2894 break;
2895 }
2896
2897 int VCmpXHazardWaits =
2898 VCmpXWritesExecWaitStates -
2899 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2900
2901 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2902 return WaitStatesNeeded;
2903}
2904
2906 // 2 pass -> 4
2907 // 4 pass -> 6
2908 // 8 pass -> 10
2909 // 16 pass -> 18
2910 return NumPasses + 2;
2911}
2912
2914 bool IsGFX950) {
2915 // xdl def cycles | gfx942 | gfx950
2916 // 2 pass | 5 5
2917 // 4 pass | 7 8
2918 // 8 pass | 11 12
2919 // 16 pass | 19 20
2920 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2921}
2922
2924 bool IsGFX950) {
2925 // xdl def cycles | gfx942 | gfx950
2926 // 2 pass | 5 5
2927 // 4 pass | 7 8
2928 // 8 pass | 11 12
2929 // 16 pass | 19 20
2930 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2931}
2932
2934 // 2 pass -> 4
2935 // 4 pass -> 6
2936 // 8 pass -> 10
2937 // 16 pass -> 18
2938 return NumPasses + 2;
2939}
2940
2941int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2942 if (!ST.hasGFX90AInsts())
2943 return 0;
2944
2945 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2946 return SIInstrInfo::isDGEMM(MI.getOpcode());
2947 };
2948
2949 // This is checked in checkMAIHazards90A()
2950 if (SIInstrInfo::isMFMA(*MI))
2951 return 0;
2952
2953 const MachineRegisterInfo &MRI = MF.getRegInfo();
2954
2955 int WaitStatesNeeded = 0;
2956
2957 bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
2958 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2959 bool IsVALU = SIInstrInfo::isVALU(*MI);
2960
2961 const MachineInstr *MFMA = nullptr;
2962 unsigned Reg;
2963 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2964 if (!SIInstrInfo::isMFMA(MI) ||
2965 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2966 return false;
2967 MFMA = &MI;
2968 return true;
2969 };
2970
2971 const MachineInstr *DOT = nullptr;
2972 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2973 if (!SIInstrInfo::isDOT(MI) ||
2974 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2975 return false;
2976 DOT = &MI;
2977 return true;
2978 };
2979
2980 bool DGEMMAfterVALUWrite = false;
2981 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2982 // Found DGEMM on reverse traversal to def.
2983 if (SIInstrInfo::isDGEMM(MI.getOpcode()))
2984 DGEMMAfterVALUWrite = true;
2985
2986 // Only a hazard if the register is defined by a VALU and a DGEMM is found
2987 // after the def.
2988 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2989 return false;
2990
2991 return true;
2992 };
2993
2994 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2995 AMDGPU::OpName::src2);
2996
2997 if (IsMemOrExport || IsVALU) {
2998 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2999 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3000 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3001 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3002 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3003 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3004 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3005 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3006 const int DotWriteSameDotReadSrcAB = 3;
3007 const int DotWriteDifferentVALURead = 3;
3008 const int DMFMABetweenVALUWriteVMEMRead = 2;
3009 const int MaxWaitStates = 19;
3010
3011 for (const MachineOperand &Use : MI->explicit_uses()) {
3012 if (!Use.isReg())
3013 continue;
3014 Reg = Use.getReg();
3015
3016 DOT = nullptr;
3017 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3018 MaxWaitStates);
3019 if (DOT) {
3020 int NeedWaitStates = 0;
3021 if (DOT->getOpcode() == MI->getOpcode()) {
3022 if (&Use - &MI->getOperand(0) != SrcCIdx)
3023 NeedWaitStates = DotWriteSameDotReadSrcAB;
3024 } else {
3025 NeedWaitStates = DotWriteDifferentVALURead;
3026 }
3027
3028 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3029 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3030 }
3031
3032 // Workaround for a HW data hazard bug observed only in GFX90A. When there
3033 // is a DGEMM instruction in-between a VALU and a VMEM instruction, the SQ
3034 // incorrectly fails to insert the two wait states between those two
3035 // instructions that are needed to avoid the data hazard.
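 // Hypothetical example of the problematic shape (registers invented):
 //   v_mov_b32        v0, ...   ; VALU writes v0
 //   v_mfma_f64_...             ; DGEMM in between
 //   flat_store_dword ..., v0   ; VMEM reads v0
 // The wait states must therefore be enforced explicitly here.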
3036 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3037 DGEMMAfterVALUWrite = false;
3038 if (TRI.isVectorRegister(MRI, Reg)) {
3039 int WaitStatesNeededForUse =
3040 DMFMABetweenVALUWriteVMEMRead -
3041 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
3042 DMFMABetweenVALUWriteVMEMRead);
3043
3044 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3045 }
3046 }
3047
3048 MFMA = nullptr;
3049 WaitStatesSinceDef =
3050 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3051 if (!MFMA)
3052 continue;
3053
3054 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3055 int NumPasses = HazardDefLatency;
3056 int NeedWaitStates = MaxWaitStates;
3057
3058 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3059 switch (HazardDefLatency) {
3060 case 4:
3061 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3062 : DMFMA4x4WriteVgprVALUReadWaitStates;
3063 break;
3064 case 8:
3065 case 16:
3066 NeedWaitStates =
3067 IsMemOrExport
3068 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3069 : (ST.hasGFX950Insts()
3070 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3071 : DMFMA16x16WriteVgprVALUReadWaitStates);
3072 break;
3073 default:
3074 llvm_unreachable("unexpected dgemm");
3075 }
3076 } else if (ST.hasGFX940Insts()) {
3077 NeedWaitStates =
3078 TII.isXDL(*MFMA)
3080 NumPasses, ST.hasGFX950Insts())
3082 NumPasses);
3083 } else {
3084 switch (HazardDefLatency) {
3085 case 2:
3086 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3087 break;
3088 case 8:
3089 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3090 break;
3091 case 16:
3092 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3093 break;
3094 default:
3095 llvm_unreachable("unexpected number of passes for mfma");
3096 }
3097 }
3098
3099 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3100 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3101
3102 if (WaitStatesNeeded == MaxWaitStates)
3103 break;
3104 }
3105 }
3106
3107 unsigned Opc = MI->getOpcode();
3108 const int DMFMAToFMA64WaitStates = 2;
3109 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3110 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3111 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3112 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3113 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3114 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3115 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3116 }
3117
3118 if (!IsVALU && !IsMemOrExport)
3119 return WaitStatesNeeded;
3120
3121 for (const MachineOperand &Def : MI->defs()) {
3122 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3123 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3124 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3125 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3126 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3127 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3128 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3129 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3130 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3131 const int DotWriteDifferentVALUWrite = 3;
3132 const int MaxWaitStates = 19;
3133 const int MaxWarWaitStates = 15;
3134
3135 Reg = Def.getReg();
3136
3137 DOT = nullptr;
3138 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3139 MaxWaitStates);
3140 if (DOT && DOT->getOpcode() != MI->getOpcode())
3141 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3142 WaitStatesSinceDef);
3143
3144 MFMA = nullptr;
3145 WaitStatesSinceDef =
3146 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3147 if (MFMA) {
3148 int NeedWaitStates = MaxWaitStates;
3149 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
3150
3151 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3152 switch (NumPasses) {
3153 case 4:
3154 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3155 break;
3156 case 8:
3157 case 16:
3158 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3159 break;
3160 default:
3161 llvm_unreachable("unexpected number of cycles for dgemm");
3162 }
3163 } else if (ST.hasGFX940Insts()) {
3164 NeedWaitStates =
3165 TII.isXDL(*MFMA)
3167 NumPasses, ST.hasGFX950Insts())
3169 } else {
3170 switch (NumPasses) {
3171 case 2:
3172 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3173 break;
3174 case 8:
3175 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3176 break;
3177 case 16:
3178 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3179 break;
3180 default:
3181 llvm_unreachable("Unexpected number of passes for mfma");
3182 }
3183 }
3184
3185 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3186 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3187
3188 if (WaitStatesNeeded == MaxWaitStates)
3189 break;
3190 }
3191
3192 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3193 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
3194 !MI.readsRegister(Reg, &TRI))
3195 return false;
3196
3197 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3198 return false;
3199
3200 const MachineOperand *SrcC =
3201 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
3202 assert(SrcC);
3203 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
3204 return false;
3205
3206 MFMA = &MI;
3207 return true;
3208 };
3209
3210 MFMA = nullptr;
3211 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3212 MaxWarWaitStates);
3213 if (!MFMA)
3214 continue;
3215
3216 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3217 int NeedWaitStates = MaxWaitStates;
3218 switch (HazardDefLatency) {
3219 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3220 break;
3221 case 4: assert(ST.hasGFX940Insts());
3222 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3223 break;
3224 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3225 break;
3226 case 16: [[fallthrough]];
3227 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3228 break;
3229 }
3230
3231 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3232 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3233 }
3234
3235 return WaitStatesNeeded;
3236}
3237
3238bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
3239 if (!SU->isInstr())
3240 return false;
3241
3242 const MachineInstr *MAI = nullptr;
3243
3244 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3245 MAI = nullptr;
3246 if (SIInstrInfo::isMFMA(MI))
3247 MAI = &MI;
3248 return MAI != nullptr;
3249 };
3250
3251 MachineInstr *MI = SU->getInstr();
3252 if (IsMFMAFn(*MI)) {
3253 int W = getWaitStatesSince(IsMFMAFn, 16);
3254 if (MAI)
3255 return W < (int)TSchedModel.computeInstrLatency(MAI);
3256 }
3257
3258 return false;
3259}
3260
3261// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3262// insertion of a new instruction.
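// For instance (hypothetical bundle): inserting a 4-byte S_WAITCNT_DEPCTR
// right after S_GETPC_B64 means every global-symbol offset used later in the
// same bundle (e.g. by S_ADD_U32/S_ADDC_U32) must be increased by 4.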
3263static void updateGetPCBundle(MachineInstr *NewMI) {
3264 if (!NewMI->isBundled())
3265 return;
3266
3267 // Find start of bundle.
3268 auto I = NewMI->getIterator();
3269 while (I->isBundledWithPred())
3270 I--;
3271 if (I->isBundle())
3272 I++;
3273
3274 // Bail if this is not an S_GETPC bundle.
3275 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3276 return;
3277
3278 // Update offsets of any references in the bundle.
3279 const unsigned NewBytes = 4;
3280 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3281 "Unexpected instruction insertion in bundle");
3282 auto NextMI = std::next(NewMI->getIterator());
3283 auto End = NewMI->getParent()->end();
3284 while (NextMI != End && NextMI->isBundledWithPred()) {
3285 for (auto &Operand : NextMI->operands()) {
3286 if (Operand.isGlobal())
3287 Operand.setOffset(Operand.getOffset() + NewBytes);
3288 }
3289 NextMI++;
3290 }
3291}
3292
3293bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
3294 if (!ST.hasVALUMaskWriteHazard())
3295 return false;
3296 assert(!ST.hasExtendedWaitCounts());
3297
3298 if (!ST.isWave64())
3299 return false;
3300
3301 const bool IsSALU = SIInstrInfo::isSALU(*MI);
3302 const bool IsVALU = SIInstrInfo::isVALU(*MI);
3303 if (!IsSALU && !IsVALU)
3304 return false;
3305
3306 // The hazard sequence is three instructions:
3307 // 1. VALU reads SGPR as mask
3308 // 2. VALU/SALU writes SGPR
3309 // 3. VALU/SALU reads SGPR
3310 // The hazard can expire if the distance between 2 and 3 is sufficient,
3311 // or (2) is VALU and (3) is SALU.
3312 // In practice this happens <10% of the time, hence always assume the hazard
3313 // exists if (1) and (2) are present to avoid searching all SGPR reads.
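 // Illustrative sketch (registers invented for this example):
 //   v_cndmask_b32  v0, v1, v2, s[4:5]   ; (1) VALU reads SGPR pair as mask
 //   s_mov_b64      s[4:5], -1           ; (2) this MI writes the SGPR pair
 //   s_and_b64      s[6:7], s[4:5], ...  ; (3) subsequent SGPR read
 // The fix is an s_waitcnt_depctr inserted right after (2), as done below.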
3314
3315 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3316 const MachineRegisterInfo &MRI = MF.getRegInfo();
3317
3318 auto IgnoreableSGPR = [](const Register Reg) {
3319 switch (Reg) {
3320 case AMDGPU::EXEC:
3321 case AMDGPU::EXEC_LO:
3322 case AMDGPU::EXEC_HI:
3323 case AMDGPU::M0:
3324 case AMDGPU::SGPR_NULL:
3325 case AMDGPU::SGPR_NULL64:
3326 case AMDGPU::SCC:
3327 return true;
3328 default:
3329 return false;
3330 }
3331 };
3332 auto IsVCC = [](const Register Reg) {
3333 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3334 };
3335
3336 struct StateType {
3337 SmallSet<Register, 2> HazardSGPRs;
3338
3339 static unsigned getHashValue(const StateType &State) {
3340 return hash_combine_range(State.HazardSGPRs);
3341 }
3342 static bool isEqual(const StateType &LHS, const StateType &RHS) {
3343 return LHS.HazardSGPRs == RHS.HazardSGPRs;
3344 }
3345 };
3346
3347 SmallVector<const MachineInstr *> WaitInstrs;
3348 bool HasSGPRRead = false;
3349 StateType InitialState;
3350
3351 // Look for SGPR write.
3352 MachineOperand *HazardDef = nullptr;
3353 for (MachineOperand &Op : MI->operands()) {
3354 if (!Op.isReg())
3355 continue;
3356 if (Op.isDef() && HazardDef)
3357 continue;
3358
3359 Register Reg = Op.getReg();
3360 if (IgnoreableSGPR(Reg))
3361 continue;
3362 if (!IsVCC(Reg)) {
3363 if (Op.isImplicit())
3364 continue;
3365 if (!TRI->isSGPRReg(MRI, Reg))
3366 continue;
3367 }
3368 // Also check for SGPR reads.
3369 if (Op.isUse()) {
3370 HasSGPRRead = true;
3371 continue;
3372 }
3373
3374 assert(!HazardDef);
3375 HazardDef = &Op;
3376 }
3377
3378 if (!HazardDef)
3379 return false;
3380
3381 // Set up tracking of writes to individual SGPRs
3382 const Register HazardReg = HazardDef->getReg();
3383 if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
3384 InitialState.HazardSGPRs.insert(HazardReg);
3385 } else {
3386 assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
3387 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
3388 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
3389 }
3390
3391 auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
3392 if (State.HazardSGPRs.empty())
3393 return HazardExpired;
3394
3395 switch (I.getOpcode()) {
3396 case AMDGPU::V_ADDC_U32_e32:
3397 case AMDGPU::V_ADDC_U32_dpp:
3398 case AMDGPU::V_CNDMASK_B16_t16_e32:
3399 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3400 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3401 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3402 case AMDGPU::V_CNDMASK_B32_e32:
3403 case AMDGPU::V_CNDMASK_B32_dpp:
3404 case AMDGPU::V_DIV_FMAS_F32_e64:
3405 case AMDGPU::V_DIV_FMAS_F64_e64:
3406 case AMDGPU::V_SUBB_U32_e32:
3407 case AMDGPU::V_SUBB_U32_dpp:
3408 case AMDGPU::V_SUBBREV_U32_e32:
3409 case AMDGPU::V_SUBBREV_U32_dpp: {
3410 // These implicitly read VCC as mask source.
3411 return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
3412 }
3413 case AMDGPU::V_ADDC_U32_e64:
3414 case AMDGPU::V_ADDC_U32_e64_dpp:
3415 case AMDGPU::V_CNDMASK_B16_t16_e64:
3416 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3417 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3418 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3419 case AMDGPU::V_CNDMASK_B32_e64:
3420 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3421 case AMDGPU::V_SUBB_U32_e64:
3422 case AMDGPU::V_SUBB_U32_e64_dpp:
3423 case AMDGPU::V_SUBBREV_U32_e64:
3424 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3425 // Only check mask register overlaps.
3426 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3427 assert(SSRCOp);
3428 bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
3429 return Result ? HazardFound : NoHazardFound;
3430 }
3431 default:
3432 return NoHazardFound;
3433 }
3434 };
3435
3436 const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
3437 AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0),
3438 0),
3439 0);
3440 auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
3441 switch (I.getOpcode()) {
3442 case AMDGPU::S_WAITCNT_DEPCTR:
3443 // Record mergeable waits within a region of instructions free of SGPR reads.
3444 if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
3445 (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3446 WaitInstrs.push_back(&I);
3447 break;
3448 default:
3449 // Update tracking of SGPR reads and writes.
3450 for (auto &Op : I.operands()) {
3451 if (!Op.isReg())
3452 continue;
3453
3454 Register Reg = Op.getReg();
3455 if (IgnoreableSGPR(Reg))
3456 continue;
3457 if (!IsVCC(Reg)) {
3458 if (Op.isImplicit())
3459 continue;
3460 if (!TRI->isSGPRReg(MRI, Reg))
3461 continue;
3462 }
3463 if (Op.isUse()) {
3464 HasSGPRRead = true;
3465 continue;
3466 }
3467
3468 // Stop tracking any SGPRs with writes on the basis that they will
3469 // already have an appropriate wait inserted afterwards.
3471 for (Register SGPR : State.HazardSGPRs) {
3472 if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
3473 Found.push_back(SGPR);
3474 }
3475 for (Register SGPR : Found)
3476 State.HazardSGPRs.erase(SGPR);
3477 }
3478 break;
3479 }
3480 };
3481
3482 // Check for hazard
3483 if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
3484 MI->getParent(),
3485 std::next(MI->getReverseIterator())))
3486 return false;
3487
3488 // Compute counter mask
3489 unsigned DepCtr =
3490 IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
3491 : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
3492 : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);
3493
3494 // Try to merge previous waits into this one for regions with no SGPR reads.
3495 if (!WaitInstrs.empty()) {
3496 // Note: WaitInstrs contains const pointers, so walk backward from MI to
3497 // obtain a mutable pointer to each instruction to be merged.
3498 // This is expected to be a very short walk within the same block.
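    // Example (hypothetical masks): if an earlier wait already carries
    // sa_sdst==0 and the new requirement is va_sdst==0, the two are folded
    // into a single S_WAITCNT_DEPCTR with both fields at 0, since each field
    // takes the minimum of the merged values.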
3499 SmallVector<MachineInstr *> ToErase;
3500 unsigned Found = 0;
3501 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3502 End = MI->getParent()->rend();
3503 Found < WaitInstrs.size() && It != End; ++It) {
3504 MachineInstr *WaitMI = &*It;
3505 // Find next wait instruction.
3506 if (std::as_const(WaitMI) != WaitInstrs[Found])
3507 continue;
3508 Found++;
3509 unsigned WaitMask = WaitMI->getOperand(0).getImm();
3510 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3511 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3512 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
3513 AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
3514 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3515 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
3516 AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
3517 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3518 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
3519 AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
3520 ToErase.push_back(WaitMI);
3521 }
3522 assert(Found == WaitInstrs.size());
3523 for (MachineInstr *WaitMI : ToErase)
3524 WaitMI->eraseFromParent();
3525 }
3526
3527 // Add s_waitcnt_depctr after SGPR write.
3528 auto NextMI = std::next(MI->getIterator());
3529 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3530 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3531 .addImm(DepCtr);
3532
3533 // SALU write may be s_getpc in a bundle.
3534 updateGetPCBundle(NewMI);
3535
3536 return true;
3537}
3538
3539static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3540 const SIInstrInfo &TII) {
3541 MachineBasicBlock &EntryMBB = MF->front();
3542 if (EntryMBB.begin() != EntryMBB.end()) {
3543 auto &EntryMI = *EntryMBB.begin();
3544 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3545 EntryMI.getOperand(0).getImm() >= Priority)
3546 return false;
3547 }
3548
3549 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3550 .addImm(Priority);
3551 return true;
3552}
3553
3554bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3555 if (!ST.hasRequiredExportPriority())
3556 return false;
3557
3558 // Assume the following shader types will never have exports,
3559 // and avoid adding or adjusting S_SETPRIO.
3560 MachineBasicBlock *MBB = MI->getParent();
3561 MachineFunction *MF = MBB->getParent();
3562 auto CC = MF->getFunction().getCallingConv();
3563 switch (CC) {
3564 case CallingConv::AMDGPU_CS:
3565 case CallingConv::AMDGPU_CS_Chain:
3566 case CallingConv::AMDGPU_CS_ChainPreserve:
3567 case CallingConv::AMDGPU_KERNEL:
3568 return false;
3569 default:
3570 break;
3571 }
3572
3573 const int MaxPriority = 3;
3574 const int NormalPriority = 2;
3575 const int PostExportPriority = 0;
3576
3577 auto It = MI->getIterator();
3578 switch (MI->getOpcode()) {
3579 case AMDGPU::S_ENDPGM:
3580 case AMDGPU::S_ENDPGM_SAVED:
3581 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3582 case AMDGPU::SI_RETURN_TO_EPILOG:
3583 // Ensure shader with calls raises priority at entry.
3584 // This ensures correct priority if exports exist in callee.
3585 if (MF->getFrameInfo().hasCalls())
3586 return ensureEntrySetPrio(MF, NormalPriority, TII);
3587 return false;
3588 case AMDGPU::S_SETPRIO: {
3589 // Raise minimum priority unless in workaround.
3590 auto &PrioOp = MI->getOperand(0);
3591 int Prio = PrioOp.getImm();
3592 bool InWA = (Prio == PostExportPriority) &&
3593 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3594 if (InWA || Prio >= NormalPriority)
3595 return false;
3596 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3597 return true;
3598 }
3599 default:
3600 if (!TII.isEXP(*MI))
3601 return false;
3602 break;
3603 }
3604
3605 // Check entry priority at each export (as there will only be a few).
3606 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3607 bool Changed = false;
3608 if (CC != CallingConv::AMDGPU_Gfx)
3609 Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3610
3611 auto NextMI = std::next(It);
3612 bool EndOfShader = false;
3613 if (NextMI != MBB->end()) {
3614 // Only need WA at end of sequence of exports.
3615 if (TII.isEXP(*NextMI))
3616 return Changed;
3617 // Assume appropriate S_SETPRIO after export means WA already applied.
3618 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3619 NextMI->getOperand(0).getImm() == PostExportPriority)
3620 return Changed;
3621 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3622 }
3623
3624 const DebugLoc &DL = MI->getDebugLoc();
3625
3626 // Lower priority.
3627 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3628 .addImm(PostExportPriority);
3629
3630 if (!EndOfShader) {
3631 // Wait for exports to complete.
3632 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3633 .addReg(AMDGPU::SGPR_NULL)
3634 .addImm(0);
3635 }
3636
3637 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3638 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3639
3640 if (!EndOfShader) {
3641 // Return to normal (higher) priority.
3642 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3643 .addImm(NormalPriority);
3644 }
3645
3646 return true;
3647}
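// Rough shape of the workaround emitted after the final export of a sequence
// (assembly spelling approximate):
//   s_setprio 0              ; PostExportPriority
//   s_waitcnt_expcnt null, 0 ; skipped when the next instruction is s_endpgm
//   s_nop 0
//   s_nop 0
//   s_setprio 2              ; NormalPriority, also skipped at end of shader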
3648
3649bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
3650 if (!isSGetReg(MI->getOpcode()))
3651 return false;
3652
3653 const SIInstrInfo *TII = ST.getInstrInfo();
3654 switch (getHWReg(TII, *MI)) {
3655 default:
3656 return false;
3661 break;
3662 }
3663
3664 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3665 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3666 .addImm(0);
3667 return true;
3668}
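// The inserted s_waitcnt_depctr carries an immediate of 0, which encodes every
// dependency counter field as 0, i.e. a full wait-for-idle before the s_getreg
// of the hwreg IDs selected by the switch above.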
3669
3670bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3671 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3672 return false;
3673
3674 const SIInstrInfo *TII = ST.getInstrInfo();
3675 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3676 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3678 BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
3679 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3681
3682 return true;
3683}
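// Net effect (a sketch based only on the BuildMI calls above): each
// ds_atomic_async_barrier_arrive_b64 ends up bracketed by s_waitcnt_depctr
// instructions, one placed immediately before it and one immediately after it.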
3684
3685bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
3686 // No reason to check this in pre-RA scheduling; SGPRs have to be allocated
3687 // for the hazard to trigger.
3688 if (!IsHazardRecognizerMode)
3689 return false;
3690
3691 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3692 const SIInstrInfo *TII = ST.getInstrInfo();
3693 // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
3694 const int FlatScrBaseWaitStates = 10;
3695
3696 bool ReadsFlatScrLo =
3697 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3698 bool ReadsFlatScrHi =
3699 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3700 if (isSGetReg(MI->getOpcode())) {
3701 switch (getHWReg(TII, *MI)) {
3702 default:
3703 break;
3705 ReadsFlatScrLo = true;
3706 break;
3708 ReadsFlatScrHi = true;
3709 break;
3710 }
3711 }
3712
3713 const MachineRegisterInfo &MRI = MF.getRegInfo();
3714
3715 auto IsRegDefHazard = [&](Register Reg) -> bool {
3716 DenseSet<const MachineBasicBlock *> Visited;
3717 auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
3718 return MI.modifiesRegister(Reg, TRI);
3719 };
3720
3721 // This deliberately repurposes the waitstate count: instead of waitstates it
3722 // returns 1 when an SGPR is written and 0 otherwise.
3723 auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
3724 if (!TII->isSALU(MI) && !TII->isVALU(MI))
3725 return 0;
3726 for (const MachineOperand &MO : MI.all_defs()) {
3727 if (TRI->isSGPRReg(MRI, MO.getReg()))
3728 return 1;
3729 }
3730 return 0;
3731 };
3732
3733 auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
3734 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3735 unsigned Wait = MI.getOperand(0).getImm();
3738 return true;
3739 }
3740 return SgprWrites >= FlatScrBaseWaitStates;
3741 };
3742
3743 return ::getWaitStatesSince(
3744 IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
3745 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3746 };
3747
3748 if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
3749 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3750 (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
3751 !IsRegDefHazard(AMDGPU::SGPR103)))
3752 return false;
3753
3754 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3755 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3758 return true;
3759}
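// Note on the search above: IsSGPRDef is passed as the GetNumWaitStates
// callback, so the accumulated count tallies SALU/VALU instructions that write
// an SGPR rather than machine cycles; the hazard is considered expired once
// FlatScrBaseWaitStates (10) such writes are seen or a matching
// s_waitcnt_depctr is found, and otherwise a wait is inserted before MI.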
3760
3761bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3762 if (!isSSetReg(MI->getOpcode()) ||
3763 MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
3764 return false;
3765
3766 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3767 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3768 return true;
3769}
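// Resulting sequence (a sketch, assembly spelling approximate): two v_nop
// instructions are placed directly in front of any s_setreg that targets the
// MODE hardware register, e.g.
//   v_nop
//   v_nop
//   s_setreg_b32 hwreg(HW_REG_MODE, ...), s0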