LLVM 23.0.0git
GCNHazardRecognizer.cpp
Go to the documentation of this file.
1//===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "AMDGPUWaitcntUtils.h"
15#include "GCNSubtarget.h"
18#include "llvm/ADT/Statistic.h"
23#include "llvm/Support/Debug.h"
25
26using namespace llvm;
27
28#define DEBUG_TYPE "gcn-hazard-recognizer"
29
30STATISTIC(NumWMMANopsHoisted,
31 "Number of WMMA hazard V_NOPs hoisted from loops");
32STATISTIC(NumWMMAHoistingBailed,
33 "Number of WMMA hazards where V_NOP hoisting was not possible");
34
35namespace {
36
37struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
38 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
39
40 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
41 if (Arg.getAsInteger(0, Value))
42 return O.error("'" + Arg + "' value invalid for uint argument!");
43
44 if (Value > 100)
45 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
46
47 return false;
48 }
49};
50
51} // end anonymous namespace
52
54 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
55 cl::desc("Fill a percentage of the latency between "
56 "neighboring MFMA with s_nops."));
57
58// This is intended for debugging purposes only.
60 NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
61 cl::desc("Insert a s_nop x before every instruction"));
62
64 "amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden,
65 cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
66
67//===----------------------------------------------------------------------===//
68// Hazard Recognizer Implementation
69//===----------------------------------------------------------------------===//
70
72 const GCNSubtarget &ST);
73
75 MachineLoopInfo *MLI)
76 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
77 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
78 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
79 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
80 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
81 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
82}
83
85 EmittedInstrs.clear();
86 EmittedVALUInstrs.clear();
87 HasPendingWMMACoexecHazard = false;
88}
89
93
95 CurrCycleInstr = MI;
96}
97
98static bool isDivFMas(unsigned Opcode) {
99 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
100}
101
102static bool isSGetReg(unsigned Opcode) {
103 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
104}
105
106static bool isSSetReg(unsigned Opcode) {
107 switch (Opcode) {
108 case AMDGPU::S_SETREG_B32:
109 case AMDGPU::S_SETREG_B32_mode:
110 case AMDGPU::S_SETREG_IMM32_B32:
111 case AMDGPU::S_SETREG_IMM32_B32_mode:
112 return true;
113 }
114 return false;
115}
116
117static bool isRWLane(unsigned Opcode) {
118 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
119}
120
121static bool isRFE(unsigned Opcode) {
122 return Opcode == AMDGPU::S_RFE_B64;
123}
124
125static bool isSMovRel(unsigned Opcode) {
126 switch (Opcode) {
127 case AMDGPU::S_MOVRELS_B32:
128 case AMDGPU::S_MOVRELS_B64:
129 case AMDGPU::S_MOVRELD_B32:
130 case AMDGPU::S_MOVRELD_B64:
131 return true;
132 default:
133 return false;
134 }
135}
136
138 const MachineInstr &MI) {
139 if (TII.isAlwaysGDS(MI.getOpcode()))
140 return true;
141
142 switch (MI.getOpcode()) {
143 case AMDGPU::S_SENDMSG:
144 case AMDGPU::S_SENDMSGHALT:
145 case AMDGPU::S_TTRACEDATA:
146 return true;
147 // These DS opcodes don't support GDS.
148 case AMDGPU::DS_NOP:
149 case AMDGPU::DS_PERMUTE_B32:
150 case AMDGPU::DS_BPERMUTE_B32:
151 return false;
152 default:
153 if (TII.isDS(MI.getOpcode())) {
154 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
155 AMDGPU::OpName::gds);
156 if (MI.getOperand(GDS).getImm())
157 return true;
158 }
159 return false;
160 }
161}
162
163static bool isPermlane(const MachineInstr &MI) {
164 unsigned Opcode = MI.getOpcode();
165 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE64_B32 ||
167 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
171 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
173 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
176 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
177 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
178 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
179}
180
181static bool isLdsDma(const MachineInstr &MI) {
182 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) &&
184}
185
186static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
187 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
188 AMDGPU::OpName::simm16);
189 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
190}
191
194 MachineInstr *MI = SU->getInstr();
195 // If we are not in "HazardRecognizerMode" and therefore not being run from
196 // the scheduler, track possible stalls from hazards but don't insert noops.
197 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
198
199 if (MI->isBundle())
200 return NoHazard;
201
202 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
203 return HazardType;
204
205 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
206 return HazardType;
207
208 if (checkFPAtomicToDenormModeHazard(MI) > 0)
209 return HazardType;
210
211 // Hazards which cannot be mitigated with S_NOPs.
212 if (!IsHazardRecognizerMode) {
213 if (checkWMMACoexecutionHazards(MI) > 0) {
214 HasPendingWMMACoexecHazard = true;
215 return Hazard;
216 }
217 }
218
219 if (ST.hasNoDataDepHazard())
220 return NoHazard;
221
222 if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
223 return HazardType;
224
225 if (SIInstrInfo::isVALU(*MI, /*AllowLDSDMA=*/true) &&
226 checkVALUHazards(MI) > 0)
227 return HazardType;
228
229 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
230 return HazardType;
231
232 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
233 return HazardType;
234
235 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
236 return HazardType;
237
238 if ((SIInstrInfo::isVALU(*MI, /*AllowLDSDMA=*/true) ||
241 checkMAIVALUHazards(MI) > 0)
242 return HazardType;
243
244 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
245 return HazardType;
246
247 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
248 return HazardType;
249
250 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
251 return HazardType;
252
253 if (((ST.hasReadM0MovRelInterpHazard() &&
254 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
255 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
256 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
257 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
258 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
259 (ST.hasReadM0LdsDirectHazard() &&
260 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
261 checkReadM0Hazards(MI) > 0)
262 return HazardType;
263
264 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
265 return HazardType;
266
268 checkMAILdStHazards(MI) > 0)
269 return HazardType;
270
271 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
272 return HazardType;
273
274 return NoHazard;
275}
276
278 unsigned Quantity) {
279 while (Quantity > 0) {
280 unsigned Arg = std::min(Quantity, 8u);
281 Quantity -= Arg;
282 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
283 .addImm(Arg - 1);
284 }
285}
286
287unsigned
288GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
289 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
290 assert(TSchedModel.getWriteProcResBegin(SC) !=
291 TSchedModel.getWriteProcResEnd(SC));
292 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
293}
294
295void GCNHazardRecognizer::processBundle() {
296 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
297 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
298 // Check bundled MachineInstr's for hazards.
299 for (; MI != E && MI->isInsideBundle(); ++MI) {
300 CurrCycleInstr = &*MI;
301 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
302
303 if (IsHazardRecognizerMode) {
304 fixHazards(CurrCycleInstr);
305
306 insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
307 }
308
309 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
310 // include the bundled MI directly after, only add a maximum of
311 // (MaxLookAhead - 1) noops to EmittedInstrs.
312 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
313 EmittedInstrs.push_front(nullptr);
314
315 EmittedInstrs.push_front(CurrCycleInstr);
316 EmittedInstrs.resize(MaxLookAhead);
317 }
318 CurrCycleInstr = nullptr;
319}
320
321void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
322 assert(IsHazardRecognizerMode);
323
324 unsigned NumPreNoops = PreEmitNoops(MI);
325 EmitNoops(NumPreNoops);
326 if (MI->isInsideBundle())
327 insertNoopsInBundle(MI, TII, NumPreNoops);
328 else
329 TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
330 NumPreNoops);
332 AdvanceCycle();
333}
334
336 IsHazardRecognizerMode = true;
337 CurrCycleInstr = MI;
338 unsigned W = PreEmitNoopsCommon(MI);
339 fixHazards(MI);
340 CurrCycleInstr = nullptr;
341 return std::max(W, NopPadding.getValue());
342}
343
347
349 if (MI->isBundle())
350 return 0;
351
352 int WaitStates = 0;
353
355 return std::max(WaitStates, checkSMRDHazards(MI));
356
357 if (ST.hasNSAtoVMEMBug())
358 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
359
360 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
361
362 if (ST.hasNoDataDepHazard())
363 return WaitStates;
364
366 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
367
368 if (SIInstrInfo::isVALU(*MI, /*AllowLDSDMA=*/true))
369 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
370
372 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
373
374 if (isDivFMas(MI->getOpcode()))
375 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
376
377 if (isRWLane(MI->getOpcode()))
378 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
379
380 if ((SIInstrInfo::isVALU(*MI, /*AllowLDSDMA=*/true) ||
383 checkMAIVALUHazards(MI) > 0)
384 WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
385
386 if (MI->isInlineAsm())
387 return std::max(WaitStates, checkInlineAsmHazards(MI));
388
389 if (isSGetReg(MI->getOpcode()))
390 return std::max(WaitStates, checkGetRegHazards(MI));
391
392 if (isSSetReg(MI->getOpcode()))
393 return std::max(WaitStates, checkSetRegHazards(MI));
394
395 if (isRFE(MI->getOpcode()))
396 return std::max(WaitStates, checkRFEHazards(MI));
397
398 if ((ST.hasReadM0MovRelInterpHazard() &&
399 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
400 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
401 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
402 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
403 (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
404 (ST.hasReadM0LdsDirectHazard() &&
405 MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
406 return std::max(WaitStates, checkReadM0Hazards(MI));
407
409 return std::max(WaitStates, checkMAIHazards(MI));
410
412 return std::max(WaitStates, checkMAILdStHazards(MI));
413
414 if (ST.hasGFX950Insts() && isPermlane(*MI))
415 return std::max(WaitStates, checkPermlaneHazards(MI));
416
417 return WaitStates;
418}
419
421 EmittedInstrs.push_front(nullptr);
422}
423
425 // When the scheduler detects a stall, it will call AdvanceCycle() without
426 // emitting any instructions.
427 if (!CurrCycleInstr) {
428 EmittedInstrs.push_front(nullptr);
429
430 if (HasPendingWMMACoexecHazard)
431 EmittedVALUInstrs.push_front(nullptr);
432 return;
433 }
434
435 HasPendingWMMACoexecHazard = false;
436
437 if (CurrCycleInstr->isBundle()) {
438 processBundle();
439 return;
440 }
441
442 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
443 if (!NumWaitStates) {
444 CurrCycleInstr = nullptr;
445 return;
446 }
447
448 // Keep track of emitted instructions
449 EmittedInstrs.push_front(CurrCycleInstr);
450
451 bool IsVALUOrWMMA =
452 SIInstrInfo::isVALU(*CurrCycleInstr, /*AllowLDSDMA=*/true) ||
453 SIInstrInfo::isWMMA(*CurrCycleInstr) ||
454 SIInstrInfo::isSWMMAC(*CurrCycleInstr);
455 if (IsVALUOrWMMA) {
456 EmittedVALUInstrs.push_front(CurrCycleInstr);
457 } else {
458 // A pending WMMA co-execution hazard optimistically records stall cycles as
459 // future V_NOPs. If the scheduler instead stalls for a different
460 // (S_NOP-resolvable) hazard and schedules a non-VALU into those cycles,
461 // they will not resolve the VALU-pipe hazard, so drop them here.
462 while (!EmittedVALUInstrs.empty() && EmittedVALUInstrs.front() == nullptr)
463 EmittedVALUInstrs.pop_front();
464 }
465
466 // Add a nullptr for each additional wait state after the first. Make sure
467 // not to add more than getMaxLookAhead() items to the list, since we
468 // truncate the list to that size right after this loop.
469 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
470 i < e; ++i) {
471 EmittedInstrs.push_front(nullptr);
472 }
473
474 // getMaxLookahead() is the largest number of wait states we will ever need
475 // to insert, so there is no point in keeping track of more than that many
476 // wait states.
477 EmittedInstrs.resize(getMaxLookAhead());
478 if (EmittedVALUInstrs.size() > MaxVALULookAhead)
479 EmittedVALUInstrs.resize(MaxVALULookAhead);
480
481 CurrCycleInstr = nullptr;
482}
483
485 assert(!IsHazardRecognizerMode &&
486 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
487}
488
489//===----------------------------------------------------------------------===//
490// Helper Functions
491//===----------------------------------------------------------------------===//
492
494
495// Search for a hazard in a block and its predecessors.
496template <typename StateT>
497static bool
498hasHazard(StateT InitialState,
499 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
500 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
501 const MachineBasicBlock *InitialMBB,
503 struct StateMapKey {
505 unsigned Idx;
506 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
507 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
508 }
509 };
510 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
511 static unsigned getHashValue(const StateMapKey &Key) {
512 return StateT::getHashValue((*Key.States)[Key.Idx]);
513 }
514 static unsigned getHashValue(const StateT &State) {
515 return StateT::getHashValue(State);
516 }
517 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
518 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
519 }
520 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
521 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
522 }
523 };
524
527
529 const MachineBasicBlock *MBB = InitialMBB;
530 StateT State = InitialState;
531
533 unsigned WorkIdx = 0;
534 for (;;) {
535 bool Expired = false;
536 for (auto E = MBB->instr_rend(); I != E; ++I) {
537 // No need to look at parent BUNDLE instructions.
538 if (I->isBundle())
539 continue;
540
541 auto Result = IsHazard(State, *I);
542 if (Result == HazardFound)
543 return true;
544 if (Result == HazardExpired) {
545 Expired = true;
546 break;
547 }
548
549 if (I->isInlineAsm() || I->isMetaInstruction())
550 continue;
551
552 UpdateState(State, *I);
553 }
554
555 if (!Expired) {
556 unsigned StateIdx = States.size();
557 StateMapKey Key = {&States, StateIdx};
558 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
559 if (Insertion.second) {
560 States.emplace_back(State);
561 } else {
562 StateIdx = Insertion.first->second;
563 }
564 for (MachineBasicBlock *Pred : MBB->predecessors())
565 Worklist.insert(std::pair(Pred, StateIdx));
566 }
567
568 if (WorkIdx == Worklist.size())
569 break;
570
571 unsigned StateIdx;
572 std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
573 State = States[StateIdx];
574 I = MBB->instr_rbegin();
575 }
576
577 return false;
578}
579
580// Returns the minimum number of wait states since \p I, walking all
581// predecessors. Scanning along a path stops once \p IsExpired returns true.
582// Can only be run in hazard recognizer mode.
583static int
585 const MachineBasicBlock *MBB,
587 int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
591 for (auto E = MBB->instr_rend(); I != E; ++I) {
592 // Don't add WaitStates for parent BUNDLE instructions.
593 if (I->isBundle())
594 continue;
595
596 if (IsHazard(*I))
597 return WaitStates;
598
599 if (I->isInlineAsm())
600 continue;
601
602 WaitStates += GetNumWaitStates(*I);
603
604 if (IsExpired(*I, WaitStates))
605 return std::numeric_limits<int>::max();
606 }
607
608 int MinWaitStates = std::numeric_limits<int>::max();
609 for (MachineBasicBlock *Pred : MBB->predecessors()) {
610 if (!Visited.insert(Pred).second)
611 continue;
612
613 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
614 IsExpired, Visited, GetNumWaitStates);
615
616 MinWaitStates = std::min(MinWaitStates, W);
617 }
618
619 return MinWaitStates;
620}
621
622static int
624 const MachineInstr *MI,
629 return getWaitStatesSince(IsHazard, MI->getParent(),
630 std::next(MI->getReverseIterator()), 0, IsExpired,
631 Visited, GetNumWaitStates);
632}
633
634int GCNHazardRecognizer::getWaitStatesSince(
635 IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) const {
636 if (IsHazardRecognizerMode) {
637 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
638 return WaitStates >= Limit;
639 };
640 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
641 GetNumWaitStates);
642 }
643
644 int WaitStates = 0;
645 for (MachineInstr *MI : EmittedInstrs) {
646 if (MI) {
647 if (IsHazard(*MI))
648 return WaitStates;
649
650 if (MI->isInlineAsm())
651 continue;
652 }
653 WaitStates += MI ? GetNumWaitStates(*MI) : 1;
654
655 if (WaitStates >= Limit)
656 break;
657 }
658 return std::numeric_limits<int>::max();
659}
660
661int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
662 int Limit) const {
663 return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
664}
665
666int GCNHazardRecognizer::getWaitStatesSinceVALU(IsHazardFn IsHazard,
667 int Limit) const {
668 if (IsHazardRecognizerMode) {
669 auto GetVALUWaitStates = [](const MachineInstr &MI) -> unsigned {
670 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) ? 1 : 0;
671 };
672 return getWaitStatesSince(IsHazard, Limit, GetVALUWaitStates);
673 }
674
675 // EmittedVALUInstrs is capped at MaxVALULookAhead, so a Limit beyond that
676 // window could miss a hazard. Keep the cap in sync with the wait-state
677 // tables.
678 assert(Limit <= (int)MaxVALULookAhead &&
679 "Limit exceeds the EmittedVALUInstrs lookahead window");
680 int WaitStates = 0;
681 for (MachineInstr *MI : EmittedVALUInstrs) {
682 if (MI) {
683 if (IsHazard(*MI))
684 return WaitStates;
685 }
686
687 ++WaitStates;
688
689 if (WaitStates >= Limit)
690 break;
691 }
692 return std::numeric_limits<int>::max();
693}
694
695int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
696 IsHazardFn IsHazardDef,
697 int Limit) const {
698 const SIRegisterInfo *TRI = ST.getRegisterInfo();
699
700 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
701 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
702 };
703
704 return getWaitStatesSince(IsHazardFn, Limit);
705}
706
707int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
708 int Limit) const {
709 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
710 return isSSetReg(MI.getOpcode()) && IsHazard(MI);
711 };
712
713 return getWaitStatesSince(IsHazardFn, Limit);
714}
715
716//===----------------------------------------------------------------------===//
717// No-op Hazard Detection
718//===----------------------------------------------------------------------===//
719
720static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
721 MCRegister Reg) {
722 for (MCRegUnit Unit : TRI.regunits(Reg))
723 BV.set(static_cast<unsigned>(Unit));
724}
725
726static void addRegsToSet(const SIRegisterInfo &TRI,
728 BitVector &DefSet, BitVector &UseSet) {
729 for (const MachineOperand &Op : Ops) {
730 if (Op.isReg())
731 addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
732 }
733}
734
735void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) const {
736 addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
737}
738
740 return !SIInstrInfo::isSMRD(*MI);
741}
742
744 return !SIInstrInfo::isVMEM(*MI);
745}
746
747int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) const {
748 // SMEM soft clause are only present on VI+, and only matter if xnack is
749 // enabled.
750 if (!ST.isXNACKEnabled())
751 return 0;
752
753 bool IsSMRD = TII.isSMRD(*MEM);
754
755 resetClause();
756
757 // A soft-clause is any group of consecutive SMEM instructions. The
758 // instructions in this group may return out of order and/or may be
759 // replayed (i.e. the same instruction issued more than once).
760 //
761 // In order to handle these situations correctly we need to make sure that
762 // when a clause has more than one instruction, no instruction in the clause
763 // writes to a register that is read by another instruction in the clause
764 // (including itself). If we encounter this situation, we need to break the
765 // clause by inserting a non SMEM instruction.
766
767 for (MachineInstr *MI : EmittedInstrs) {
768 // When we hit a non-SMEM instruction then we have passed the start of the
769 // clause and we can stop.
770 if (!MI)
771 break;
772
774 break;
775
776 addClauseInst(*MI);
777 }
778
779 if (ClauseDefs.none())
780 return 0;
781
782 // We need to make sure not to put loads and stores in the same clause if they
783 // use the same address. For now, just start a new clause whenever we see a
784 // store.
785 if (MEM->mayStore())
786 return 1;
787
788 addClauseInst(*MEM);
789
790 // If the set of defs and uses intersect then we cannot add this instruction
791 // to the clause, so we have a hazard.
792 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
793}
794
795int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) const {
796 int WaitStatesNeeded = 0;
797
798 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
799
800 // This SMRD hazard only affects SI.
801 if (!ST.hasSMRDReadVALUDefHazard())
802 return WaitStatesNeeded;
803
804 // A read of an SGPR by SMRD instruction requires 4 wait states when the
805 // SGPR was written by a VALU instruction.
806 int SmrdSgprWaitStates = 4;
807 auto IsHazardDefFn = [this](const MachineInstr &MI) {
808 return TII.isVALU(MI, /*AllowLDSDMA=*/true);
809 };
810 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
811 return TII.isSALU(MI);
812 };
813
814 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
815
816 for (const MachineOperand &Use : SMRD->uses()) {
817 if (!Use.isReg())
818 continue;
819 int WaitStatesNeededForUse =
820 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
821 SmrdSgprWaitStates);
822 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
823
824 // This fixes what appears to be undocumented hardware behavior in SI where
825 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
826 // needs some number of nops in between. We don't know how many we need, but
827 // let's use 4. This wasn't discovered before probably because the only
828 // case when this happens is when we expand a 64-bit pointer into a full
829 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
830 // probably never encountered in the closed-source land.
831 if (IsBufferSMRD) {
832 int WaitStatesNeededForUse =
833 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
834 IsBufferHazardDefFn,
835 SmrdSgprWaitStates);
836 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
837 }
838 }
839
840 return WaitStatesNeeded;
841}
842
843int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) const {
844 if (!ST.hasVMEMReadSGPRVALUDefHazard())
845 return 0;
846
847 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
848
849 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
850 // SGPR was written by a VALU Instruction.
851 const int VmemSgprWaitStates = 5;
852 auto IsHazardDefFn = [this](const MachineInstr &MI) {
853 return TII.isVALU(MI, /*AllowLDSDMA=*/true);
854 };
855 for (const MachineOperand &Use : VMEM->uses()) {
856 if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
857 continue;
858
859 int WaitStatesNeededForUse =
860 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
861 VmemSgprWaitStates);
862 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
863 }
864 return WaitStatesNeeded;
865}
866
867int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) const {
868 const SIRegisterInfo *TRI = ST.getRegisterInfo();
869 const SIInstrInfo *TII = ST.getInstrInfo();
870
871 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
872 int DppVgprWaitStates = 2;
873 int DppExecWaitStates = 5;
874 int WaitStatesNeeded = 0;
875 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
876 return TII->isVALU(MI, /*AllowLDSDMA=*/true);
877 };
878
879 for (const MachineOperand &Use : DPP->uses()) {
880 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
881 continue;
882 int WaitStatesNeededForUse =
883 DppVgprWaitStates - getWaitStatesSinceDef(
884 Use.getReg(),
885 [](const MachineInstr &) { return true; },
886 DppVgprWaitStates);
887 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
888 }
889
890 WaitStatesNeeded = std::max(
891 WaitStatesNeeded,
892 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
893 DppExecWaitStates));
894
895 return WaitStatesNeeded;
896}
897
898int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) const {
899 const SIInstrInfo *TII = ST.getInstrInfo();
900
901 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
902 // instruction.
903 const int DivFMasWaitStates = 4;
904 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
905 return TII->isVALU(MI, /*AllowLDSDMA=*/true);
906 };
907 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
908 DivFMasWaitStates);
909
910 return DivFMasWaitStates - WaitStatesNeeded;
911}
912
913int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) const {
914 const SIInstrInfo *TII = ST.getInstrInfo();
915 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
916
917 const int GetRegWaitStates = 2;
918 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
919 return GetRegHWReg == getHWReg(TII, MI);
920 };
921 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
922
923 return GetRegWaitStates - WaitStatesNeeded;
924}
925
926int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) const {
927 const SIInstrInfo *TII = ST.getInstrInfo();
928 unsigned HWReg = getHWReg(TII, *SetRegInstr);
929
930 const int SetRegWaitStates = ST.getSetRegWaitStates();
931 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
932 return HWReg == getHWReg(TII, MI);
933 };
934 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
935 return SetRegWaitStates - WaitStatesNeeded;
936}
937
938int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
939 if (!MI.mayStore())
940 return -1;
941
942 const SIInstrInfo *TII = ST.getInstrInfo();
943 unsigned Opcode = MI.getOpcode();
944 const MCInstrDesc &Desc = MI.getDesc();
945
946 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
947 int VDataRCID = -1;
948 if (VDataIdx != -1)
949 VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);
950
951 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
952 // There is no hazard if the instruction does not use vector regs
953 // (like wbinvl1)
954 if (VDataIdx == -1)
955 return -1;
956 if (AMDGPU::getRegBitWidth(VDataRCID) > 64) {
957 // When SOFFSET-dependent wide-store windows apply, the BUFFER_STORE
958 // source-vgpr WAR hazard exists for every SOFFSET shape; the wait-state
959 // count differs by SOFFSET and is computed in checkVALUHazardsHelper.
960 // Otherwise the hazard only exists if soffset is not an SGPR.
961 if (ST.hasVDecCoExecHazard())
962 return VDataIdx;
963 const MachineOperand *SOffset =
964 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
965 if (!SOffset || !SOffset->isReg())
966 return VDataIdx;
967 }
968 }
969
970 // MIMG instructions create a hazard if they don't use a 256-bit T# and
971 // the store size is greater than 8 bytes and they have more than two bits
972 // of their dmask set.
973 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
974 if (TII->isMIMG(MI)) {
975 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
976 assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
977 Desc.operands()[SRsrcIdx])) == 256);
978 (void)SRsrcIdx;
979 }
980
981 if (TII->isFLAT(MI)) {
982 // There is no hazard if the instruction does not use vector regs
983 if (VDataIdx == -1)
984 return -1;
985
986 if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
987 return VDataIdx;
988 }
989
990 return -1;
991}
992
993int GCNHazardRecognizer::checkUniformWindowVALUHazardsHelper(
994 Register Reg) const {
995 // Wide stores need a single wait-state bubble before a VALU that overwrites
996 // store data. createsVALUHazard already excludes MUBUF/MTBUF stores with an
997 // SGPR SOFFSET.
998 const SIRegisterInfo *TRI = ST.getRegisterInfo();
999
1000 auto IsHazard = [&](const MachineInstr &MI) {
1001 int DataIdx = createsVALUHazard(MI);
1002 return DataIdx >= 0 &&
1003 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
1004 };
1005
1006 return std::max(0, 1 - getWaitStatesSince(IsHazard, /*Limit=*/1));
1007}
1008
1009int GCNHazardRecognizer::checkSOFFSETWindowVALUHazardsHelper(
1010 Register Reg) const {
1011 // The required wait-state window depends on the producer's SOFFSET shape:
1012 // - MUBUF/MTBUF wide store with sgpr SOFFSET: 1 wait state.
1013 // - MUBUF/MTBUF wide store with literal/absent SOFFSET, and FLAT wide
1014 // store: 2 wait states.
1015 // The 1-cycle sgpr-SOFFSET window was measured on gfx950.
1016 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1017 const SIInstrInfo *TII = ST.getInstrInfo();
1018
1019 int WaitStatesNeeded = 0;
1020
1021 // Scan each wait-state window separately and take the max padding needed.
1022 // getWaitStatesSince supplies the minimum distance to a producer over paths.
1023 for (int Window = 1; Window <= 2; ++Window) {
1024 auto IsHazard = [&](const MachineInstr &MI) {
1025 int DataIdx = createsVALUHazard(MI);
1026 if (DataIdx < 0 ||
1027 !TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg))
1028 return false;
1029
1030 // Window 1 matches every hazard producer. Window 2 excludes BUF stores
1031 // with an SGPR SOFFSET, which only require a single wait state.
1032 if (Window == 1 || !TII->isBUF(MI))
1033 return true;
1034
1035 const MachineOperand *SOffset =
1036 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1037 return !SOffset || !SOffset->isReg();
1038 };
1039 WaitStatesNeeded = std::max(WaitStatesNeeded,
1040 Window - getWaitStatesSince(IsHazard, Window));
1041 }
1042
1043 return WaitStatesNeeded;
1044}
1045
1046int GCNHazardRecognizer::checkVALUHazardsHelper(
1047 const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
1048 // Helper to check for the hazard where VMEM instructions that store more
1049 // than 8 bytes can have their store data overwritten by the next
1050 // instruction.
1051 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1052
1053 if (!TRI->isVectorRegister(MRI, Def.getReg()))
1054 return 0;
1055
1056 if (ST.hasVDecCoExecHazard())
1057 return checkSOFFSETWindowVALUHazardsHelper(Def.getReg());
1058
1059 return checkUniformWindowVALUHazardsHelper(Def.getReg());
1060}
1061
1062/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
1063/// pack the computed value into correct bit position of the dest register. This
1064/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
1065/// dst_sel that is not aligned to the register. This function analyzes the \p
1066/// MI and \returns an operand with dst forwarding issue, or nullptr if
1067/// none exists.
/// NOTE(review): the line carrying the function name and parameter list, and
/// the lines holding the modifier masks for types 2 and 3, are absent from
/// this listing; the body refers to parameters named MI and ST.
1068static const MachineOperand *
 // Anything that is not a VALU (LDS DMA is accepted) yields nullptr.
1070 if (!SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true))
1071 return nullptr;
1072
1073 const SIInstrInfo *TII = ST.getInstrInfo();
1074
1075 unsigned Opcode = MI.getOpcode();
1076
1077 // There are three different types of instructions
1078 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
1079 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
1080 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
1081 // op_sel[3:2]
1082 // != 0
1083 if (SIInstrInfo::isSDWA(MI)) {
1084 // Type 1: SDWA with dst_sel != DWORD
1085 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
1086 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
1087 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1088 }
1089
 // Despite the Is* name this holds an FPType enumerator (compared against
 // FP8 and FP4 below), not a boolean.
1090 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
1091 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
1092 // Type 2: VOP3 which write the hi bits
1093 if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
1095 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1096
1097 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
1098 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
1099 (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
1101 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1102 }
1103
1104 // Special case: nop is required for all the opsel values for fp4 sr variant
1105 // cvt scale instructions
1106 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
1107 return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1108
1109 return nullptr;
1110}
1111
1112/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
1113/// forwarding issue \p Dst . We may "consume" the Dst via a standard explicit
1114/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
/// \returns true if any register operand of the checked instruction overlaps
/// the register of \p Dst.
/// NOTE(review): the line with the function name and first parameter is absent
/// from this listing; the body calls that parameter VALU.
1116 const MachineOperand *Dst,
1117 const SIRegisterInfo *TRI) {
1118 // We must consider implicit reads of the VALU. SDWA with dst_sel and
1119 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
1120 // and we must account for that hazard.
1121 // We also must account for WAW hazards. In particular, WAW with dest
1122 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1123 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1124 // check for ECC. Without accounting for this hazard, the ECC will be
1125 // wrong.
1126 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1127 // complete zeroesHigh16BitsOfDest)
 // operands() is walked without filtering on use/def, so both reads and
 // writes of an overlapping register count as consuming Dst.
1128 for (auto &Operand : VALU->operands()) {
1129 if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1130 return true;
1131 }
1132 }
1133 return false;
1134}
1135
// Returns the largest number of wait states required before \p VALU across
// the subtarget-gated checks below: trans forwarding, dst_sel / cvt-scale
// forwarding, the SGPR / VCC / readlane / EXEC checks enabled by
// hasVDecCoExecHazard(), and the wide-store data hazard.
// NOTE(review): a few lines of this function are absent from this listing
// (gaps in the embedded numbering); the comments describe only what is
// visible.
1136int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) const {
1137 int WaitStatesNeeded = 0;
1138
 // Trans forwarding: only checked when VALU itself is not a TRANS instruction.
1139 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
1140 const int TransDefWaitstates = 1;
1141
1142 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
1144 return false;
1145 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1146 const SIInstrInfo *TII = ST.getInstrInfo();
1147 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
1148
 // Hazard when any explicit register use of VALU overlaps MI's vdst.
1149 for (const MachineOperand &Use : VALU->explicit_uses()) {
1150 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
1151 return true;
1152 }
1153
1154 return false;
1155 };
1156
1157 int WaitStatesNeededForDef =
1158 TransDefWaitstates -
1159 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1160 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1161 }
1162
 // Dst-sel / cvt-scale forwarding: one wait state after a producer whose
 // forwarded destination is consumed by VALU.
1163 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1164 const int Shift16DefWaitstates = 1;
1165
1166 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1167 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1168 const MachineOperand *ForwardedDst =
1169 getDstSelForwardingOperand(ProducerMI, ST);
1170 if (ForwardedDst) {
1171 return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
1172 }
1173
1174 if (ProducerMI.isInlineAsm()) {
1175 // Assume inline asm has dst forwarding hazard
1176 for (auto &Def : ProducerMI.all_defs()) {
1177 if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
1178 return true;
1179 }
1180 }
1181
1182 return false;
1183 };
1184
1185 int WaitStatesNeededForDef =
1186 Shift16DefWaitstates -
1187 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1188 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1189 }
1190
1191 if (ST.hasVDecCoExecHazard()) {
1192 const int VALUWriteSGPRVALUReadWaitstates = 2;
1193 const int VALUWriteEXECRWLane = 4;
1194 const int VALUWriteVGPRReadlaneRead = 1;
1195
1196 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1197 const MachineRegisterInfo &MRI = MF.getRegInfo();
 // UseReg is captured by reference, so assigning it before each query below
 // re-targets this one predicate. Its declaration is on a line not present
 // in this listing.
1199 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1200 if (!SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true))
1201 return false;
1202 return MI.modifiesRegister(UseReg, TRI);
1203 };
1204
 // Explicit SGPR uses of VALU that were written by a recent VALU.
1205 for (const MachineOperand &Use : VALU->explicit_uses()) {
1206 if (!Use.isReg())
1207 continue;
1208
1209 UseReg = Use.getReg();
1210 if (TRI->isSGPRReg(MRI, UseReg)) {
1211 int WaitStatesNeededForDef =
1212 VALUWriteSGPRVALUReadWaitstates -
1213 getWaitStatesSince(IsVALUDefSGPRFn,
1214 VALUWriteSGPRVALUReadWaitstates);
1215 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1216 }
1217 }
1218
 // VCC reads (readsRegister also covers implicit uses) get the same window.
1219 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
1220 UseReg = AMDGPU::VCC;
1221 int WaitStatesNeededForDef =
1222 VALUWriteSGPRVALUReadWaitstates -
1223 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1224 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1225 }
1226
1227 switch (VALU->getOpcode()) {
1228 case AMDGPU::V_READLANE_B32:
1229 case AMDGPU::V_READFIRSTLANE_B32: {
 // The predicate is reused with UseReg set to the src0 register.
1230 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1231 UseReg = Src->getReg();
1232 int WaitStatesNeededForDef =
1233 VALUWriteVGPRReadlaneRead -
1234 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1235 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1236 }
 // Read-lane opcodes fall through to the EXEC check applied to V_WRITELANE_B32.
1237 [[fallthrough]];
1238 case AMDGPU::V_WRITELANE_B32: {
1239 UseReg = AMDGPU::EXEC;
1240 int WaitStatesNeededForDef =
1241 VALUWriteEXECRWLane -
1242 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1243 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1244 break;
1245 }
1246 default:
1247 break;
1248 }
1249 }
1250
1251 // This checks for the hazard where VMEM instructions that store more than
1252 // 8 bytes can have their store data overwritten by the next instruction.
1253 if (!ST.has12DWordStoreHazard())
1254 return WaitStatesNeeded;
1255
1256 const MachineRegisterInfo &MRI = MF.getRegInfo();
1257
1258 for (const MachineOperand &Def : VALU->defs()) {
1259 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1260 }
1261
1262 return WaitStatesNeeded;
1263}
1264
// Returns the number of wait states required before the inline asm statement
// \p IA, combining the wide-store data hazard on its vector register defs with
// the dst_sel forwarding hazard.
// NOTE(review): the line naming the operand range iterated by the first loop
// is absent from this listing.
1265int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) const {
1266 // This checks for hazards associated with inline asm statements.
1267 // Since inline asms can contain just about anything, we use this
1268 // to call/leverage other check*Hazard routines. Note that
1269 // this function doesn't attempt to address all possible inline asm
1270 // hazards (good luck), but is a collection of what has been
1271 // problematic thus far.
1272
1273 // see checkVALUHazards()
 // NOTE(review): this early-out also consults hasCvtScaleForwardingHazard(),
 // but the forwarding check further down is gated on
 // hasDstSelForwardingHazard() alone, whereas checkVALUHazards() tests either
 // predicate. Confirm whether a cvt-scale-only subtarget should run it too.
1274 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1275 !ST.hasCvtScaleForwardingHazard())
1276 return 0;
1277
1278 const MachineRegisterInfo &MRI = MF.getRegInfo();
1279 int WaitStatesNeeded = 0;
1280
1281 for (const MachineOperand &Op :
1283 if (Op.isReg() && Op.isDef()) {
 // Only vector register defs are checked for the wide-store hazard.
1284 if (!TRI.isVectorRegister(MRI, Op.getReg()))
1285 continue;
1286
1287 if (ST.has12DWordStoreHazard()) {
1288 WaitStatesNeeded =
1289 std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1290 }
1291 }
1292 }
1293
1294 if (ST.hasDstSelForwardingHazard()) {
1295 const int Shift16DefWaitstates = 1;
1296
1297 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1298 const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1299 // Assume inline asm reads the dst
1300 if (Dst)
1301 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1302 IA->readsRegister(Dst->getReg(), &TRI);
1303
1304 if (ProducerMI.isInlineAsm()) {
1305 // If MI is inline asm, assume it has dst forwarding hazard
1306 for (auto &Def : ProducerMI.all_defs()) {
1307 if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1308 IA->readsRegister(Def.getReg(), &TRI)) {
1309 return true;
1310 }
1311 }
1312 }
1313
1314 return false;
1315 };
1316
1317 int WaitStatesNeededForDef =
1318 Shift16DefWaitstates -
1319 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1320 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1321 }
1322
1323 return WaitStatesNeeded;
1324}
1325
1326int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) const {
1327 const SIInstrInfo *TII = ST.getInstrInfo();
1328 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1329 const MachineRegisterInfo &MRI = MF.getRegInfo();
1330
1331 const MachineOperand *LaneSelectOp =
1332 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1333
1334 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1335 return 0;
1336
1337 Register LaneSelectReg = LaneSelectOp->getReg();
1338 auto IsHazardFn = [TII](const MachineInstr &MI) {
1339 return TII->isVALU(MI, /*AllowLDSDMA=*/true);
1340 };
1341
1342 const int RWLaneWaitStates = 4;
1343 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1344 RWLaneWaitStates);
1345 return RWLaneWaitStates - WaitStatesSince;
1346}
1347
1348int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) const {
1349 if (!ST.hasRFEHazards())
1350 return 0;
1351
1352 const SIInstrInfo *TII = ST.getInstrInfo();
1353
1354 const int RFEWaitStates = 1;
1355
1356 auto IsHazardFn = [TII](const MachineInstr &MI) {
1357 return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1358 };
1359 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1360 return RFEWaitStates - WaitStatesNeeded;
1361}
1362
1363int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) const {
1364 const SIInstrInfo *TII = ST.getInstrInfo();
1365 const int ReadM0WaitStates = 1;
1366 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1367 return ReadM0WaitStates -
1368 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1369}
1370
// Inserts \p WaitStatesNeeded V_NOP_e32 instructions into \p MBB before
// InsertPt. When \p IsHoisting is set the nops get an empty DebugLoc instead
// of the debug location of the instruction at InsertPt.
// NOTE(review): the parameter line declaring InsertPt is absent from this
// listing.
1371void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
1373 int WaitStatesNeeded, bool IsHoisting) {
1374 const DebugLoc &DL = IsHoisting ? DebugLoc() : InsertPt->getDebugLoc();
 // A non-positive count inserts nothing.
1375 for (int I = 0; I < WaitStatesNeeded; ++I)
1376 BuildMI(MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
1377}
1378
// Runs the fix* routines on \p MI in a fixed order. Their boolean results are
// ignored. Some calls are unconditional; the rest are gated here on a
// subtarget query.
1379void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1380 fixVMEMtoScalarWriteHazards(MI);
1381 fixVcmpxPermlaneHazards(MI);
1382 fixSMEMtoVectorWriteHazards(MI);
1383 fixVcmpxExecWARHazard(MI);
1384 fixLdsBranchVmemWARHazard(MI);
 // Both LDS-direct fixes share one subtarget gate.
1385 if (ST.hasLdsDirect()) {
1386 fixLdsDirectVALUHazard(MI);
1387 fixLdsDirectVMEMHazard(MI);
1388 }
1389 fixVALUPartialForwardingHazard(MI);
1390 fixVALUTransUseHazard(MI);
1391 fixVALUTransCoexecutionHazards(MI);
1392 fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1393 fixWMMACoexecutionHazards(MI);
1394 fixShift64HighRegBug(MI);
1395 fixVALUMaskWriteHazard(MI);
1396 fixRequiredExportPriority(MI);
1397 if (ST.requiresWaitIdleBeforeGetReg())
1398 fixGetRegWaitIdle(MI);
1399 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1400 fixDsAtomicAsyncBarrierArriveB64(MI);
1401 if (ST.hasScratchBaseForwardingHazard())
1402 fixScratchBaseForwardingHazard(MI);
1403 if (ST.setRegModeNeedsVNOPs())
1404 fixSetRegMode(MI);
1405}
1406
// Returns true when \p MI is a VOPC instruction, or a compare encoded as VOP3
// or SDWA, that also modifies EXEC.
// NOTE(review): the line with the function name and leading parameters is
// absent from this listing; the body uses TII and TRI by reference.
1408 const MachineInstr &MI) {
1409 return (TII.isVOPC(MI) ||
1410 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1411 MI.modifiesRegister(AMDGPU::EXEC, &TRI);
1412}
1413
// Breaks the hazard between a V_CMPX that writes EXEC and the permlane
// instruction \p MI by inserting a V_MOV_B32 before \p MI.
// \returns true if an instruction was inserted, false otherwise.
// NOTE(review): the lines adding the operands of the inserted V_MOV_B32 are
// absent from this listing.
1414bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1415 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1416 return false;
1417
1418 const SIInstrInfo *TII = ST.getInstrInfo();
1419 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1420 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1421 return isVCmpXWritesExec(*TII, *TRI, MI);
1422 };
1423
 // Any VALU other than the V_NOP encodings ends the search.
1424 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1425 unsigned Opc = MI.getOpcode();
1426 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) &&
1427 Opc != AMDGPU::V_NOP_e32 && Opc != AMDGPU::V_NOP_e64 &&
1428 Opc != AMDGPU::V_NOP_sdwa;
1429 };
1430
 // The maximum int value is treated as "no hazard found".
1431 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1432 std::numeric_limits<int>::max())
1433 return false;
1434
1435 // V_NOP will be discarded by SQ.
1436 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1437 // which is always a VGPR and available.
1438 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1439 Register Reg = Src0->getReg();
1440 bool IsUndef = Src0->isUndef();
1441 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1442 TII->get(AMDGPU::V_MOV_B32_e32))
1445
1446 return true;
1447}
1448
// Inserts an S_WAITCNT_DEPCTR before \p MI when an earlier instruction
// accepted by the hazard predicate uses a register that \p MI defines and
// nothing in between has expired the hazard.
// \returns true if the wait was inserted, false otherwise.
// NOTE(review): three lines are absent from this listing: the condition on
// \p MI guarding the second early return, the first condition inside
// IsHazardFn, and the immediate operand of the inserted instruction.
1449bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1450 if (!ST.hasVMEMtoScalarWriteHazard())
1451 return false;
1452 assert(!ST.hasExtendedWaitCounts());
1453
1455 return false;
1456
 // Without defs there is nothing that could be overwritten.
1457 if (MI->getNumDefs() == 0)
1458 return false;
1459
1460 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1461
1462 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1464 return false;
1465
 // Hazard if I has a use operand for any register defined by MI.
1466 for (const MachineOperand &Def : MI->defs()) {
1467 const MachineOperand *Op =
1468 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1469 if (!Op)
1470 continue;
1471 return true;
1472 }
1473 return false;
1474 };
1475
 // Expired by any VALU, by an S_WAITCNT whose immediate is 0, or by an
 // S_WAITCNT_DEPCTR whose decoded vm_vsrc field is 0.
1476 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1477 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) ||
1478 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1479 !MI.getOperand(0).getImm()) ||
1480 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1481 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1482 };
1483
1484 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1485 std::numeric_limits<int>::max())
1486 return false;
1487
1488 const SIInstrInfo *TII = ST.getInstrInfo();
1489 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1490 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1492 return true;
1493}
1494
// Handles the hazard where the VALU \p MI writes an SGPR that an earlier SMRD
// instruction reads. When such an SMRD is found and nothing in between has
// mitigated the hazard, an "S_MOV_B32 SGPR_NULL, 0" is inserted before \p MI.
// \returns true if the instruction was inserted, false otherwise.
1495bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1496 if (!ST.hasSMEMtoVectorWriteHazard())
1497 return false;
1498 assert(!ST.hasExtendedWaitCounts());
1499
1500 if (!SIInstrInfo::isVALU(*MI, /*AllowLDSDMA=*/true))
1501 return false;
1502
 // For the read-lane opcodes the destination is looked up under the name
 // vdst; every other opcode uses sdst.
1503 AMDGPU::OpName SDSTName;
1504 switch (MI->getOpcode()) {
1505 case AMDGPU::V_READLANE_B32:
1506 case AMDGPU::V_READFIRSTLANE_B32:
1507 SDSTName = AMDGPU::OpName::vdst;
1508 break;
1509 default:
1510 SDSTName = AMDGPU::OpName::sdst;
1511 break;
1512 }
1513
1514 const SIInstrInfo *TII = ST.getInstrInfo();
1515 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1516 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1517 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
 // No named destination: fall back to the first implicit def whose physical
 // register base class is an SGPR class.
1518 if (!SDST) {
1519 for (const auto &MO : MI->implicit_operands()) {
1520 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1521 SDST = &MO;
1522 break;
1523 }
1524 }
1525 }
1526
1527 if (!SDST)
1528 return false;
1529
1530 const Register SDSTReg = SDST->getReg();
1531 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1532 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1533 };
1534
 // Only SALU instructions can expire the hazard; anything else returns false.
1535 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1536 if (TII->isSALU(MI)) {
1537 switch (MI.getOpcode()) {
1538 case AMDGPU::S_SETVSKIP:
1539 case AMDGPU::S_VERSION:
1540 case AMDGPU::S_WAITCNT_VSCNT:
1541 case AMDGPU::S_WAITCNT_VMCNT:
1542 case AMDGPU::S_WAITCNT_EXPCNT:
1543 // These instructions cannot mitigate the hazard.
1544 return false;
1545 case AMDGPU::S_WAITCNT_LGKMCNT:
1546 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1547 return (MI.getOperand(1).getImm() == 0) &&
1548 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1549 case AMDGPU::S_WAITCNT: {
1550 const int64_t Imm = MI.getOperand(0).getImm();
1551 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1552 // DsCnt corresponds to LGKMCnt here.
1553 return Decoded.get(AMDGPU::DS_CNT) == 0;
1554 }
1555 default:
1556 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1557 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1558 "unexpected wait count instruction");
1559 // SOPP instructions cannot mitigate the hazard.
1560 if (TII->isSOPP(MI))
1561 return false;
1562 // At this point the SALU can be assumed to mitigate the hazard
1563 // because either:
1564 // (a) it is independent of the at risk SMEM (breaking chain),
1565 // or
1566 // (b) it is dependent on the SMEM, in which case an appropriate
1567 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1568 // SMEM instruction.
1569 return true;
1570 }
1571 }
1572 return false;
1573 };
1574
1575 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1576 std::numeric_limits<int>::max())
1577 return false;
1578
1579 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1580 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1581 .addImm(0);
1582 return true;
1583}
1584
// Handles the write-after-read hazard on EXEC: \p MI is a VALU that modifies
// EXEC and an earlier non-VALU instruction reads EXEC. If the hazard has not
// expired, an S_WAITCNT_DEPCTR is inserted before \p MI.
// \returns true if the wait was inserted, false otherwise.
// NOTE(review): the line adding the immediate of the inserted instruction is
// absent from this listing.
1585bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1586 if (!ST.hasVcmpxExecWARHazard())
1587 return false;
1588 assert(!ST.hasExtendedWaitCounts());
1589
1590 if (!SIInstrInfo::isVALU(*MI, /*AllowLDSDMA=*/true))
1591 return false;
1592
1593 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1594 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1595 return false;
1596
 // Only non-VALU readers of EXEC count as hazard producers.
1597 auto IsHazardFn = [TRI](const MachineInstr &I) {
1598 if (SIInstrInfo::isVALU(I, /*AllowLDSDMA=*/true))
1599 return false;
1600 return I.readsRegister(AMDGPU::EXEC, TRI);
1601 };
1602
1603 const SIInstrInfo *TII = ST.getInstrInfo();
 // Expired by a VALU that has an sdst operand or an implicit SGPR-class def,
 // or by an S_WAITCNT_DEPCTR whose decoded sa_sdst field is 0.
1604 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1605 if (SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true)) {
1606 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1607 return true;
 // NOTE(review): the loop variable is taken by value, copying each
 // MachineOperand; a const reference would avoid the copy.
1608 for (auto MO : MI.implicit_operands())
1609 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1610 return true;
1611 }
1612 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1613 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1614 return true;
1615 return false;
1616 };
1617
1618 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1619 std::numeric_limits<int>::max())
1620 return false;
1621
1622 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1623 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1625 return true;
1626}
1627
// Decides whether the LDS-branch-VMEM WAR fixup needs to run for a function:
// the subtarget must have the hazard and the function must contain both kinds
// of instruction tracked by HasLds and HasVmem.
// NOTE(review): the line with the function name and first parameter, and the
// line that updates HasLds, are absent from this listing; the body iterates a
// function-like object named MF.
1629 const GCNSubtarget &ST) {
1630 if (!ST.hasLdsBranchVmemWARHazard())
1631 return false;
1632
1633 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1634 // instructions need to appear in the same function.
1635 bool HasLds = false;
1636 bool HasVmem = false;
1637 for (auto &MBB : MF) {
1638 for (auto &MI : MBB) {
1640 HasVmem |= SIInstrInfo::isVMEM(MI);
 // Stop scanning as soon as both kinds have been seen.
1641 if (HasLds && HasVmem)
1642 return true;
1643 }
1644 }
1645 return false;
1646}
1647
// Returns true when \p I is an S_WAITCNT_VSCNT whose register operand is
// SGPR_NULL and whose immediate is 0.
// NOTE(review): the signature line is absent from this listing; presumably
// this is the isStoreCountWaitZero helper called by fixLdsBranchVmemWARHazard.
1649 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1650 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1651 !I.getOperand(1).getImm();
1652}
1653
// Handles the hazard pattern "instruction of one kind, branch, instruction of
// the other kind", where the two kinds are the values 1 and 2 produced by
// IsHazardInst. When the pattern reaches \p MI without an intervening
// zero store-count wait, "S_WAITCNT_VSCNT SGPR_NULL, 0" is inserted before
// \p MI.
// \returns true if the wait was inserted, false otherwise.
// NOTE(review): the two conditions inside IsHazardInst that select the values
// 1 and 2 are absent from this listing.
1654bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1655 if (!RunLdsBranchVmemWARHazardFixup)
1656 return false;
1657
1658 assert(ST.hasLdsBranchVmemWARHazard());
1659 assert(!ST.hasExtendedWaitCounts());
1660
 // Classifies an instruction: 0 means not involved in the hazard.
1661 auto IsHazardInst = [](const MachineInstr &MI) {
1663 return 1;
1665 return 2;
1666 return 0;
1667 };
1668
1669 auto InstType = IsHazardInst(*MI);
1670 if (!InstType)
1671 return false;
1672
 // The outer search stops at any classified instruction or at a zero
 // store-count wait.
1673 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1674 return IsHazardInst(I) || isStoreCountWaitZero(I);
1675 };
1676
 // The outer search looks for a branch. From that branch a nested backward
 // search looks for an instruction of the other kind than MI.
1677 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1678 if (!I.isBranch())
1679 return false;
1680
1681 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1682 auto InstType2 = IsHazardInst(I);
1683 return InstType2 && InstType != InstType2;
1684 };
1685
 // The nested search stops at an instruction of MI's own kind or at a zero
 // store-count wait.
1686 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1687 auto InstType2 = IsHazardInst(I);
1688 if (InstType == InstType2)
1689 return true;
1690
1691 return isStoreCountWaitZero(I);
1692 };
1693
1694 return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1695 std::numeric_limits<int>::max();
1696 };
1697
1698 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1699 std::numeric_limits<int>::max())
1700 return false;
1701
1702 const SIInstrInfo *TII = ST.getInstrInfo();
1703 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1704 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1705 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1706 .addImm(0);
1707
1708 return true;
1709}
1710
// Sets the waitvdst operand of \p MI from the number of VALU instructions
// counted back to the nearest VALU that reads or writes MI's vdst register.
// The value is capped at 15 and forced to 0 if a TRANS instruction was visited
// during the search.
// \returns true if the operand was updated, false on the early exit.
// NOTE(review): the guard condition at the top and the remainder of
// IsExpiredFn are absent from this listing.
1711bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1713 return false;
1714
1715 const int NoHazardWaitStates = 15;
1716 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1717 const Register VDSTReg = VDST->getReg();
1718
 // VisitedTrans is updated as a side effect of the hazard predicate, so it
 // records every VALU the search inspected, not only the matching one.
1719 bool VisitedTrans = false;
1720 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1721 if (!SIInstrInfo::isVALU(I, /*AllowLDSDMA=*/true))
1722 return false;
1723 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1724 // Cover both WAR and WAW
1725 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1726 };
1727 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1728 if (WaitStates >= NoHazardWaitStates)
1729 return true;
1730 // Instructions which cause va_vdst==0 expire hazard
1733 };
 // Only VALU instructions contribute to the count.
1734 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1735 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) ? 1 : 0;
1736 };
1737
1738 DenseSet<const MachineBasicBlock *> Visited;
1739 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1740 std::next(MI->getReverseIterator()), 0,
1741 IsExpiredFn, Visited, GetWaitStatesFn);
1742
1743 // Transcendentals can execute in parallel to other VALUs.
1744 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1745 if (VisitedTrans)
1746 Count = 0;
1747
1748 MachineOperand *WaitVdstOp =
1749 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1750 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1751
1752 return true;
1753}
1754
// Handles the hazard between an earlier instruction accepted by IsHazardFn
// that reads or writes MI's vdst register and \p MI itself. The fix is either
// setting MI's waitvsrc operand to 0 (when the subtarget reports
// hasLdsWaitVMSRC()) or inserting an S_WAITCNT_DEPCTR before \p MI.
// \returns true if a fix was applied, false otherwise.
// NOTE(review): four lines are absent from this listing: the guard at the top,
// the first condition of IsHazardFn, one alternative of IsExpiredFn, and the
// immediate of the inserted instruction.
1755bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1757 return false;
1758
1759 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1760 const Register VDSTReg = VDST->getReg();
1761
1762 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1764 return false;
1765 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1766 };
1767 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1768 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1769 // according to the type of VMEM instruction.
 // Among the visible alternatives: any VALU, an S_WAITCNT with immediate 0, an
 // S_WAITCNT_DEPCTR whose decoded vm_vsrc field is 0, or (when LdsdirCanWait)
 // an LDSDIR instruction whose waitvsrc operand is 0.
1770 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1771 return SIInstrInfo::isVALU(I, /*AllowLDSDMA=*/true) ||
1773 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1774 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1775 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1776 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1777 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1778 };
1779
1780 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1781 std::numeric_limits<int>::max())
1782 return false;
1783
 // Prefer the operand on MI itself; otherwise insert a separate wait.
1784 if (LdsdirCanWait) {
1785 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1786 } else {
1787 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1788 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1790 }
1791
1792 return true;
1793}
1794
// Detects the VALU partial forwarding pattern described in the body (a VALU
// def, an SALU write of EXEC, another VALU def, then \p MI using both defs,
// all within bounded VALU distances) and inserts an S_WAITCNT_DEPCTR before
// \p MI when it is found. Only wave64 VALU instructions with at least two
// distinct VGPR sources are considered.
// \returns true if the wait was inserted, false otherwise.
// NOTE(review): part of the va_vdst expiry condition and the immediate of the
// inserted instruction are absent from this listing.
1795bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1796 if (!ST.hasVALUPartialForwardingHazard())
1797 return false;
1798 assert(!ST.hasExtendedWaitCounts());
1799
1800 if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI, /*AllowLDSDMA=*/true))
1801 return false;
1802
1803 SmallSetVector<Register, 4> SrcVGPRs;
1804
1805 for (const MachineOperand &Use : MI->explicit_uses()) {
1806 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1807 SrcVGPRs.insert(Use.getReg());
1808 }
1809
1810 // Only applies with >= 2 unique VGPR sources
1811 if (SrcVGPRs.size() <= 1)
1812 return false;
1813
1814 // Look for the following pattern:
1815 // Va <- VALU [PreExecPos]
1816 // intv1
1817 // Exec <- SALU [ExecPos]
1818 // intv2
1819 // Vb <- VALU [PostExecPos]
1820 // intv3
1821 // MI Va, Vb (WaitState = 0)
1822 //
1823 // Where:
1824 // intv1 + intv2 <= 2 VALUs
1825 // intv3 <= 4 VALUs
1826 //
1827 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1828
1829 const int Intv1plus2MaxVALUs = 2;
1830 const int Intv3MaxVALUs = 4;
1831 const int IntvMaxVALUs = 6;
1832 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1833
 // Search state. Positions are VALU counts measured backwards from MI, so a
 // larger position means an earlier instruction. getHashValue and isEqual
 // make the state hashable and comparable for the search.
1834 struct StateType {
1835 SmallDenseMap<Register, int, 4> DefPos;
1836 int ExecPos = std::numeric_limits<int>::max();
1837 int VALUs = 0;
1838
1839 static unsigned getHashValue(const StateType &State) {
1840 return hash_combine(State.ExecPos, State.VALUs,
1841 hash_combine_range(State.DefPos));
1842 }
1843 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1844 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1845 LHS.VALUs == RHS.VALUs;
1846 }
1847 };
1848
1849 StateType State;
1850
1851 // This overloads expiry testing with all the hazard detection
1852 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1853 // Too many VALU states have passed
1854 if (State.VALUs > NoHazardVALUWaitStates)
1855 return HazardExpired;
1856
1857 // Instructions which cause va_vdst==0 expire hazard
1860 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1861 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1862 return HazardExpired;
1863
1864 // Track registers writes
1865 bool Changed = false;
1866 if (SIInstrInfo::isVALU(I, /*AllowLDSDMA=*/true)) {
 // Only the first def met for each source (the one nearest to MI) is
 // recorded.
1867 for (Register Src : SrcVGPRs) {
1868 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1869 State.DefPos[Src] = State.VALUs;
1870 Changed = true;
1871 }
1872 }
1873 } else if (SIInstrInfo::isSALU(I)) {
 // An EXEC write is recorded only once, and only after at least one source
 // def has been seen.
1874 if (State.ExecPos == std::numeric_limits<int>::max()) {
1875 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1876 State.ExecPos = State.VALUs;
1877 Changed = true;
1878 }
1879 }
1880 }
1881
1882 // Early expiration: too many VALUs in intv3
1883 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1884 return HazardExpired;
1885
1886 // Only evaluate state if something changed
1887 if (!Changed)
1888 return NoHazardFound;
1889
1890 // Determine positions of VALUs pre/post exec change
1891 if (State.ExecPos == std::numeric_limits<int>::max())
1892 return NoHazardFound;
1893
1894 int PreExecPos = std::numeric_limits<int>::max();
1895 int PostExecPos = std::numeric_limits<int>::max();
1896
 // A def at a position >= ExecPos lies before the EXEC write in program
 // order, one at a smaller position lies after it.
1897 for (auto Entry : State.DefPos) {
1898 int DefVALUs = Entry.second;
1899 if (DefVALUs != std::numeric_limits<int>::max()) {
1900 if (DefVALUs >= State.ExecPos)
1901 PreExecPos = std::min(PreExecPos, DefVALUs);
1902 else
1903 PostExecPos = std::min(PostExecPos, DefVALUs);
1904 }
1905 }
1906
1907 // Need a VALUs post exec change
1908 if (PostExecPos == std::numeric_limits<int>::max())
1909 return NoHazardFound;
1910
1911 // Too many VALUs in intv3?
1912 int Intv3VALUs = PostExecPos;
1913 if (Intv3VALUs > Intv3MaxVALUs)
1914 return HazardExpired;
1915
1916 // Too many VALUs in intv2?
1917 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1918 if (Intv2VALUs > Intv1plus2MaxVALUs)
1919 return HazardExpired;
1920
1921 // Need a VALUs pre exec change
1922 if (PreExecPos == std::numeric_limits<int>::max())
1923 return NoHazardFound;
1924
1925 // Too many VALUs in intv1?
1926 int Intv1VALUs = PreExecPos - State.ExecPos;
1927 if (Intv1VALUs > Intv1plus2MaxVALUs)
1928 return HazardExpired;
1929
1930 // Too many VALUs in intv1 + intv2
1931 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1932 return HazardExpired;
1933
1934 return HazardFound;
1935 };
 // Each VALU the search steps over advances the position counter.
1936 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1937 if (SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true))
1938 State.VALUs += 1;
1939 };
1940
1941 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1942 std::next(MI->getReverseIterator())))
1943 return false;
1944
1945 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1946 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1948
1949 return true;
1950}
1951
// Detects a TRANS instruction that writes one of the explicit VGPR sources of
// the VALU \p MI within the interval limits described in the body, and inserts
// an S_WAITCNT_DEPCTR before \p MI when one is found.
// \returns true if the wait was inserted, false otherwise.
// NOTE(review): part of the va_vdst expiry condition, the condition guarding
// the TRANS counter update, and the immediate of the inserted instruction are
// absent from this listing.
1952bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1953 if (!ST.hasVALUTransUseHazard())
1954 return false;
1955 assert(!ST.hasExtendedWaitCounts());
1956
1957 if (!SIInstrInfo::isVALU(*MI, /*AllowLDSDMA=*/true))
1958 return false;
1959
1960 SmallSet<Register, 4> SrcVGPRs;
1961
1962 for (const MachineOperand &Use : MI->explicit_uses()) {
1963 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1964 SrcVGPRs.insert(Use.getReg());
1965 }
1966
1967 // Look for the following pattern:
1968 // Va <- TRANS VALU
1969 // intv
1970 // MI Va (WaitState = 0)
1971 //
1972 // Where:
1973 // intv <= 5 VALUs / 1 TRANS
1974 //
1975 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1976
1977 const int IntvMaxVALUs = 5;
1978 const int IntvMaxTRANS = 1;
1979
 // Search state: counters for the instructions stepped over so far.
 // getHashValue and isEqual make the state hashable and comparable for the
 // search.
1980 struct StateType {
1981 int VALUs = 0;
1982 int TRANS = 0;
1983
1984 static unsigned getHashValue(const StateType &State) {
1985 return hash_combine(State.VALUs, State.TRANS);
1986 }
1987 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1988 return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1989 }
1990 };
1991
1992 StateType State;
1993
1994 // This overloads expiry testing with all the hazard detection
1995 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1996 // Too many VALU states have passed
1997 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1998 return HazardExpired;
1999
2000 // Instructions which cause va_vdst==0 expire hazard
2003 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2004 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
2005 return HazardExpired;
2006
2007 // Track registers writes
2008 if (SIInstrInfo::isTRANS(I)) {
 // A TRANS that modifies any tracked source register is the hazard.
2009 for (Register Src : SrcVGPRs) {
2010 if (I.modifiesRegister(Src, &TRI)) {
2011 return HazardFound;
2012 }
2013 }
2014 }
2015
2016 return NoHazardFound;
2017 };
2018 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
2019 if (SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true))
2020 State.VALUs += 1;
2022 State.TRANS += 1;
2023 };
2024
2025 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
2026 std::next(MI->getReverseIterator())))
2027 return false;
2028
2029 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
2030 // avoided.
2031 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2032 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2034
2035 return true;
2036}
2037
// Inserts a V_NOP before a VALU instruction MI when a preceding TRANS
// instruction has a RAW or WAR register dependency with it. Returns true when
// a V_NOP was inserted.
2038bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
2039 if (!ST.hasTransCoexecutionHazard() || // Coexecution disabled.
2040 !SIInstrInfo::isVALU(*MI, /*AllowLDSDMA=*/true) ||
// NOTE(review): listing line 2041 (the last term of this condition) is absent.
2042 return false;
2043
2044 const SIInstrInfo *TII = ST.getInstrInfo();
2045 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2046
// Returns true when I is a TRANS instruction whose vdst overlaps an explicit
// use of MI, or whose explicit uses overlap MI's vdst.
2047 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
2048 if (!SIInstrInfo::isTRANS(I))
2049 return false;
2050
2051 // RAW: Trans(I) writes, VALU(MI) reads.
2052 Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2053 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
2054 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
2055 return true;
2056 }
2057
// MI may have no register vdst; in that case there is nothing to check for WAR.
2058 auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
2059 if (!ValuDst || !ValuDst->isReg())
2060 return false;
2061
2062 // WAR: Trans(I) reads, VALU(MI) writes.
2063 Register ValuDef = ValuDst->getReg();
2064 for (const MachineOperand &TransUse : I.explicit_uses()) {
2065 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
2066 return true;
2067 }
2068
2069 return false;
2070 };
2071
// The backward search stops at any VALU instruction.
2072 auto IsExpiredFn = [](const MachineInstr &I, int) {
2073 return SIInstrInfo::isVALU(I, /*AllowLDSDMA=*/true);
2074 };
2075
// A result of INT_MAX is treated as "no hazardous TRANS found" (presumably
// because a VALU was reached first, as the constant's name suggests).
2076 const int HasVALU = std::numeric_limits<int>::max();
2077 if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
2078 return false;
2079
2080 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2081 return true;
2082}
2083
// Inserts a V_NOP before MI when an operand of MI overlaps the destination of
// a preceding instruction accepted by IsHazardFn. Returns true when a V_NOP
// was inserted.
2084bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
// NOTE(review): listing line 2085 (the guard condition for this early return)
// is absent.
2086 return false;
2087
2088 const SIInstrInfo *TII = ST.getInstrInfo();
2089 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2090
2091 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
// NOTE(review): listing line 2092 (the filter on I for this early return) is
// absent.
2093 return false;
2094
2095 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
2096 // with the dest(matrix D) of the previous wmma.
2097 const Register CurSrc0Reg =
2098 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
2099 const Register CurSrc1Reg =
2100 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
2101
2102 const Register PrevDstReg =
2103 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
2104
2105 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2106 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2107 return true;
2108 }
2109
2110 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2111 // but Index can't overlap with PrevDstReg.
2112 if (AMDGPU::isGFX12Plus(ST)) {
2113 if (SIInstrInfo::isSWMMAC(*MI)) {
2114 const Register CurIndex =
2115 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
2116 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2117 return true;
2118 }
2119 return false;
2120 }
2121
2122 return false;
2123 };
2124
// The backward search stops at any VALU instruction.
2125 auto IsExpiredFn = [](const MachineInstr &I, int) {
2126 return SIInstrInfo::isVALU(I, /*AllowLDSDMA=*/true);
2127 };
2128
// INT_MAX from the search is treated as "no hazard"; nothing is inserted.
2129 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2130 std::numeric_limits<int>::max())
2131 return false;
2132
2133 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2134
2135 return true;
2136}
2137
// NOTE(review): the signature line and the remaining terms of this predicate
// are absent from this listing; only the first conjunct (the instruction is a
// VALU, with AllowLDSDMA set) is visible. Presumably this is the body of
// isCoexecutableVALUInst() used by checkWMMACoexecutionHazards() — confirm.
2139 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) &&
2142}
2143
2144// Classify XDL WMMA instructions into co-execution hazard categories
2145// (Refer to SPG 4.6.12.1), mainly based on instruction latency.
2146//
2147// Category 0: WMMA with Latency 8
2148// WMMA_*F16, WMMA_*BF16
2149// WMMA_*FP8FP8
2150// WMMA_*FP8BF8
2151// WMMA_*BF8FP8
2152// WMMA_*BF8BF8
2153// WMMA_*F8F6F4 if SRCA & SRCB != F8
2154//
2155// Category 1: WMMA Latency 16
2156// WMMA_IU8
2157// WMMA_*F8F6F4 if SRCA OR SRCB == F8
2158//
2159// Category 2: SWMMAC with Latency 8
2160// SWMMAC_*F16, SWMMAC_*BF16,
2161// SWMMAC_*FP8FP8
2162// SWMMAC_*BF8FP8
2163// SWMMAC_*FP8BF8
2164// SWMMAC_*BF8BF8
2165//
2166// Category 3: SWMMAC with Latency 16
2167// SWMMAC_IU8
2168//
2169// Category 4: 16 Pass GFX1251 WMMA with latency 16
2170// V_WMMA_*_16X16X32_{F16,BF16}
2171// V_WMMA_{F32,F16}_16X16X64_{FP8,BF8}*
2172// V_WMMA_F32_16x16x128_F8F6F4 (F4 only)
2173// V_SWMMAC_*_16X16X64_{F16,BF16}
2174// V_SWMMAC_{F32,F16}_16X16X128_{FP8,BF8}*
2175//
2176// Category 5: 32 Pass GFX1251 WMMA with latency 32
2177// V_WMMA_F32_16x16x128_F8F6F4 (not all F4)
2178// V_WMMA_{F32,F16}_16X16X128_{FP8,BF8}*
2179// V_WMMA_F32_32X16X128_F4
2180// V_WMMA_I32_16X16X64_IU8
2181// V_WMMA_I32_16X16X64_IU8
// NOTE(review): the last entry above is listed twice; confirm whether a
// different opcode was intended for one of them.
//
// The returned category is derived from the scheduling-model latency of MI
// (8, 16 or 32), whether MI is an SWMMAC, and whether the subtarget reports
// hasGFX125xLowestRateWMMA(). Any other latency is treated as unreachable.
// NOTE(review): the first line of the signature (return type, function name
// and first parameter) is absent from this listing.
2183 const SIInstrInfo *TII,
2184 const TargetSchedModel &SchedModel,
2185 const GCNSubtarget &ST) {
2186 assert(TII->isXDLWMMA(MI) && "must be xdl wmma");
2187 bool IsSWMMAC = SIInstrInfo::isSWMMAC(MI);
2188 bool IsLowestRateWMMA = ST.hasGFX125xLowestRateWMMA();
2189 unsigned Category = 0;
2190
2191 unsigned Latency = SchedModel.computeInstrLatency(&MI);
2192 switch (Latency) {
2193 case 8:
// Latency 8: category 2 for SWMMAC, otherwise category 0.
2194 Category = IsSWMMAC ? 2 : 0;
2195 break;
2196 case 16:
// Latency 16: category 4 on lowest-rate subtargets regardless of SWMMAC,
// otherwise 3 for SWMMAC and 1 for plain WMMA.
2197 Category = IsLowestRateWMMA ? 4 : (IsSWMMAC ? 3 : 1);
2198 break;
2199 case 32:
2200 assert(IsLowestRateWMMA && "latency 32 is not expected");
2201 Category = 5;
2202 break;
2203 default:
2204 llvm_unreachable("unexpected xdl wmma latency");
2205 } // end switch.
2206
2207 return Category;
2208}
2209
2210int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
2211 if (!ST.hasWMMACoexecutionHazards())
2212 return 0;
2213
2214 const SIInstrInfo *TII = ST.getInstrInfo();
2215 if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
2216 return 0;
2217
2218 // WaitStates here is the number of V_NOPs or unrelated VALU instructions must
2219 // be in between the first WMMA and the second instruction to cover the hazard
2220 // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
2221 // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for
2222 // numbers, which depends on the category of the first WMMA.
2223 const int WMMAWaitStates[] = {5, 9, 3, 5, 9, 17};
2224 const int VALUWaitStates[] = {4, 8, 2, 4, 8, 16};
2225 unsigned Category = 0;
2226
2227 auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2228 if (!TII->isXDLWMMA(I))
2229 return false;
2230
2231 Category = getWMMAHazardInstInCategory(I, TII, TSchedModel, ST);
2232 return hasWMMAToWMMARegOverlap(I, *MI);
2233 };
2234
2235 auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2236 if (!TII->isXDLWMMA(I))
2237 return false;
2238
2239 Category = getWMMAHazardInstInCategory(I, TII, TSchedModel, ST);
2240 return hasWMMAToVALURegOverlap(I, *MI);
2241 };
2242
2243 int WaitStatesNeeded = -1;
2244 int ExistingVALUs = 0; // Existing number of VALU ops in between.
2245 bool IsLowestRateWMMA = ST.hasGFX125xLowestRateWMMA();
2246
2247 // getWaitStatesSinceVALU checks for a hazard between instruction 'I' and
2248 // 'MI':
2249 // - If a hazard exists: returns the number of VALUs in between and sets
2250 // 'Category' via IsWMMAHazardFn/IsVALUHazardFn for instruction 'I'.
2251 // - If no hazard exists: returns INT_MAX, making WaitStatesNeeded negative,
2252 // so no V_NOP insertion is needed.
2253 if (TII->isXDLWMMA(*MI)) {
2254 // Maximum of MMAWaitStates.
2255 const int WMMAWaitsLimit = IsLowestRateWMMA ? 17 : 9;
2256 ExistingVALUs = getWaitStatesSinceVALU(IsWMMAHazardFn, WMMAWaitsLimit);
2257 WaitStatesNeeded = WMMAWaitStates[Category] - ExistingVALUs;
2258 } else { // Must be a co-executable VALU.
2259 // Maximum of VALUWaitStates.
2260 const int VALUWaitsLimit = IsLowestRateWMMA ? 16 : 8;
2261 ExistingVALUs = getWaitStatesSinceVALU(IsVALUHazardFn, VALUWaitsLimit);
2262 WaitStatesNeeded = VALUWaitStates[Category] - ExistingVALUs;
2263 }
2264
2265 return WaitStatesNeeded;
2266}
2267
// Returns true when the destination (vdst) of the earlier instruction WMMA
// overlaps src0 or src1 of MI, or — inside the guarded block below — src2 of
// MI.
2268bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2269 const MachineInstr &WMMA, const MachineInstr &MI) const {
2270 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2271 Register A1 = TII.getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
2272 Register B1 = TII.getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
2273
2274 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2275 if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
2276 return true;
2277
// NOTE(review): listing line 2278 (the condition opening this block) is
// absent; presumably it restricts the index check to SWMMAC instructions, as
// hasWMMAToVALURegOverlap() does for src2 — confirm.
2279 Register Idx1 = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2280 if (TRI.regsOverlap(D0, Idx1))
2281 return true;
2282 }
2283 return false;
2284}
2285
2286bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2287 const MachineInstr &WMMA, const MachineInstr &MI) const {
2288 // WMMA writes, VALU reads.
2289 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2290 for (const MachineOperand &ValuUse : MI.explicit_uses()) {
2291 if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
2292 return true;
2293 }
2294
2295 // WMMA reads or writes, VALU writes.
2296 Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
2297 Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
2298 SmallVector<Register, 4> WMMARegs({D0, A0, B0});
2299
2300 if (SIInstrInfo::isSWMMAC(WMMA)) {
2301 Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
2302 WMMARegs.push_back(Idx0);
2303 }
2304
2305 for (const MachineOperand &ValuDef : MI.defs()) {
2306 Register VDstReg = ValuDef.getReg();
2307 for (Register WMMAReg : WMMARegs) {
2308 if (TRI.regsOverlap(VDstReg, WMMAReg))
2309 return true;
2310 }
2311 }
2312 return false;
2313}
2314
// Returns true when I is an XDL WMMA that has a register overlap with MI, as
// decided by hasWMMAToWMMARegOverlap() or hasWMMAToVALURegOverlap() depending
// on what kind of instruction MI is. Returns false when I is not an XDL WMMA
// or when MI matches neither dispatch case.
2315bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
2316 const MachineInstr &MI) const {
2317 // I is the potential WMMA hazard source, MI is the instruction being checked
2318 // for hazard.
2319 if (!TII.isXDLWMMA(I))
2320 return false;
2321
2322 // Dispatch based on MI type
2323 if (TII.isXDLWMMA(MI))
2324 return hasWMMAToWMMARegOverlap(I, MI);
// NOTE(review): listing line 2325 (the condition guarding the VALU case) is
// absent; presumably it tests whether MI is a co-executable VALU — confirm.
2326 return hasWMMAToVALURegOverlap(I, MI);
2327
2328 return false;
2329}
2330
2331bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
2332 bool IncludeSubloops) {
2333 // Scan loop for any WMMA that hazards MI.
2334 // TODO: Avoid full loop scan when WMMA is beyond VALU distance.
2335 for (MachineBasicBlock *MBB : L->getBlocks()) {
2336 if (!IncludeSubloops && MLI->getLoopFor(MBB) != L)
2337 continue;
2338 for (MachineInstr &I : *MBB) {
2339 if (&I == MI)
2340 continue;
2341 if (isCoexecutionHazardFor(I, *MI))
2342 return true;
2343 }
2344 }
2345 return false;
2346}
2347
2348bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
2349 int WaitStatesNeeded) {
2350 if (!MLI)
2351 return false;
2352
2353 MachineLoop *L = MLI->getLoopFor(MI->getParent());
2354 if (!L) {
2355 ++NumWMMAHoistingBailed;
2356 return false;
2357 }
2358
2359 // If innermost loop has WMMA hazard, we can't hoist at all
2360 if (hasWMMAHazardInLoop(L, MI)) {
2361 ++NumWMMAHoistingBailed;
2362 return false;
2363 }
2364
2365 // Find outermost loop with no internal hazard
2366 MachineLoop *TargetLoop = L;
2367 while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
2368 if (hasWMMAHazardInLoop(Parent, MI, false))
2369 break; // Parent has hazard in its own blocks, stop here
2370 TargetLoop = Parent; // Safe to hoist further out
2371 }
2372
2373 // Need valid preheader to insert V_NOPs
2374 MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
2375 if (!Preheader) {
2376 ++NumWMMAHoistingBailed;
2377 return false;
2378 }
2379
2380 LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2381 << " V_NOPs from loop to " << printMBBReference(*Preheader)
2382 << "\n");
2383
2384 emitVNops(*Preheader, Preheader->getFirstTerminator(), WaitStatesNeeded,
2385 /*IsHoisting=*/true);
2386 NumWMMANopsHoisted += WaitStatesNeeded;
2387 return true;
2388}
2389
2390bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
2391 int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
2392 if (WaitStatesNeeded <= 0)
2393 return false;
2394
2395 if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
2396 return true;
2397
2398 emitVNops(*MI->getParent(), MI->getIterator(), WaitStatesNeeded);
2399 return true;
2400}
2401
// Works around the subtarget's 64-bit shift bug for V_LSHLREV_B64,
// V_LSHRREV_B64 and V_ASHRREV_I64 when the shift amount (src0) is the last
// VGPR of an 8-register block. The shift amount is moved to a different
// register, either by copying it into the low half of the destination or by
// swapping it with a register MI does not touch and swapping back after MI.
// Returns true when MI or its surroundings were changed.
2402bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
2403 if (!ST.hasShift64HighRegBug())
2404 return false;
2405 assert(!ST.hasExtendedWaitCounts());
2406
2407 switch (MI->getOpcode()) {
2408 default:
2409 return false;
2410 case AMDGPU::V_LSHLREV_B64_e64:
2411 case AMDGPU::V_LSHRREV_B64_e64:
2412 case AMDGPU::V_ASHRREV_I64_e64:
2413 break;
2414 }
2415
2416 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
2417 if (!Amt->isReg())
2418 return false;
2419
2420 Register AmtReg = Amt->getReg();
2421 const MachineRegisterInfo &MRI = MF.getRegInfo();
2422 // Check if this is a last VGPR in the allocation block.
2423 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2424 return false;
2425
// Nothing to do when the register following AmtReg is in use (VGPR255 has no
// successor, so it always falls through to the workaround).
2426 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
2427 return false;
2428
2429 assert(ST.needsAlignedVGPRs());
// The "AmtReg +/- 1" arithmetic in this function relies on consecutive VGPR
// enum values.
2430 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2431
2432 const DebugLoc &DL = MI->getDebugLoc();
2433 MachineBasicBlock *MBB = MI->getParent();
2434 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
2435
2436 // In:
2437 //
2438 // Dst = shiftrev64 Amt, Src1
2439 //
2440 // if Dst!=Src1 then avoid the bug with:
2441 //
2442 // Dst.sub0 = Amt
2443 // Dst = shift64 Dst.sub0, Src1
2444
2445 Register DstReg = MI->getOperand(0).getReg();
2446 if (!Src1->isReg() || Src1->getReg() != DstReg) {
2447 Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
// The inserted V_MOV is itself passed through the hazard recognizer.
2448 runOnInstruction(
2449 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt));
2450 Amt->setReg(DstLo);
2451 Amt->setIsKill(true);
2452 return true;
2453 }
2454
// Dst == Src1: pick a scratch register MI neither reads nor writes. When MI
// also writes AmtReg ("Overlapped"), an aligned 64-bit pair is searched
// instead of a single 32-bit VGPR.
// NOTE(review): NewReg stays default-constructed if the loop finds no
// candidate; no check for that is visible here.
2455 bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);
2456 Register NewReg;
2457 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2458 : AMDGPU::VGPR_32RegClass) {
2459 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
2460 NewReg = Reg;
2461 break;
2462 }
2463 }
2464
// In the overlapped case the shift amount goes to the high half (sub1) of the
// new pair and the low half (sub0) pairs with AmtReg - 1.
2465 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2466 : NewReg;
2467 Register NewAmtLo;
2468
2469 if (Overlapped)
2470 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2471
2472 // Insert a full wait count because found register might be pending a wait.
2473 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
2474 .addImm(0);
2475
2476 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
2477 if (Overlapped)
2478 runOnInstruction(
2479 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
2480 .addDef(AmtReg - 1)
2481 .addReg(AmtReg - 1, RegState::Undef)
2482 .addReg(NewAmtLo, RegState::Undef));
2483 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2484 .addDef(AmtReg)
2485 .addReg(AmtReg, RegState::Undef)
2486 .addReg(NewAmt, RegState::Undef));
2487
2488 // Instructions emitted after the current instruction will be processed by the
2489 // parent loop of the hazard recognizer in a natural way.
2490 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2491 AmtReg)
2492 .addDef(NewAmt)
2493 .addReg(NewAmt)
2494 .addReg(AmtReg);
2495 if (Overlapped)
2496 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
2497 AmtReg - 1)
2498 .addDef(NewAmtLo)
2499 .addReg(NewAmtLo)
2500 .addReg(AmtReg - 1);
2501
2502 // Re-running hazard recognizer on the modified instruction is not necessary,
2503 // inserted V_SWAP_B32 has already both read and write new registers so
2504 // hazards related to these register has already been handled.
2505 Amt->setReg(NewAmt);
2506 Amt->setIsKill(false);
2507 // We do not update liveness, so verifier may see it as undef.
2508 Amt->setIsUndef();
// In the overlapped case the destination and src1 are retargeted to the new
// 64-bit pair as well.
2509 if (Overlapped) {
2510 MI->getOperand(0).setReg(NewReg);
2511 Src1->setReg(NewReg);
2512 Src1->setIsKill(false);
2513 Src1->setIsUndef();
2514 }
2515
2516 return true;
2517}
2518
// Returns the number of wait states still needed before MI to cover the
// NSA-to-VMEM bug: 1 minus the distance to a preceding MIMG instruction with
// GFX10 NSA encoding and a size of at least 16 bytes. Returns 0 when the
// subtarget lacks the bug or MI does not qualify.
2519int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) const {
2520 int NSAtoVMEMWaitStates = 1;
2521
2522 if (!ST.hasNSAtoVMEMBug())
2523 return 0;
2524
// NOTE(review): listing line 2525 (the condition on MI guarding this early
// return) is absent.
2526 return 0;
2527
2528 const SIInstrInfo *TII = ST.getInstrInfo();
// Only instructions that have an offset operand with bit 1 or bit 2 set
// (mask 6) are affected.
2529 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2530 if (!Offset || (Offset->getImm() & 6) == 0)
2531 return 0;
2532
2533 auto IsHazardFn = [TII](const MachineInstr &I) {
2534 if (!SIInstrInfo::isMIMG(I))
2535 return false;
2536 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
2537 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2538 TII->getInstSizeInBytes(I) >= 16;
2539 };
2540
2541 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
2542}
2543
2544int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2545 MachineInstr *MI) const {
2546 int FPAtomicToDenormModeWaitStates = 3;
2547
2548 if (!ST.hasFPAtomicToDenormModeHazard())
2549 return 0;
2550 assert(!ST.hasExtendedWaitCounts());
2551
2552 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2553 return 0;
2554
2555 auto IsHazardFn = [](const MachineInstr &I) {
2556 if (!SIInstrInfo::isVMEM(I))
2557 return false;
2558 return SIInstrInfo::isFPAtomic(I);
2559 };
2560
2561 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2562 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true))
2563 return true;
2564
2565 return SIInstrInfo::isWaitcnt(MI.getOpcode());
2566 };
2567
2568 return FPAtomicToDenormModeWaitStates -
2569 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
2570}
2571
// Dispatches the MAI hazard check to the GFX90A implementation when the
// subtarget has GFX90A instructions, and to the GFX908 implementation
// otherwise. Returns whatever the selected implementation returns.
2572int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) const {
// NOTE(review): listing line 2573 is absent.
2574
2575 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2576}
2577
2578int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) const {
2579 // Early exit if no padding is requested.
2580 if (MFMAPaddingRatio == 0)
2581 return 0;
2582
2583 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2584 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
2585 return 0;
2586
2587 int NeighborMFMALatency = 0;
2588 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2589 this](const MachineInstr &MI) {
2590 if (!SIInstrInfo::isMFMA(MI))
2591 return false;
2592
2593 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2594 return true;
2595 };
2596
2597 const int MaxMFMAPipelineWaitStates = 16;
2598 int WaitStatesSinceNeighborMFMA =
2599 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2600
2601 int NeighborMFMAPaddingNeeded =
2602 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2603 WaitStatesSinceNeighborMFMA;
2604
2605 return std::max(0, NeighborMFMAPaddingNeeded);
2606}
2607
// Computes the number of wait states required before MI for the MAI hazards
// handled on subtargets without GFX90A instructions (see checkMAIHazards()).
// The result is the maximum over four groups of checks:
//   1. VALU writes of EXEC / source VGPRs (skipped for v_accvgpr_read),
//   2. MFMA or v_accvgpr_write writes of AGPRs that MI's operands overlap,
//   3. for v_accvgpr_write, an earlier MFMA reading the destination as src2,
//   4. optional MFMA padding (checkMFMAPadding()).
2608int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
2609 int WaitStatesNeeded = 0;
2610 unsigned Opc = MI->getOpcode();
2611
// Inline asm is treated like a VALU for the checks in group 1.
2612 auto IsVALUFn = [](const MachineInstr &MI) {
2613 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) || MI.isInlineAsm();
2614 };
2615
2616 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2617 const int LegacyVALUWritesVGPRWaitStates = 2;
2618 const int VALUWritesExecWaitStates = 4;
2619 const int MaxWaitStates = 4;
2620
2621 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2622 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2623 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2624
// The VGPR checks can require at most 2, so they are skipped once the EXEC
// check already demands the group maximum of 4.
2625 if (WaitStatesNeeded < MaxWaitStates) {
2626 for (const MachineOperand &Use : MI->explicit_uses()) {
// Shadows the outer MaxWaitStates (4) with the VGPR-specific limit.
2627 const int MaxWaitStates = 2;
2628
2629 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
2630 continue;
2631
2632 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2633 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
2634 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2635
2636 if (WaitStatesNeeded == MaxWaitStates)
2637 break;
2638 }
2639 }
2640 }
2641
// Group 2: every explicit AGPR operand of MI.
2642 for (const MachineOperand &Op : MI->explicit_operands()) {
2643 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
2644 continue;
2645
// Defs are only examined for v_accvgpr_write; for other opcodes only uses
// are checked.
2646 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2647 continue;
2648
2649 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2650 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2651 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2652 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2653 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2654 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2655 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2656 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2657 const int MaxWaitStates = 18;
2658 Register Reg = Op.getReg();
2659 unsigned HazardDefLatency = 0;
2660
// Matches an earlier MFMA whose destination overlaps Reg without being
// exactly Reg. HazardDefLatency is raised to the latency of every MFMA with
// a different destination that the predicate sees, including ones that turn
// out not to overlap.
2661 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2662 this](const MachineInstr &MI) {
2663 if (!SIInstrInfo::isMFMA(MI))
2664 return false;
2665 Register DstReg = MI.getOperand(0).getReg();
2666 if (DstReg == Reg)
2667 return false;
2668 HazardDefLatency =
2669 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2670 return TRI.regsOverlap(DstReg, Reg);
2671 };
2672
2673 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2674 MaxWaitStates);
// Required distance: 2 when the operand is src2 (SrcC); for v_accvgpr_read
// and v_accvgpr_write it is selected by the recorded MFMA latency (2, 8,
// otherwise the 32x32 value); for any other opcode it stays at the SrcA/B
// default of 4.
2675 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2676 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2677 int OpNo = Op.getOperandNo();
2678 if (OpNo == SrcCIdx) {
2679 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2680 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2681 switch (HazardDefLatency) {
2682 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2683 break;
2684 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2685 break;
2686 case 16: [[fallthrough]];
2687 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2688 break;
2689 }
2690 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2691 switch (HazardDefLatency) {
2692 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2693 break;
2694 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2695 break;
2696 case 16: [[fallthrough]];
2697 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2698 break;
2699 }
2700 }
2701
2702 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2703 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2704
2705 if (WaitStatesNeeded == MaxWaitStates)
2706 return WaitStatesNeeded; // Early exit.
2707
// Second source of AGPR writes: an earlier v_accvgpr_write whose destination
// overlaps Reg.
2708 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2709 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2710 return false;
2711 Register DstReg = MI.getOperand(0).getReg();
2712 return TRI.regsOverlap(Reg, DstReg);
2713 };
2714
2715 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2716 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2717 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2718 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2719 if (OpNo == SrcCIdx)
2720 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2721 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2722 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2723
2724 WaitStatesNeededForUse = NeedWaitStates -
2725 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2726 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2727
2728 if (WaitStatesNeeded == MaxWaitStates)
2729 return WaitStatesNeeded; // Early exit.
2730 }
2731
// Group 3: v_accvgpr_write overwriting an AGPR that an earlier MFMA reads as
// src2.
2732 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2733 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2734 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2735 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2736 const int MaxWaitStates = 13;
2737 Register DstReg = MI->getOperand(0).getReg();
2738 unsigned HazardDefLatency = 0;
2739
// As above, the latency is recorded for every MFMA inspected, whether or not
// its src2 overlaps DstReg.
2740 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2741 this](const MachineInstr &MI) {
2742 if (!SIInstrInfo::isMFMA(MI))
2743 return false;
2744 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2745 HazardDefLatency =
2746 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2747 return TRI.regsOverlap(Reg, DstReg);
2748 };
2749
2750 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2751 int NeedWaitStates;
2752 switch (HazardDefLatency) {
2753 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2754 break;
2755 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2756 break;
2757 case 16: [[fallthrough]];
2758 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2759 break;
2760 }
2761
2762 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2763 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2764 }
2765
2766 // Pad neighboring MFMA with noops for better inter-wave performance.
2767 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2768
2769 return WaitStatesNeeded;
2770}
2771
// File-local helpers that turn a pass count (NumPasses) into a required number
// of wait states; each body is a closed form of the table in its comment.
// NOTE(review): the line carrying each helper's name and first parameter is
// absent from this listing, so the helpers are described by their bodies only.

// Returns NumPasses + 1, plus one more when IsGFX950 is set.
2772static int
2774 bool IsGFX950) {
2775 // xdl def cycles | gfx940 | gfx950
2776 // 2 pass | 3 4
2777 // 4 pass | 5 6
2778 // 8 pass | 9 10
2779 // 16 pass | 17 18
2780 return NumPasses + 1 + IsGFX950;
2781}
2782
// Returns NumPasses + 1, plus one more when IsGFX950 is set and NumPasses is
// not 2.
2783static int
2785 bool IsGFX950) {
2786 // xdl def cycles | gfx940 | gfx950
2787 // 2 pass | 3 3
2788 // 4 pass | 5 6
2789 // 8 pass | 9 10
2790 // 16 pass | 17 18
2791 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2792}
2793
// Returns NumPasses unchanged.
2794static int
2796 // 2 pass -> 2
2797 // 4 pass -> 4
2798 // 8 pass -> 8
2799 // 16 pass -> 16
2800 return NumPasses;
2801}
2802
// Returns NumPasses + 2.
2803static int
2805 // 2 pass -> 4
2806 // 4 pass -> 6
2807 // 8 pass -> 10
2808 // 16 pass -> 18
2809 return NumPasses + 2;
2810}
2811
// Returns NumPasses + 3, plus one more when IsGFX950 is set and NumPasses is
// not 2.
2813 bool IsGFX950) {
2814 // xdl def cycles | gfx942 | gfx950
2815 // 2 pass | 5 5
2816 // 4 pass | 7 8
2817 // 8 pass | 11 12
2818 // 16 pass | 19 20
2819 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2820}
2821
2822int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
2823 int WaitStatesNeeded = 0;
2824 unsigned Opc = MI->getOpcode();
2825
2826 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2827 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) &&
2829 };
2830
2831 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2832 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) &&
2834 };
2835
2836 if (!SIInstrInfo::isMFMA(*MI))
2837 return WaitStatesNeeded;
2838
2839 const int VALUWritesExecWaitStates = 4;
2840 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2841 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2842 VALUWritesExecWaitStates);
2843 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2844
2845 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2846
2847 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2848 for (const MachineOperand &Use : MI->explicit_uses()) {
2849 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2850 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2851 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2852 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2853 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2854 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2855 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2856 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2857 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2858 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2859 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2860 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2861 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2862 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2863 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2864 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2865 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2866 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2867 const int MaxWaitStates = 19;
2868
2869 if (!Use.isReg())
2870 continue;
2871 Register Reg = Use.getReg();
2872 bool FullReg;
2873 const MachineInstr *MI1;
2874
2875 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2876 this](const MachineInstr &MI) {
2877 if (!SIInstrInfo::isMFMA(MI))
2878 return false;
2879 Register DstReg = MI.getOperand(0).getReg();
2880 FullReg = (DstReg == Reg);
2881 MI1 = &MI;
2882 return TRI.regsOverlap(DstReg, Reg);
2883 };
2884
2885 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2886 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2887 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2888
2889 int NumWaitStates =
2890 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2891 if (NumWaitStates == std::numeric_limits<int>::max())
2892 continue;
2893
2894 int OpNo = Use.getOperandNo();
2895 unsigned Opc1 = MI1->getOpcode();
2896 int NeedWaitStates = 0;
2897 if (OpNo == SrcCIdx) {
2898 if (!SIInstrInfo::isDGEMM(Opc) &&
2899 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opc1))) {
2900 NeedWaitStates = 0;
2901 } else if (FullReg) {
2902 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2903 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2904 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2905 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2906 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2907 else if (ST.hasGFX940Insts() &&
2908 TSchedModel.computeInstrLatency(MI1) == 2)
2909 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2910 } else {
2911 switch (Opc1) {
2912 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2913 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2914 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2915 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2916 if (!TII.isXDL(*MI))
2917 NeedWaitStates =
2918 ST.hasGFX950Insts()
2919 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2920 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2921 break;
2922 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2923 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2924 if (!TII.isXDL(*MI))
2925 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2926 break;
2927 default:
2928 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2929 if (ST.hasGFX940Insts()) {
2930 if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
2931 break;
2932
2933 NeedWaitStates =
2934 TII.isXDL(*MI1)
2935 ? (TII.isXDL(*MI)
2937 NumPasses, ST.hasGFX950Insts())
2939 NumPasses, ST.hasGFX950Insts()))
2941 NumPasses);
2942 break;
2943 }
2944
2945 switch (NumPasses) {
2946 case 2:
2947 NeedWaitStates =
2949 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2950 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2951 break;
2952 case 8:
2953 NeedWaitStates =
2955 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2956 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2957 break;
2958 case 16:
2959 NeedWaitStates =
2961 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2962 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2963 break;
2964 default:
2965 llvm_unreachable("unexpected number of passes");
2966 }
2967 }
2968 }
2969 } else {
2970 switch (Opc1) {
2971 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2972 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2973 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2974 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2975 NeedWaitStates =
2976 ST.hasGFX950Insts()
2977 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2978 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2979 break;
2980 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2981 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2982 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2983 break;
2984 default:
2985 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2986
2987 if (ST.hasGFX940Insts()) {
2988 NeedWaitStates =
2989 TII.isXDL(*MI1)
2991 NumPasses, ST.hasGFX950Insts())
2993 NumPasses);
2994 break;
2995 }
2996
2997 switch (NumPasses) {
2998 case 2:
2999 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
3000 break;
3001 case 4:
3002 llvm_unreachable("unexpected number of passes for mfma");
3003 case 8:
3004 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
3005 break;
3006 case 16:
3007 default:
3008 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
3009 }
3010 }
3011 }
3012 if (WaitStatesNeeded >= NeedWaitStates)
3013 continue;
3014
3015 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
3016 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3017
3018 if (WaitStatesNeeded == MaxWaitStates)
3019 break;
3020 }
3021
3022 // Pad neighboring MFMA with noops for better inter-wave performance.
3023 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
3024
3025 return WaitStatesNeeded;
3026}
3027
// Returns the number of wait states needed before \p MI, considering each
// explicit VGPR use of \p MI against two kinds of earlier instructions:
//  - a V_ACCVGPR_READ_B32_e64 matched through getWaitStatesSinceDef for the
//    used register (AccVgprReadLdStWaitStates = 2), and
//  - a V_ACCVGPR_READ/WRITE_B32_e64 for which a matching VALU def of the used
//    register is found (VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1).
// Returns 0 when the subtarget has no MAI instructions or is gfx90a+.
3028int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) const {
3029 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
3030 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
3031 return 0;
3032
3033 int WaitStatesNeeded = 0;
3034
 // Predicate: the instruction is an ACC VGPR read.
3035 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
3036 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
3037 };
3038
 // Only explicit register uses that are VGPRs are examined.
3039 for (const MachineOperand &Op : MI->explicit_uses()) {
3040 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
3041 continue;
3042
3043 Register Reg = Op.getReg();
3044
3045 const int AccVgprReadLdStWaitStates = 2;
3046 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
3047 const int MaxWaitStates = 2;
3048
 // First check: distance to an ACC VGPR read associated with Reg.
3049 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
3050 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
3051 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3052
3053 if (WaitStatesNeeded == MaxWaitStates)
3054 return WaitStatesNeeded; // Early exit.
3055
 // Second check: an ACC VGPR read or write for which the nested lookup
 // (VALU def of Reg, limit 2) returns something other than the
 // std::numeric_limits<int>::max() sentinel.
3056 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
3057 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
3058 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
3059 return false;
 // NOTE(review): the second operand of the && below is missing from this
 // listing (the embedded numbering skips a line) - restore from upstream.
3060 auto IsVALUFn = [](const MachineInstr &MI) {
3061 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) &&
3063 };
3064 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
3065 std::numeric_limits<int>::max();
3066 };
3067
3068 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
3069 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
3070 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3071 }
3072
3073 return WaitStatesNeeded;
3074}
3075
3076int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) const {
3077 assert(!ST.hasVcmpxPermlaneHazard() &&
3078 "this is a different vcmpx+permlane hazard");
3079 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3080 const SIInstrInfo *TII = ST.getInstrInfo();
3081
3082 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
3083 return isVCmpXWritesExec(*TII, *TRI, MI);
3084 };
3085
3086 auto IsVALUFn = [](const MachineInstr &MI) {
3087 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true);
3088 };
3089
3090 const int VCmpXWritesExecWaitStates = 4;
3091 const int VALUWritesVDstWaitStates = 2;
3092 int WaitStatesNeeded = 0;
3093
3094 for (const MachineOperand &Op : MI->explicit_uses()) {
3095 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
3096 continue;
3097 Register Reg = Op.getReg();
3098
3099 int WaitStatesSinceDef =
3100 VALUWritesVDstWaitStates -
3101 getWaitStatesSinceDef(Reg, IsVALUFn,
3102 /*MaxWaitStates=*/VALUWritesVDstWaitStates);
3103 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
3104 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
3105 break;
3106 }
3107
3108 int VCmpXHazardWaits =
3109 VCmpXWritesExecWaitStates -
3110 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
3111
3112 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
3113 return WaitStatesNeeded;
3114}
3115
// NOTE(review): the signature line of this helper is missing from this
// listing (the embedded numbering skips it); restore it from upstream.
// The body returns NumPasses + 2, matching the table below.
3117 // 2 pass -> 4
3118 // 4 pass -> 6
3119 // 8 pass -> 10
3120 // 16 pass -> 18
3121 return NumPasses + 2;
3122}
3123
// NOTE(review): the first line of this helper's signature is missing from this
// listing; only the trailing `bool IsGFX950)` parameter line survives.
// The body returns NumPasses + 3, plus 1 more when IsGFX950 is set and
// NumPasses is not 2 (see the table below).
3125 bool IsGFX950) {
3126 // xdl def cycles | gfx942 | gfx950
3127 // 2 pass | 5 5
3128 // 4 pass | 7 8
3129 // 8 pass | 11 12
3130 // 16 pass | 19 20
3131 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3132}
3133
// NOTE(review): the first line of this helper's signature is missing from this
// listing; only the trailing `bool IsGFX950)` parameter line survives.
// The body returns NumPasses + 3, plus 1 more when IsGFX950 is set and
// NumPasses is not 2 (see the table below).
3135 bool IsGFX950) {
3136 // xdl def cycles | gfx942 | gfx950
3137 // 2 pass | 5 5
3138 // 4 pass | 7 8
3139 // 8 pass | 11 12
3140 // 16 pass | 19 20
3141 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3142}
3143
// NOTE(review): the signature line of this helper is missing from this
// listing (the embedded numbering skips it); restore it from upstream.
// The body returns NumPasses + 2, matching the table below.
3145 // 2 pass -> 4
3146 // 4 pass -> 6
3147 // 8 pass -> 10
3148 // 16 pass -> 18
3149 return NumPasses + 2;
3150}
3151
// Returns the number of wait states needed before \p MI on gfx90a+ for
// hazards against earlier DOT / MFMA / DGEMM instructions. Returns 0 when the
// subtarget lacks GFX90A instructions or when \p MI itself is an MFMA.
// The function has three parts:
//  1. For VALU, memory (VMEM/DS) or export instructions: every explicit
//     register use is checked against earlier DOT and MFMA writes, plus the
//     GFX90A-only DGEMM workaround for memory instructions.
//  2. For the listed F64 FMA/FMAC opcodes: a 2-wait-state check against an
//     earlier DGEMM.
//  3. For VALU, memory or export instructions: every explicit def is checked
//     against earlier DOT / MFMA writes of the register and against an earlier
//     non-DGEMM MFMA that reads the register as src2.
3152int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
3153 if (!ST.hasGFX90AInsts())
3154 return 0;
3155
3156 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
3157 return SIInstrInfo::isDGEMM(MI.getOpcode());
3158 };
3159
3160 // This is checked in checkMAIHazards90A()
3161 if (SIInstrInfo::isMFMA(*MI))
3162 return 0;
3163
3164 const MachineRegisterInfo &MRI = MF.getRegInfo();
3165
3166 int WaitStatesNeeded = 0;
3167
3168 bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
3169 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
3170 bool IsVALU = SIInstrInfo::isVALU(*MI, /*AllowLDSDMA=*/true);
3171
 // Reg, MFMA and DOT are captured by reference by the predicates below: Reg
 // is assigned before each search, and MFMA / DOT are reset to nullptr before
 // a search and set by the predicate to the matching instruction.
3172 const MachineInstr *MFMA = nullptr;
3173 unsigned Reg;
3174 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3175 if (!SIInstrInfo::isMFMA(MI) ||
3176 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
3177 return false;
3178 MFMA = &MI;
3179 return true;
3180 };
3181
3182 const MachineInstr *DOT = nullptr;
3183 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
3184 if (!SIInstrInfo::isDOT(MI) ||
3185 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
3186 return false;
3187 DOT = &MI;
3188 return true;
3189 };
3190
 // Stateful predicate: DGEMMAfterVALUWrite is latched once a DGEMM is seen
 // and must be reset by the caller before each search.
3191 bool DGEMMAfterVALUWrite = false;
3192 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
3193 // Found DGEMM on reverse traversal to def.
3194 if (SIInstrInfo::isDGEMM(MI.getOpcode()))
3195 DGEMMAfterVALUWrite = true;
3196
3197 // Only hazard if register is defined by a VALU and a DGEMM is found after
3198 // the def.
3199 if (!TII.isVALU(MI, /*AllowLDSDMA=*/true) || !DGEMMAfterVALUWrite)
3200 return false;
3201
3202 return true;
3203 };
3204
3205 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
3206 AMDGPU::OpName::src2);
3207
 // Part 1: hazards on the registers read by MI.
3208 if (IsMemOrExport || IsVALU) {
3209 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
3210 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3211 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3212 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3213 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3214 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3215 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3216 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3217 const int DotWriteSameDotReadSrcAB = 3;
3218 const int DotWriteDifferentVALURead = 3;
3219 const int DMFMABetweenVALUWriteVMEMRead = 2;
3220 const int MaxWaitStates = 19;
3221
3222 for (const MachineOperand &Use : MI->explicit_uses()) {
3223 if (!Use.isReg())
3224 continue;
3225 Reg = Use.getReg();
3226
 // DOT write followed by a read: a same-opcode DOT needs the wait only
 // when the operand is not at the src2 index; any other reader always
 // needs it.
3227 DOT = nullptr;
3228 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3229 MaxWaitStates);
3230 if (DOT) {
3231 int NeedWaitStates = 0;
3232 if (DOT->getOpcode() == MI->getOpcode()) {
3233 if (&Use - &MI->getOperand(0) != SrcCIdx)
3234 NeedWaitStates = DotWriteSameDotReadSrcAB;
3235 } else {
3236 NeedWaitStates = DotWriteDifferentVALURead;
3237 }
3238
3239 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3240 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3241 }
3242
3243 // Workaround for HW data hazard bug observed only in GFX90A. When there
3244 // is a DGEMM instruction in-between a VALU and a VMEM instruction it
3245 // causes the SQ to incorrectly not insert two wait states between the two
3246 // instructions needed to avoid data hazard.
3247 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3248 DGEMMAfterVALUWrite = false;
3249 if (TRI.isVectorRegister(MRI, Reg)) {
3250 int WaitStatesNeededForUse =
3251 DMFMABetweenVALUWriteVMEMRead -
3252 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
3253 DMFMABetweenVALUWriteVMEMRead);
3254
3255 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3256 }
3257 }
3258
 // MFMA write followed by a read of an overlapping register.
3259 MFMA = nullptr;
3260 WaitStatesSinceDef =
3261 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3262 if (!MFMA)
3263 continue;
3264
 // The scheduling-model latency of the MFMA selects the required
 // wait-state count below.
3265 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3266 int NumPasses = HazardDefLatency;
3267 int NeedWaitStates = MaxWaitStates;
3268
3269 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3270 switch (HazardDefLatency) {
3271 case 4:
3272 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3273 : DMFMA4x4WriteVgprVALUReadWaitStates;
3274 break;
3275 case 8:
3276 case 16:
3277 NeedWaitStates =
3278 IsMemOrExport
3279 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3280 : (ST.hasGFX950Insts()
3281 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3282 : DMFMA16x16WriteVgprVALUReadWaitStates);
3283 break;
3284 default:
3285 llvm_unreachable("unexpected dgemm");
3286 }
3287 } else if (ST.hasGFX940Insts()) {
 // NOTE(review): the names of the two helpers called in this conditional
 // are missing from this listing (the embedded numbering skips two lines).
3288 NeedWaitStates =
3289 TII.isXDL(*MFMA)
3291 NumPasses, ST.hasGFX950Insts())
3293 NumPasses);
3294 } else {
3295 switch (HazardDefLatency) {
3296 case 2:
3297 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3298 break;
3299 case 8:
3300 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3301 break;
3302 case 16:
3303 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3304 break;
3305 default:
3306 llvm_unreachable("unexpected number of passes for mfma");
3307 }
3308 }
3309
3310 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3311 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3312
 // Nothing can exceed MaxWaitStates, so stop scanning uses.
3313 if (WaitStatesNeeded == MaxWaitStates)
3314 break;
3315 }
3316 }
3317
 // Part 2: the listed F64 FMA/FMAC opcodes are checked against an earlier
 // DGEMM. Skipped when at least DMFMAToFMA64WaitStates are already required.
3318 unsigned Opc = MI->getOpcode();
3319 const int DMFMAToFMA64WaitStates = 2;
3320 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3321 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3322 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3323 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3324 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3325 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3326 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3327 }
3328
3329 if (!IsVALU && !IsMemOrExport)
3330 return WaitStatesNeeded;
3331
 // Part 3: hazards on the registers written by MI.
3332 for (const MachineOperand &Def : MI->defs()) {
3333 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3334 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3335 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3336 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3337 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3338 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3339 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3340 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3341 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3342 const int DotWriteDifferentVALUWrite = 3;
3343 const int MaxWaitStates = 19;
3344 const int MaxWarWaitStates = 15;
3345
3346 Reg = Def.getReg();
3347
 // Earlier DOT write of the same register by a different opcode.
3348 DOT = nullptr;
3349 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
3350 MaxWaitStates);
3351 if (DOT && DOT->getOpcode() != MI->getOpcode())
3352 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3353 WaitStatesSinceDef);
3354
 // Earlier MFMA write of an overlapping register.
3355 MFMA = nullptr;
3356 WaitStatesSinceDef =
3357 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
3358 if (MFMA) {
3359 int NeedWaitStates = MaxWaitStates;
3360 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
3361
3362 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3363 switch (NumPasses) {
3364 case 4:
3365 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3366 break;
3367 case 8:
3368 case 16:
3369 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3370 break;
3371 default:
3372 llvm_unreachable("unexpected number of cycles for dgemm");
3373 }
3374 } else if (ST.hasGFX940Insts()) {
 // NOTE(review): the helper names and the false branch of this conditional
 // are missing from this listing (the embedded numbering skips two lines).
3375 NeedWaitStates =
3376 TII.isXDL(*MFMA)
3378 NumPasses, ST.hasGFX950Insts())
3380 } else {
3381 switch (NumPasses) {
3382 case 2:
3383 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3384 break;
3385 case 8:
3386 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3387 break;
3388 case 16:
3389 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3390 break;
3391 default:
3392 llvm_unreachable("Unexpected number of passes for mfma");
3393 }
3394 }
3395
3396 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3397 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3398
3399 if (WaitStatesNeeded == MaxWaitStates)
3400 break;
3401 }
3402
 // Matches a non-DGEMM MFMA whose src2 operand overlaps Reg; on gfx940+
 // only XDL instructions are considered.
3403 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3404 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
3405 !MI.readsRegister(Reg, &TRI))
3406 return false;
3407
3408 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3409 return false;
3410
3411 const MachineOperand *SrcC =
3412 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
3413 assert(SrcC);
3414 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
3415 return false;
3416
3417 MFMA = &MI;
3418 return true;
3419 };
3420
3421 MFMA = nullptr;
3422 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3423 MaxWarWaitStates);
3424 if (!MFMA)
3425 continue;
3426
 // Latencies other than 2, 4 and 8 fall through to the 32x32 value.
3427 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
3428 int NeedWaitStates = MaxWaitStates;
3429 switch (HazardDefLatency) {
3430 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3431 break;
3432 case 4: assert(ST.hasGFX940Insts());
3433 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3434 break;
3435 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3436 break;
3437 case 16: [[fallthrough]];
3438 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3439 break;
3440 }
3441
3442 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3443 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3444 }
3445
3446 return WaitStatesNeeded;
3447}
3448
// NOTE(review): the signature line of this function is missing from this
// listing. From the body: it takes a scheduling unit `SU` and returns a bool.
// It returns true only when SU's instruction satisfies IsMFMAFn and the
// distance reported by getWaitStatesSince (limit 16) to a matching earlier
// instruction is smaller than that instruction's scheduling-model latency.
3450 if (!SU->isInstr())
3451 return false;
3452
3453 const MachineInstr *MAI = nullptr;
3454
 // Predicate that records its match in MAI (reset on every call).
 // NOTE(review): the condition guarding `MAI = &MI;` is missing from this
 // listing (the embedded numbering skips a line).
3455 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3456 MAI = nullptr;
3458 MAI = &MI;
3459 return MAI != nullptr;
3460 };
3461
3462 MachineInstr *MI = SU->getInstr();
3463 if (IsMFMAFn(*MI)) {
 // The search re-runs IsMFMAFn, so afterwards MAI reflects the last
 // instruction the search tested, not SU's own instruction.
3464 int W = getWaitStatesSince(IsMFMAFn, 16);
3465 if (MAI)
3466 return W < (int)TSchedModel.computeInstrLatency(MAI);
3467 }
3468
3469 return false;
3470}
3471
3472// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3473// insertion of a new instruction.
3474static void updateGetPCBundle(MachineInstr *NewMI) {
3475 if (!NewMI->isBundled())
3476 return;
3477
3478 // Find start of bundle.
3479 auto I = NewMI->getIterator();
3480 while (I->isBundledWithPred())
3481 I--;
3482 if (I->isBundle())
3483 I++;
3484
3485 // Bail if this is not an S_GETPC bundle.
3486 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3487 return;
3488
3489 // Update offsets of any references in the bundle.
3490 const unsigned NewBytes = 4;
3491 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3492 "Unexpected instruction insertion in bundle");
3493 auto NextMI = std::next(NewMI->getIterator());
3494 auto End = NewMI->getParent()->end();
3495 while (NextMI != End && NextMI->isBundledWithPred()) {
3496 for (auto &Operand : NextMI->operands()) {
3497 if (Operand.isGlobal())
3498 Operand.setOffset(Operand.getOffset() + NewBytes);
3499 }
3500 NextMI++;
3501 }
3502}
3503
// Inserts an S_WAITCNT_DEPCTR after \p MI when \p MI (a SALU or VALU
// instruction) writes an SGPR or VCC for which hasHazard() reports a hazard.
// Earlier S_WAITCNT_DEPCTR instructions recorded during the search are merged
// into the new wait and erased. Returns true if a wait was inserted.
// Returns false when the subtarget lacks the hazard, is not wave64, \p MI is
// neither SALU nor VALU, no hazardous def is found, or no hazard is reported.
3504bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
3505 if (!ST.hasVALUMaskWriteHazard())
3506 return false;
3507 assert(!ST.hasExtendedWaitCounts());
3508
3509 if (!ST.isWave64())
3510 return false;
3511
3512 const bool IsSALU = SIInstrInfo::isSALU(*MI);
3513 const bool IsVALU = SIInstrInfo::isVALU(*MI, /*AllowLDSDMA=*/true);
3514 if (!IsSALU && !IsVALU)
3515 return false;
3516
3517 // The hazard sequence is three instructions:
3518 // 1. VALU reads SGPR as mask
3519 // 2. VALU/SALU writes SGPR
3520 // 3. VALU/SALU reads SGPR
3521 // The hazard can expire if the distance between 2 and 3 is sufficient,
3522 // or (2) is VALU and (3) is SALU.
3523 // In practice this happens <10% of the time, hence always assume the hazard
3524 // exists if (1) and (2) are present to avoid searching all SGPR reads.
3525
3526 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3527 const MachineRegisterInfo &MRI = MF.getRegInfo();
3528
 // Registers that are never tracked as hazard sources or destinations.
3529 auto IgnoreableSGPR = [](const Register Reg) {
3530 switch (Reg) {
3531 case AMDGPU::EXEC:
3532 case AMDGPU::EXEC_LO:
3533 case AMDGPU::EXEC_HI:
3534 case AMDGPU::M0:
3535 case AMDGPU::SGPR_NULL:
3536 case AMDGPU::SGPR_NULL64:
3537 case AMDGPU::SCC:
3538 return true;
3539 default:
3540 return false;
3541 }
3542 };
3543 auto IsVCC = [](const Register Reg) {
3544 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3545 };
3546
 // Search state handed to hasHazard(): the set of SGPRs still being tracked.
3547 struct StateType {
3548 SmallSet<Register, 2> HazardSGPRs;
3549
3550 static unsigned getHashValue(const StateType &State) {
3551 return hash_combine_range(State.HazardSGPRs);
3552 }
3553 static bool isEqual(const StateType &LHS, const StateType &RHS) {
3554 return LHS.HazardSGPRs == RHS.HazardSGPRs;
3555 }
3556 };
3557
3558 SmallVector<const MachineInstr *> WaitInstrs;
3559 bool HasSGPRRead = false;
3560 StateType InitialState;
3561
3562 // Look for SGPR write.
 // Only the first qualifying def is kept; non-VCC registers must be explicit
 // SGPR operands.
3563 MachineOperand *HazardDef = nullptr;
3564 for (MachineOperand &Op : MI->operands()) {
3565 if (!Op.isReg())
3566 continue;
3567 if (Op.isDef() && HazardDef)
3568 continue;
3569
3570 Register Reg = Op.getReg();
3571 if (IgnoreableSGPR(Reg))
3572 continue;
3573 if (!IsVCC(Reg)) {
3574 if (Op.isImplicit())
3575 continue;
3576 if (!TRI->isSGPRReg(MRI, Reg))
3577 continue;
3578 }
3579 // Also check for SGPR reads.
3580 if (Op.isUse()) {
3581 HasSGPRRead = true;
3582 continue;
3583 }
3584
3585 assert(!HazardDef);
3586 HazardDef = &Op;
3587 }
3588
3589 if (!HazardDef)
3590 return false;
3591
3592 // Setup to track writes to individual SGPRs
 // A 64-bit register is tracked as its sub0 and sub1 halves.
3593 const Register HazardReg = HazardDef->getReg();
3594 if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
3595 InitialState.HazardSGPRs.insert(HazardReg);
3596 } else {
3597 assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
3598 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
3599 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
3600 }
3601
 // Classifies an instruction visited by the search: the hazard has expired
 // once no SGPR is tracked; otherwise a hazard is found when a listed
 // mask-reading VALU reads HazardReg (implicitly via VCC for the e32/dpp
 // forms, via src2 for the e64 forms).
3602 auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
3603 if (State.HazardSGPRs.empty())
3604 return HazardExpired;
3605
3606 switch (I.getOpcode()) {
3607 case AMDGPU::V_ADDC_U32_e32:
3608 case AMDGPU::V_ADDC_U32_dpp:
3609 case AMDGPU::V_CNDMASK_B16_t16_e32:
3610 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3611 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3612 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3613 case AMDGPU::V_CNDMASK_B32_e32:
3614 case AMDGPU::V_CNDMASK_B32_dpp:
3615 case AMDGPU::V_DIV_FMAS_F32_e64:
3616 case AMDGPU::V_DIV_FMAS_F64_e64:
3617 case AMDGPU::V_SUBB_U32_e32:
3618 case AMDGPU::V_SUBB_U32_dpp:
3619 case AMDGPU::V_SUBBREV_U32_e32:
3620 case AMDGPU::V_SUBBREV_U32_dpp: {
3621 // These implicitly read VCC as mask source.
3622 return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
3623 }
3624 case AMDGPU::V_ADDC_U32_e64:
3625 case AMDGPU::V_ADDC_U32_e64_dpp:
3626 case AMDGPU::V_CNDMASK_B16_t16_e64:
3627 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3628 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3629 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3630 case AMDGPU::V_CNDMASK_B32_e64:
3631 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3632 case AMDGPU::V_SUBB_U32_e64:
3633 case AMDGPU::V_SUBB_U32_e64_dpp:
3634 case AMDGPU::V_SUBBREV_U32_e64:
3635 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3636 // Only check mask register overlaps.
3637 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
3638 assert(SSRCOp);
3639 bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
3640 return Result ? HazardFound : NoHazardFound;
3641 }
3642 default:
3643 return NoHazardFound;
3644 }
3645 };
3646
 // NOTE(review): part of this initializer is missing from this listing (the
 // embedded numbering skips a line between the first and second lines).
3647 const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
3649 0),
3650 0);
 // Updates the search state for each visited instruction: records mergeable
 // waits, notes SGPR reads, and drops tracked SGPRs that get overwritten.
3651 auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
3652 switch (I.getOpcode()) {
3653 case AMDGPU::S_WAITCNT_DEPCTR:
3654 // Record mergeable waits within region of instructions free of SGPR reads.
3655 if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
3656 (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3657 WaitInstrs.push_back(&I);
3658 break;
3659 default:
3660 // Update tracking of SGPR reads and writes.
3661 for (auto &Op : I.operands()) {
3662 if (!Op.isReg())
3663 continue;
3664
3665 Register Reg = Op.getReg();
3666 if (IgnoreableSGPR(Reg))
3667 continue;
3668 if (!IsVCC(Reg)) {
3669 if (Op.isImplicit())
3670 continue;
3671 if (!TRI->isSGPRReg(MRI, Reg))
3672 continue;
3673 }
3674 if (Op.isUse()) {
3675 HasSGPRRead = true;
3676 continue;
3677 }
3678
3679 // Stop tracking any SGPRs with writes on the basis that they will
3680 // already have an appropriate wait inserted afterwards.
 // NOTE(review): the declaration of `Found` is missing from this listing
 // (the embedded numbering skips a line here).
3682 for (Register SGPR : State.HazardSGPRs) {
3683 if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
3684 Found.push_back(SGPR);
3685 }
3686 for (Register SGPR : Found)
3687 State.HazardSGPRs.erase(SGPR);
3688 }
3689 break;
3690 }
3691 };
3692
3693 // Check for hazard
 // The search starts at the instruction before MI, in MI's block.
3694 if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
3695 MI->getParent(),
3696 std::next(MI->getReverseIterator())))
3697 return false;
3698
3699 // Compute counter mask
 // VALU writers use the va_vcc or va_sdst field; SALU writers use sa_sdst.
3700 unsigned DepCtr =
3701 IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
3702 : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
3703 : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);
3704
3705 // Try to merge previous waits into this one for regions with no SGPR reads.
3706 if (!WaitInstrs.empty()) {
3707 // Note: WaitInstrs contains const pointers, so walk backward from MI to
3708 // obtain a mutable pointer to each instruction to be merged.
3709 // This is expected to be a very short walk within the same block.
3710 SmallVector<MachineInstr *> ToErase;
3711 unsigned Found = 0;
3712 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3713 End = MI->getParent()->rend();
3714 Found < WaitInstrs.size() && It != End; ++It) {
3715 MachineInstr *WaitMI = &*It;
3716 // Find next wait instruction.
3717 if (std::as_const(WaitMI) != WaitInstrs[Found])
3718 continue;
3719 Found++;
 // Merge field by field, keeping the smaller value of each field.
3720 unsigned WaitMask = WaitMI->getOperand(0).getImm();
3721 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3722 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3723 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
3724 AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
3725 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3726 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
3727 AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
3728 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3729 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
3730 AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
3731 ToErase.push_back(WaitMI);
3732 }
3733 assert(Found == WaitInstrs.size());
3734 for (MachineInstr *WaitMI : ToErase)
3735 WaitMI->eraseFromParent();
3736 }
3737
3738 // Add s_waitcnt_depctr after SGPR write.
3739 auto NextMI = std::next(MI->getIterator());
3740 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
3741 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3742 .addImm(DepCtr);
3743
3744 // SALU write may be s_getpc in a bundle.
3745 updateGetPCBundle(NewMI);
3746
3747 return true;
3748}
3749
3750static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3751 const SIInstrInfo &TII) {
3752 MachineBasicBlock &EntryMBB = MF->front();
3753 if (EntryMBB.begin() != EntryMBB.end()) {
3754 auto &EntryMI = *EntryMBB.begin();
3755 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3756 EntryMI.getOperand(0).getImm() >= Priority)
3757 return false;
3758 }
3759
3760 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3761 .addImm(Priority);
3762 return true;
3763}
3764
// Applies the export-priority workaround around \p MI on subtargets that
// require it. Depending on the opcode of \p MI this
//  - ensures an entry S_SETPRIO when a program-end instruction is seen in a
//    function with calls,
//  - raises the immediate of an existing S_SETPRIO that is below
//    NormalPriority, or
//  - after the last export of a sequence, inserts a priority-lowering
//    sequence (S_SETPRIO 0, optional wait, two S_NOPs, optional S_SETPRIO 2).
// Returns true if any instruction was inserted or modified.
3765bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3766 if (!ST.hasRequiredExportPriority())
3767 return false;
3768
3769 // Assume the following shader types will never have exports,
3770 // and avoid adding or adjusting S_SETPRIO.
3771 MachineBasicBlock *MBB = MI->getParent();
3772 MachineFunction *MF = MBB->getParent();
3773 auto CC = MF->getFunction().getCallingConv();
 // NOTE(review): the case labels of this switch are missing from this listing
 // (the embedded numbering skips four lines before `return false`).
3774 switch (CC) {
3779 return false;
3780 default:
3781 break;
3782 }
3783
3784 const int MaxPriority = 3;
3785 const int NormalPriority = 2;
3786 const int PostExportPriority = 0;
3787
3788 auto It = MI->getIterator();
3789 switch (MI->getOpcode()) {
3790 case AMDGPU::S_ENDPGM:
3791 case AMDGPU::S_ENDPGM_SAVED:
3792 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3793 case AMDGPU::SI_RETURN_TO_EPILOG:
3794 // Ensure shader with calls raises priority at entry.
3795 // This ensures correct priority if exports exist in callee.
3796 if (MF->getFrameInfo().hasCalls())
3797 return ensureEntrySetPrio(MF, NormalPriority, TII);
3798 return false;
3799 case AMDGPU::S_SETPRIO: {
3800 // Raise minimum priority unless in workaround.
 // "In workaround" means: priority 0 immediately following an export.
3801 auto &PrioOp = MI->getOperand(0);
3802 int Prio = PrioOp.getImm();
3803 bool InWA = (Prio == PostExportPriority) &&
3804 (It != MBB->begin() && TII.isEXP(*std::prev(It)));
3805 if (InWA || Prio >= NormalPriority)
3806 return false;
3807 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3808 return true;
3809 }
3810 default:
 // Everything below applies to export instructions only.
3811 if (!TII.isEXP(*MI))
3812 return false;
3813 break;
3814 }
3815
3816 // Check entry priority at each export (as there will only be a few).
3817 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
 // NOTE(review): the condition guarding the assignment below is missing from
 // this listing (the embedded numbering skips a line).
3818 bool Changed = false;
3820 Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
3821
3822 auto NextMI = std::next(It);
3823 bool EndOfShader = false;
3824 if (NextMI != MBB->end()) {
3825 // Only need WA at end of sequence of exports.
3826 if (TII.isEXP(*NextMI))
3827 return Changed;
3828 // Assume appropriate S_SETPRIO after export means WA already applied.
3829 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3830 NextMI->getOperand(0).getImm() == PostExportPriority)
3831 return Changed;
3832 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3833 }
3834
3835 const DebugLoc &DL = MI->getDebugLoc();
3836
 // All instructions below are inserted before NextMI, i.e. directly after the
 // export, in the order they are built.
3837 // Lower priority.
3838 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3839 .addImm(PostExportPriority);
3840
3841 if (!EndOfShader) {
3842 // Wait for exports to complete.
3843 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3844 .addReg(AMDGPU::SGPR_NULL)
3845 .addImm(0);
3846 }
3847
3848 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3849 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3850
3851 if (!EndOfShader) {
3852 // Return to normal (higher) priority.
3853 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
3854 .addImm(NormalPriority);
3855 }
3856
3857 return true;
3858}
3859
// For an s_getreg instruction that reads one of a selected set of hardware
// registers, inserts `S_WAITCNT_DEPCTR 0` immediately before \p MI and
// returns true. Returns false for any other instruction or hardware register.
3860bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
3861 if (!isSGetReg(MI->getOpcode()))
3862 return false;
3863
3864 const SIInstrInfo *TII = ST.getInstrInfo();
 // NOTE(review): the case labels naming the affected hardware registers are
 // missing from this listing (the embedded numbering skips four lines).
3865 switch (getHWReg(TII, *MI)) {
3866 default:
3867 return false;
3872 break;
3873 }
3874
3875 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3876 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3877 .addImm(0);
3878 return true;
3879}
3880
// Surrounds a DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 with S_WAITCNT_DEPCTR
// instructions: one inserted before \p MI and one inserted after it.
// Returns true if the instructions were inserted, false for other opcodes.
3881bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3882 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3883 return false;
3884
3885 const SIInstrInfo *TII = ST.getInstrInfo();
 // NOTE(review): the operand line that completes each of the two BuildMI
 // statements below is missing from this listing (the embedded numbering
 // skips one line after each).
3886 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3887 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3889 BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
3890 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3892
3893 return true;
3894}
3895
// Inserts an S_WAITCNT_DEPCTR before \p MI when \p MI reads the flat scratch
// base (lo and/or hi) and the corresponding SGPR (SGPR102 for lo, SGPR103 for
// hi) is not a constant physical register and was modified recently enough
// that fewer than FlatScrBaseWaitStates SGPR writes have happened since.
// Only active in hazard recognizer mode. Returns true if a wait was inserted.
3896bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
3897 // No reason to check this in pre-RA scheduling, SGPRs have to be allocated
3898 // for hazard to trigger.
3899 if (!IsHazardRecognizerMode)
3900 return false;
3901
3902 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3903 const SIInstrInfo *TII = ST.getInstrInfo();
3904 // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
3905 const int FlatScrBaseWaitStates = 10;
3906
 // A read is detected either as a direct register read or, for s_getreg,
 // through the hardware register it names.
3907 bool ReadsFlatScrLo =
3908 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3909 bool ReadsFlatScrHi =
3910 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3911 if (isSGetReg(MI->getOpcode())) {
 // NOTE(review): the two case labels of this switch are missing from this
 // listing (the embedded numbering skips a line before each assignment).
3912 switch (getHWReg(TII, *MI)) {
3913 default:
3914 break;
3916 ReadsFlatScrLo = true;
3917 break;
3919 ReadsFlatScrHi = true;
3920 break;
3921 }
3922 }
3923
3924 const MachineRegisterInfo &MRI = MF.getRegInfo();
3925
 // Returns true when the search below finds an instruction modifying Reg
 // before FlatScrBaseWaitStates SGPR writes have been counted.
3926 auto IsRegDefHazard = [&](Register Reg) -> bool {
3927 DenseSet<const MachineBasicBlock *> Visited;
3928 auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
3929 return MI.modifiesRegister(Reg, TRI);
3930 };
3931
3932 // This literally abuses the idea of waitstates. Instead of waitstates it
3933 // returns 1 for SGPR written and 0 otherwise.
3934 auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
3935 if (!TII->isSALU(MI) && !TII->isVALU(MI, /*AllowLDSDMA=*/true))
3936 return 0;
3937 for (const MachineOperand &MO : MI.all_defs()) {
3938 if (TRI->isSGPRReg(MRI, MO.getReg()))
3939 return 1;
3940 }
3941 return 0;
3942 };
3943
 // NOTE(review): the condition on `Wait` that guards `return true` is missing
 // from this listing (the embedded numbering skips two lines).
3944 auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
3945 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3946 unsigned Wait = MI.getOperand(0).getImm();
3949 return true;
3950 }
3951 return SgprWrites >= FlatScrBaseWaitStates;
3952 };
3953
3954 return ::getWaitStatesSince(
3955 IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
3956 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3957 };
3958
 // No fix is needed when, for both halves, the half is either not read, its
 // SGPR is constant, or no recent def of that SGPR is found.
3959 if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
3960 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3961 (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
3962 !IsRegDefHazard(AMDGPU::SGPR103)))
3963 return false;
3964
 // NOTE(review): the operand lines completing this BuildMI statement are
 // missing from this listing (the embedded numbering skips two lines).
3965 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3966 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3969 return true;
3970}
3971
3972bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3973 if (!isSSetReg(MI->getOpcode()) ||
3974 MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
3975 return false;
3976
3977 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3978 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3979 return true;
3980}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static cl::opt< bool > EnableWMMAVnopHoisting("amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden, cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"))
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel forwarding issue Dst.
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static unsigned getWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, const TargetSchedModel &SchedModel, const GCNSubtarget &ST)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static llvm::Error parse(GsymDataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
Definition LineTable.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:484
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
unsigned get(InstCounterType T) const
BitVector & set()
Set all bits in the bitvector.
Definition BitVector.h:366
A debug info location.
Definition DebugLoc.h:124
std::pair< iterator, bool > insert_as(std::pair< KeyT, ValueT > &&KV, const LookupKeyT &Val)
Alternate version of insert() which allows a different, and possibly less expensive,...
Definition DenseMap.h:319
Implements a dense probed hash-table based set.
Definition DenseSet.h:289
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
unsigned getHazardWaitStates(MachineInstr *MI) const
Returns the number of wait states until all hazards for MI are resolved.
unsigned PreEmitNoopsCommon(MachineInstr *) const
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be scheduled.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
bool ShouldPreferAnother(SUnit *SU) const override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
GCNHazardRecognizer(const MachineFunction &MF, MachineLoopInfo *MLI=nullptr)
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
Instructions::const_reverse_iterator const_reverse_instr_iterator
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction is part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI, bool AllowLDSDMA)
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isMAI(const MCInstrDesc &Desc)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:490
Provide an instruction scheduling machine model to CodeGen passes.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
@ Entry
Definition COFF.h:862
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
This namespace contains all of the command line option processing machinery.
Definition MCSchedule.h:35
initializer< Ty > init(const Ty &Val)
constexpr double e
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Wait
Definition Threading.h:60
constexpr RegState getDeadRegState(bool B)
Op::Description Desc
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
DWARFExpression::Operation Op
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:305
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:285
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
An information struct used to provide DenseMap with the various necessary components for a given valu...