LLVM 23.0.0git
AMDGPUCoExecSchedStrategy.cpp
Go to the documentation of this file.
1//===- AMDGPUCoExecSchedStrategy.cpp - CoExec Scheduling Strategy ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Coexecution-focused scheduling strategy for AMDGPU.
11//
12//===----------------------------------------------------------------------===//
13
15#include "llvm/Support/Debug.h"
16
17using namespace llvm;
18using namespace llvm::AMDGPU;
19
20#define DEBUG_TYPE "machine-scheduler"
21
22namespace {
23
24// Used to disable post-RA scheduling with function level granularity.
25class GCNNoopPostScheduleDAG final : public ScheduleDAGInstrs {
26public:
27 explicit GCNNoopPostScheduleDAG(MachineSchedContext *C)
28 : ScheduleDAGInstrs(*C->MF, C->MLI, /*RemoveKillFlags=*/true) {}
29
30 // Do nothing.
31 void schedule() override {}
32};
33
34} // namespace
35
37 // pickOnlyChoice() releases pending instructions and checks for new hazards.
38 SUnit *OnlyChoice = Zone.pickOnlyChoice();
39 if (!Zone.Pending.empty())
40 return nullptr;
41
42 return OnlyChoice;
43}
44
46 const SIInstrInfo &SII) {
47 if (MI.isDebugInstr())
49
50 unsigned Opc = MI.getOpcode();
51
52 // Check for specific opcodes first.
53 if (Opc == AMDGPU::ATOMIC_FENCE || Opc == AMDGPU::S_WAIT_ASYNCCNT ||
54 Opc == AMDGPU::S_WAIT_TENSORCNT || Opc == AMDGPU::S_BARRIER_WAIT ||
55 Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
57
58 if (SII.isLDSDMA(MI))
60
61 if (SII.isMFMAorWMMA(MI))
63
64 if (SII.isTRANS(MI))
66
67 if (SII.isVALU(MI))
69
70 if (SII.isDS(MI))
72
73 if (SII.isFLAT(MI) || SII.isFLATGlobal(MI) || SII.isFLATScratch(MI))
75
76 if (SII.isSALU(MI))
78
80}
81
83 for (SUnit *PrioritySU : PrioritySUs) {
84 if (!PrioritySU->isTopReady())
85 return PrioritySU;
86 }
87
88 if (!LookDeep)
89 return nullptr;
90
91 unsigned MinDepth = std::numeric_limits<unsigned int>::max();
92 SUnit *TargetSU = nullptr;
93 for (auto *SU : AllSUs) {
94 if (SU->isScheduled)
95 continue;
96
97 if (SU->isTopReady())
98 continue;
99
100 if (SU->getDepth() < MinDepth) {
101 MinDepth = SU->getDepth();
102 TargetSU = SU;
103 }
104 }
105 return TargetSU;
106}
107
108void HardwareUnitInfo::insert(SUnit *SU, unsigned BlockingCycles) {
109 if (!AllSUs.insert(SU))
110 llvm_unreachable("HardwareUnit already contains SU!");
111
112 TotalCycles += BlockingCycles;
113
114 if (PrioritySUs.empty()) {
115 PrioritySUs.insert(SU);
116 return;
117 }
118 unsigned SUDepth = SU->getDepth();
119 unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
120 if (SUDepth > CurrDepth)
121 return;
122
123 if (SUDepth == CurrDepth) {
124 PrioritySUs.insert(SU);
125 return;
126 }
127
128 // SU is lower depth and should be prioritized.
129 PrioritySUs.clear();
130 PrioritySUs.insert(SU);
131}
132
133void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
134 // We may want to ignore some HWUIs (e.g. InstructionFlavor::Other). To do so,
135 // we just clear the HWUI. However, we still have instructions which map to
136 // this HWUI. Don't bother managing the state for these HWUI.
137 if (TotalCycles == 0)
138 return;
139
140 AllSUs.remove(SU);
141 PrioritySUs.remove(SU);
142
143 TotalCycles -= BlockingCycles;
144
145 if (AllSUs.empty())
146 return;
147 if (PrioritySUs.empty()) {
148 for (auto SU : AllSUs) {
149 if (PrioritySUs.empty()) {
150 PrioritySUs.insert(SU);
151 continue;
152 }
153 unsigned SUDepth = SU->getDepth();
154 unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
155 if (SUDepth > CurrDepth)
156 continue;
157
158 if (SUDepth == CurrDepth) {
159 PrioritySUs.insert(SU);
160 continue;
161 }
162
163 // SU is lower depth and should be prioritized.
164 PrioritySUs.clear();
165 PrioritySUs.insert(SU);
166 }
167 }
168}
169
172 for (HardwareUnitInfo &HWUICand : HWUInfo) {
173 if (HWUICand.getType() == Flavor) {
174 return &HWUICand;
175 }
176 }
177 return nullptr;
178}
179
181 assert(SchedModel && SchedModel->hasInstrSchedModel());
182 unsigned ReleaseAtCycle = 0;
183 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
184 for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC),
185 PE = SchedModel->getWriteProcResEnd(SC);
186 PI != PE; ++PI) {
187 ReleaseAtCycle = std::max(ReleaseAtCycle, (unsigned)PI->ReleaseAtCycle);
188 }
189 return ReleaseAtCycle;
190}
191
198
201 const TargetRegisterInfo *TRI) {
202 DAG = SchedDAG;
204 assert(SchedModel && SchedModel->hasInstrSchedModel());
205
206 SRI = static_cast<const SIRegisterInfo *>(TRI);
207 SII = static_cast<const SIInstrInfo *>(DAG->TII);
208
210
211 for (unsigned I = 0; I < HWUInfo.size(); I++) {
212 HWUInfo[I].reset();
213 HWUInfo[I].setType(I);
214 }
215
216 HWUInfo[(int)InstructionFlavor::WMMA].setProducesCoexecWindow(true);
217 HWUInfo[(int)InstructionFlavor::MultiCycleVALU].setProducesCoexecWindow(true);
218 HWUInfo[(int)InstructionFlavor::TRANS].setProducesCoexecWindow(true);
219
221}
222
224 if (!SchedModel || !SchedModel->hasInstrSchedModel())
225 return;
226
227 for (auto &SU : DAG->SUnits) {
228 const InstructionFlavor Flavor = classifyFlavor(*SU.getInstr(), *SII);
229 HWUInfo[(int)(Flavor)].insert(&SU, getHWUICyclesForInst(&SU));
230 }
231
233}
234
236 MachineBasicBlock *BB = DAG->begin()->getParent();
237 dbgs() << "\n=== Region: " << DAG->MF.getName() << " BB" << BB->getNumber()
238 << " (" << DAG->SUnits.size() << " SUs) ===\n";
239
240 dbgs() << "\nHWUI Resource Pressure:\n";
241 for (auto &HWUI : HWUInfo) {
242 if (HWUI.getTotalCycles() == 0)
243 continue;
244
245 StringRef Name = getFlavorName(HWUI.getType());
246 dbgs() << " " << Name << ": " << HWUI.getTotalCycles() << " cycles, "
247 << HWUI.size() << " instrs\n";
248 }
249 dbgs() << "\n";
250}
251
253 // Highest priority should be first.
255 // Prefer CoexecWindow producers
256 if (A.producesCoexecWindow() != B.producesCoexecWindow())
257 return A.producesCoexecWindow();
258
259 // Prefer more demanded resources
260 if (A.getTotalCycles() != B.getTotalCycles())
261 return A.getTotalCycles() > B.getTotalCycles();
262
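263 // In ties -- the resource with fewer instructions sorts first. NOTE(review): this comment previously said "more instructions", which contradicts the comparison below; confirm which order is intended.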
264 if (A.size() != B.size())
265 return A.size() < B.size();
266
267 // Default to Flavor order
268 return static_cast<unsigned>(A.getType()) <
269 static_cast<unsigned>(B.getType());
270 });
271}
272
276
277 auto HasPrioritySU = [this, &Cand, &TryCand](unsigned ResourceIdx) {
278 const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
279
280 auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII);
281 auto TryCandFlavor = classifyFlavor(*TryCand.SU->getInstr(), *SII);
282 bool LookDeep = (CandFlavor == InstructionFlavor::DS ||
283 TryCandFlavor == InstructionFlavor::DS) &&
285 auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
286
287 // If we do not have a TargetSU for this resource, then it is not critical.
288 if (!TargetSU)
289 return false;
290
291 return true;
292 };
293
294 auto TryEnablesResource = [&Cand, &TryCand, this](unsigned ResourceIdx) {
295 const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
296 auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII);
297
298 // We want to ensure our DS order matches WMMA order.
299 bool LookDeep = CandFlavor == InstructionFlavor::DS &&
301 auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
302
303 bool CandEnables =
304 TargetSU != Cand.SU && DAG->IsReachable(TargetSU, Cand.SU);
305 bool TryCandEnables =
306 TargetSU != TryCand.SU && DAG->IsReachable(TargetSU, TryCand.SU);
307
308 if (!CandEnables && !TryCandEnables)
309 return false;
310
311 if (CandEnables && !TryCandEnables) {
314
315 return true;
316 }
317
318 if (!CandEnables && TryCandEnables) {
320 return true;
321 }
322
323 // Both enable, prefer the critical path.
324 unsigned CandHeight = Cand.SU->getHeight();
325 unsigned TryCandHeight = TryCand.SU->getHeight();
326
327 if (CandHeight > TryCandHeight) {
330
331 return true;
332 }
333
334 if (CandHeight < TryCandHeight) {
336 return true;
337 }
338
339 // Same critical path, just prefer original candidate.
342
343 return true;
344 };
345
346 for (unsigned I = 0; I < HWUInfo.size(); I++) {
347 // If we have encountered a resource that is not critical, then neither
348 // candidate enables a critical resource
349 if (!HasPrioritySU(I))
350 continue;
351
352 bool Enabled = TryEnablesResource(I);
353 // If neither has enabled the resource, continue to the next resource
354 if (Enabled)
355 return true;
356 }
357 return false;
358}
359
363 for (unsigned I = 0; I < HWUInfo.size(); I++) {
364 const HardwareUnitInfo &HWUI = HWUInfo[I];
365
366 bool CandUsesCrit = HWUI.contains(Cand.SU);
367 bool TryCandUsesCrit = HWUI.contains(TryCand.SU);
368
369 if (!CandUsesCrit && !TryCandUsesCrit)
370 continue;
371
372 if (CandUsesCrit != TryCandUsesCrit) {
373 if (CandUsesCrit) {
376 return true;
377 }
379 return true;
380 }
381
382 // Otherwise, both use the critical resource
383 // For longer latency InstructionFlavors, we should prioritize first by
384 // their enablement of critical resources
385 if (HWUI.getType() == InstructionFlavor::DS) {
386 if (tryCriticalResourceDependency(TryCand, Cand, Zone))
387 return true;
388 }
389
390 // Prioritize based on HWUI priorities.
391 SUnit *Match = HWUI.getHigherPriority(Cand.SU, TryCand.SU);
392 if (Match) {
393 if (Match == Cand.SU) {
396 return true;
397 }
399 return true;
400 }
401 }
402
403 return false;
404}
405
415
418 unsigned NumRegionInstrs) {
422 "coexec scheduler only supports top-down scheduling");
423 RegionPolicy.OnlyTopDown = true;
424 RegionPolicy.OnlyBottomUp = false;
425 RegionPolicy.ShouldTrackLaneMasks = true;
426}
427
429 // Coexecution scheduling strategy is only done top-down to support new
430 // resource balancing heuristics.
431 RegionPolicy.OnlyTopDown = true;
432 RegionPolicy.OnlyBottomUp = false;
433
435 Heurs.initialize(DAG, SchedModel, TRI);
436}
437
439 Heurs.updateForScheduling(SU);
440 GCNSchedStrategy::schedNode(SU, IsTopNode);
441}
442
444 assert(RegionPolicy.OnlyTopDown && !RegionPolicy.OnlyBottomUp &&
445 "coexec scheduler only supports top-down scheduling");
446
447 if (DAG->top() == DAG->bottom()) {
448 assert(Top.Available.empty() && Top.Pending.empty() &&
449 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
450 return nullptr;
451 }
452
453 bool PickedPending = false;
454 SUnit *SU = nullptr;
455#ifndef NDEBUG
456 SchedCandidate *PickedCand = nullptr;
457#endif
458 do {
459 PickedPending = false;
460 SU = pickOnlyChoice(Top);
461 if (!SU) {
462 CandPolicy NoPolicy;
463 TopCand.reset(NoPolicy);
464 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
465 PickedPending, /*IsBottomUp=*/false);
466 assert(TopCand.Reason != NoCand && "failed to find a candidate");
467 SU = TopCand.SU;
468#ifndef NDEBUG
469 PickedCand = &TopCand;
470#endif
471 }
472 IsTopNode = true;
473 } while (SU->isScheduled);
474
475 LLVM_DEBUG(if (PickedCand) dumpPickSummary(SU, IsTopNode, *PickedCand));
476
477 if (PickedPending) {
478 unsigned ReadyCycle = SU->TopReadyCycle;
479 unsigned CurrentCycle = Top.getCurrCycle();
480 if (ReadyCycle > CurrentCycle)
481 Top.bumpCycle(ReadyCycle);
482
483 // checkHazard() does not expose the exact cycle where the hazard clears.
484 while (Top.checkHazard(SU))
485 Top.bumpCycle(Top.getCurrCycle() + 1);
486
487 Top.releasePending();
488 }
489
490 if (SU->isTopReady())
491 Top.removeReady(SU);
492 if (SU->isBottomReady())
493 Bot.removeReady(SU);
494
495 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
496 << *SU->getInstr());
497
498 assert(IsTopNode && "coexec scheduler must only schedule from top boundary");
499 return SU;
500}
501
503 SchedBoundary &Zone, const CandPolicy &ZonePolicy,
504 const RegPressureTracker &RPTracker, SchedCandidate &Cand,
505 bool &PickedPending, bool IsBottomUp) {
506 assert(Zone.isTop() && "coexec scheduler only supports top boundary");
507 assert(!IsBottomUp && "coexec scheduler only supports top-down scheduling");
508
509 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
511 unsigned SGPRPressure = 0;
512 unsigned VGPRPressure = 0;
513 PickedPending = false;
514 if (DAG->isTrackingPressure()) {
515 if (!useGCNTrackers()) {
516 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
517 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
518 } else {
519 SGPRPressure = DownwardTracker.getPressure().getSGPRNum();
520 VGPRPressure = DownwardTracker.getPressure().getArchVGPRNum();
521 }
522 }
523
524 auto EvaluateQueue = [&](ReadyQueue &Q, bool FromPending) {
525 for (SUnit *SU : Q) {
526 SchedCandidate TryCand(ZonePolicy);
527 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
528 VGPRPressure, IsBottomUp);
529 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
530 tryCandidateCoexec(Cand, TryCand, ZoneArg);
531 if (TryCand.Reason != NoCand) {
532 if (TryCand.ResDelta == SchedResourceDelta())
533 TryCand.initResourceDelta(Zone.DAG, SchedModel);
534 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
535 PickedPending = FromPending;
536 Cand.setBest(TryCand);
537 } else {
538 LLVM_DEBUG(printCandidateDecision(TryCand, Cand));
539 }
540 }
541 };
542
543 LLVM_DEBUG(dbgs() << "Available Q:\n");
544 EvaluateQueue(Zone.Available, /*FromPending=*/false);
545
546 LLVM_DEBUG(dbgs() << "Pending Q:\n");
547 EvaluateQueue(Zone.Pending, /*FromPending=*/true);
548}
549
550#ifndef NDEBUG
552 SchedCandidate &Cand) {
553 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
554 unsigned Cycle = IsTopNode ? Top.getCurrCycle() : Bot.getCurrCycle();
555
556 dbgs() << "=== Pick @ Cycle " << Cycle << " ===\n";
557
558 const InstructionFlavor Flavor = classifyFlavor(*SU->getInstr(), *SII);
559 dbgs() << "Picked: SU(" << SU->NodeNum << ") ";
560 SU->getInstr()->print(dbgs(), /*IsStandalone=*/true, /*SkipOpers=*/false,
561 /*SkipDebugLoc=*/true);
562 dbgs() << " [" << getFlavorName(Flavor) << "]\n";
563
564 dbgs() << " Reason: ";
567 else if (Cand.Reason != NoCand)
569 else
570 dbgs() << "Unknown";
571 dbgs() << "\n\n";
572
574}
575#endif
576
578 SchedCandidate &TryCand,
579 SchedBoundary *Zone) {
580 // Initialize the candidate if needed.
581 if (!Cand.isValid()) {
582 TryCand.Reason = FirstValid;
583 return true;
584 }
585
586 // Bias PhysReg Defs and copies to their uses and defined respectively.
587 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
588 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
589 return TryCand.Reason != NoCand;
590
591 // Avoid exceeding the target's limit.
592 if (DAG->isTrackingPressure() &&
593 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
594 RegExcess, TRI, DAG->MF))
595 return TryCand.Reason != NoCand;
596
597 // We only compare a subset of features when comparing nodes between
598 // Top and Bottom boundary. Some properties are simply incomparable, in many
599 // other instances we should only override the other boundary if something
600 // is a clear good pick on one boundary. Skip heuristics that are more
601 // "tie-breaking" in nature.
602 bool SameBoundary = Zone != nullptr;
603 if (SameBoundary) {
604 // Compare candidates by the stall they would introduce if
605 // scheduled in the current cycle.
606 if (tryEffectiveStall(Cand, TryCand, *Zone))
607 return TryCand.Reason != NoCand;
608
609 Heurs.sortHWUIResources();
610 if (Heurs.tryCriticalResource(TryCand, Cand, Zone)) {
612 return TryCand.Reason != NoCand;
613 }
614
615 if (Heurs.tryCriticalResourceDependency(TryCand, Cand, Zone)) {
617 return TryCand.Reason != NoCand;
618 }
619 }
620
621 // Keep clustered nodes together to encourage downstream peephole
622 // optimizations which may reduce resource requirements.
623 //
624 // This is a best effort to set things up for a post-RA pass. Optimizations
625 // like generating loads of multiple registers should ideally be done within
626 // the scheduler pass by combining the loads during DAG postprocessing.
627 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
628 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
629 bool CandIsClusterSucc =
630 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
631 bool TryCandIsClusterSucc =
632 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
633
634 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
635 Cluster))
636 return TryCand.Reason != NoCand;
637
638 if (SameBoundary) {
639 // Weak edges are for clustering and other constraints.
640 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
641 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
642 return TryCand.Reason != NoCand;
643 }
644
645 // Avoid increasing the max pressure of the entire region.
646 if (DAG->isTrackingPressure() &&
647 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
648 Cand, RegMax, TRI, DAG->MF))
649 return TryCand.Reason != NoCand;
650
651 if (SameBoundary) {
652 // Avoid serializing long latency dependence chains.
653 // For acyclic path limited loops, latency was already checked above.
654 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
655 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
656 return TryCand.Reason != NoCand;
657
658 // Fall through to original instruction order.
659 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
660 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
661 TryCand.Reason = NodeOrder;
662 return true;
663 }
664 }
665
666 return false;
667}
668
670 SchedCandidate &TryCand,
671 SchedBoundary &Zone) const {
672 // Treat structural and latency stalls as a single scheduling cost for the
673 // current cycle.
674 struct StallCosts {
675 unsigned Ready = 0;
676 unsigned Structural = 0;
677 unsigned Latency = 0;
678 unsigned Effective = 0;
679 };
680
681 unsigned CurrCycle = Zone.getCurrCycle();
682 auto GetStallCosts = [&](SUnit *SU) {
683 unsigned ReadyCycle = Zone.isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
684 StallCosts Costs;
685 Costs.Ready = ReadyCycle > CurrCycle ? ReadyCycle - CurrCycle : 0;
686 Costs.Structural = getStructuralStallCycles(Zone, SU);
687 Costs.Latency = Zone.getLatencyStallCycles(SU);
688 Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency});
689 return Costs;
690 };
691
692 StallCosts TryCosts = GetStallCosts(TryCand.SU);
693 StallCosts CandCosts = GetStallCosts(Cand.SU);
694
695 LLVM_DEBUG(if (TryCosts.Effective || CandCosts.Effective) {
696 dbgs() << "Effective stalls: try=" << TryCosts.Effective
697 << " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
698 << ", lat=" << TryCosts.Latency << ") cand=" << CandCosts.Effective
699 << " (ready=" << CandCosts.Ready
700 << ", struct=" << CandCosts.Structural
701 << ", lat=" << CandCosts.Latency << ")\n";
702 });
703
704 return tryLess(TryCosts.Effective, CandCosts.Effective, TryCand, Cand, Stall);
705}
706
709 LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
710 << C->MF->getName() << '\n');
711 return new GCNScheduleDAGMILive(
712 C, std::make_unique<AMDGPUCoExecSchedStrategy>(C));
713}
714
717 LLVM_DEBUG(dbgs() << "AMDGPU nop postRA scheduler selected for "
718 << C->MF->getName() << '\n');
719 return new GCNNoopPostScheduleDAG(C);
720}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SUnit * pickOnlyChoice(SchedBoundary &Zone)
Coexecution-focused scheduling strategy for AMDGPU.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
#define LLVM_DEBUG(...)
Definition Debug.h:119
bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary &Zone) const
void initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned NumRegionInstrs) override
Optionally override the per-region scheduling policy.
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &PickedPending, bool IsBottomUp)
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/remaining flags in the DAG nodes.
AMDGPUCoExecSchedStrategy(const MachineSchedContext *C)
void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand)
bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone)
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
void updateForScheduling(SUnit *SU)
Update the state to reflect that SU is going to be scheduled.
HardwareUnitInfo * getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor)
Given a Flavor , find the corresponding HardwareUnit.
void sortHWUIResources()
Sort the HWUInfo vector.
bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const
Check for critical resource consumption.
bool tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const
Check for dependencies of instructions that use prioritized HardwareUnits.
SmallVector< HardwareUnitInfo, 8 > HWUInfo
const TargetSchedModel * SchedModel
void collectHWUIPressure()
Walk over the region and collect total usage per HardwareUnit.
void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel, const TargetRegisterInfo *TRI)
unsigned getHWUICyclesForInst(SUnit *SU)
Compute the blocking cycles for the appropriate HardwareUnit given an SU.
GCNDownwardRPTracker DownwardTracker
GCNSchedStrategy(const MachineSchedContext *C)
SmallVector< GCNSchedStageID, 4 > SchedStages
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
std::vector< unsigned > Pressure
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred)
unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const
Estimate how many cycles SU must wait due to structural hazards at the current boundary cycle.
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp)
MachineSchedPolicy RegionPolicy
const TargetSchedModel * SchedModel
static const char * getReasonStr(GenericSchedulerBase::CandReason Reason)
const TargetRegisterInfo * TRI
SchedCandidate TopCand
Candidate last picked from Top boundary.
ScheduleDAGMILive * DAG
HardwareUnitInfo is a wrapper class which maps to some real hardware resource.
void markScheduled(SUnit *SU, unsigned BlockingCycles)
Update the state for SU being scheduled by removing it from the AllSUs and reducing its BlockingCycle...
SUnit * getNextTargetSU(bool LookDeep=false) const
void insert(SUnit *SU, unsigned BlockingCycles)
Insert the SU into AllSUs and account its BlockingCycles into the TotalCycles.
AMDGPU::InstructionFlavor getType() const
SUnit * getHigherPriority(SUnit *SU, SUnit *Other) const
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
MachineInstrBundleIterator< MachineInstr > iterator
Representation of each machine instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
virtual void initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned NumRegionInstrs)
Optionally override the per-region scheduling policy.
Helpers for implementing custom MachineSchedStrategy classes.
Track the current register pressure at some position in the instruction stream, and remember the high...
const std::vector< unsigned > & getRegSetPressureAtPos() const
Get the register set pressure at the current position, which may be less than the pressure across the...
static bool isDS(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isMFMAorWMMA(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
Scheduling unit. This is a node in the scheduling DAG.
unsigned TopReadyCycle
Cycle relative to start when node is ready.
unsigned NodeNum
Entry # of node in the node vector.
unsigned getHeight() const
Returns the height of this node, which is the length of the maximum path down to any node which has no successors.
unsigned getDepth() const
Returns the depth of this node, which is the length of the maximum path up to any node which has no predecessors.
bool isScheduled
True once scheduled.
unsigned ParentClusterIdx
The parent cluster id.
bool isBottomReady() const
bool isTopReady() const
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Each Scheduling boundary is associated with ready queues.
LLVM_ABI unsigned getLatencyStallCycles(SUnit *SU)
Get the difference between the given SUnit's ready time and the current cycle.
LLVM_ABI SUnit * pickOnlyChoice()
Call this before applying any other heuristics to the Available queue.
unsigned getCurrCycle() const
Number of cycles to issue the instructions scheduled in this zone.
A ScheduleDAG for scheduling lists of MachineInstr.
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
const MCWriteProcResEntry * ProcResIter
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr StringRef getFlavorName(InstructionFlavor F)
constexpr StringRef getReasonName(AMDGPUSchedReason R)
InstructionFlavor classifyFlavor(const MachineInstr &MI, const SIInstrInfo &SII)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI int biasPhysReg(const SUnit *SU, bool isTop, bool BiasPRegsExtra=false)
Minimize physical register live ranges.
LLVM_ABI unsigned getWeakLeft(const SUnit *SU, bool isTop)
CycleInfo::CycleT Cycle
Definition CycleInfo.h:26
LLVM_ABI bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason, const TargetRegisterInfo *TRI, const MachineFunction &MF)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
ScheduleDAGInstrs * createGCNNoopPostMachineScheduler(MachineSchedContext *C)
LLVM_ABI bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone)
ScheduleDAGInstrs * createGCNCoExecMachineScheduler(MachineSchedContext *C)
bool isTheSameCluster(unsigned A, unsigned B)
Return whether the input cluster ID's are the same and valid.
LLVM_ABI bool tryGreater(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
LLVM_ABI bool tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
Return true if this heuristic determines order.
@ Enabled
Convert any .debug_str_offsets tables to DWARF64 if needed.
Definition DWP.h:31
LLVM_ABI cl::opt< MISched::Direction > PreRADirection
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
LLVM_ABI void initResourceDelta(const ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel)
Status of an instruction's critical resource consumption.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:129
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...