20#define DEBUG_TYPE "machine-scheduler"
31 void schedule()
override {}
47 if (
MI.isDebugInstr())
50 unsigned Opc =
MI.getOpcode();
53 if (
Opc == AMDGPU::ATOMIC_FENCE ||
Opc == AMDGPU::S_WAIT_ASYNCCNT ||
54 Opc == AMDGPU::S_WAIT_TENSORCNT ||
Opc == AMDGPU::S_BARRIER_WAIT ||
55 Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
83 for (
SUnit *PrioritySU : PrioritySUs) {
84 if (!PrioritySU->isTopReady())
91 unsigned MinDepth = std::numeric_limits<unsigned int>::max();
92 SUnit *TargetSU =
nullptr;
93 for (
auto *SU : AllSUs) {
100 if (SU->getDepth() < MinDepth) {
101 MinDepth = SU->getDepth();
109 if (!AllSUs.insert(SU))
112 TotalCycles += BlockingCycles;
114 if (PrioritySUs.empty()) {
115 PrioritySUs.insert(SU);
119 unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
120 if (SUDepth > CurrDepth)
123 if (SUDepth == CurrDepth) {
124 PrioritySUs.insert(SU);
130 PrioritySUs.insert(SU);
137 if (TotalCycles == 0)
141 PrioritySUs.remove(SU);
143 TotalCycles -= BlockingCycles;
147 if (PrioritySUs.empty()) {
148 for (
auto SU : AllSUs) {
149 if (PrioritySUs.empty()) {
150 PrioritySUs.insert(SU);
154 unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
155 if (SUDepth > CurrDepth)
158 if (SUDepth == CurrDepth) {
159 PrioritySUs.insert(SU);
165 PrioritySUs.insert(SU);
173 if (HWUICand.getType() == Flavor) {
182 unsigned ReleaseAtCycle = 0;
187 ReleaseAtCycle = std::max(ReleaseAtCycle, (
unsigned)PI->ReleaseAtCycle);
189 return ReleaseAtCycle;
211 for (
unsigned I = 0;
I <
HWUInfo.size();
I++) {
227 for (
auto &SU :
DAG->SUnits) {
238 <<
" (" <<
DAG->SUnits.size() <<
" SUs) ===\n";
240 dbgs() <<
"\nHWUI Resource Pressure:\n";
242 if (HWUI.getTotalCycles() == 0)
246 dbgs() <<
" " << Name <<
": " << HWUI.getTotalCycles() <<
" cycles, "
247 << HWUI.size() <<
" instrs\n";
256 if (
A.producesCoexecWindow() !=
B.producesCoexecWindow())
257 return A.producesCoexecWindow();
260 if (
A.getTotalCycles() !=
B.getTotalCycles())
261 return A.getTotalCycles() >
B.getTotalCycles();
264 if (
A.size() !=
B.size())
265 return A.size() <
B.size();
268 return static_cast<unsigned>(
A.getType()) <
269 static_cast<unsigned>(
B.getType());
277 auto HasPrioritySU = [
this, &Cand, &TryCand](
unsigned ResourceIdx) {
294 auto TryEnablesResource = [&Cand, &TryCand,
this](
unsigned ResourceIdx) {
304 TargetSU != Cand.
SU &&
DAG->IsReachable(TargetSU, Cand.
SU);
305 bool TryCandEnables =
306 TargetSU != TryCand.
SU &&
DAG->IsReachable(TargetSU, TryCand.
SU);
308 if (!CandEnables && !TryCandEnables)
311 if (CandEnables && !TryCandEnables) {
318 if (!CandEnables && TryCandEnables) {
327 if (CandHeight > TryCandHeight) {
334 if (CandHeight < TryCandHeight) {
346 for (
unsigned I = 0;
I <
HWUInfo.size();
I++) {
349 if (!HasPrioritySU(
I))
352 bool Enabled = TryEnablesResource(
I);
363 for (
unsigned I = 0;
I <
HWUInfo.size();
I++) {
367 bool TryCandUsesCrit = HWUI.
contains(TryCand.
SU);
369 if (!CandUsesCrit && !TryCandUsesCrit)
372 if (CandUsesCrit != TryCandUsesCrit) {
393 if (Match == Cand.
SU) {
422 "coexec scheduler only supports top-down scheduling");
439 Heurs.updateForScheduling(SU);
445 "coexec scheduler only supports top-down scheduling");
447 if (
DAG->top() ==
DAG->bottom()) {
449 Bot.Available.empty() &&
Bot.Pending.empty() &&
"ReadyQ garbage");
453 bool PickedPending =
false;
459 PickedPending =
false;
465 PickedPending,
false);
479 unsigned CurrentCycle =
Top.getCurrCycle();
480 if (ReadyCycle > CurrentCycle)
481 Top.bumpCycle(ReadyCycle);
484 while (
Top.checkHazard(SU))
485 Top.bumpCycle(
Top.getCurrCycle() + 1);
487 Top.releasePending();
498 assert(IsTopNode &&
"coexec scheduler must only schedule from top boundary");
505 bool &PickedPending,
bool IsBottomUp) {
506 assert(Zone.
isTop() &&
"coexec scheduler only supports top boundary");
507 assert(!IsBottomUp &&
"coexec scheduler only supports top-down scheduling");
511 unsigned SGPRPressure = 0;
512 unsigned VGPRPressure = 0;
513 PickedPending =
false;
514 if (
DAG->isTrackingPressure()) {
516 SGPRPressure =
Pressure[AMDGPU::RegisterPressureSets::SReg_32];
517 VGPRPressure =
Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
524 auto EvaluateQueue = [&](
ReadyQueue &Q,
bool FromPending) {
525 for (
SUnit *SU : Q) {
528 VGPRPressure, IsBottomUp);
535 PickedPending = FromPending;
547 EvaluateQueue(Zone.
Pending,
true);
554 unsigned Cycle = IsTopNode ?
Top.getCurrCycle() :
Bot.getCurrCycle();
556 dbgs() <<
"=== Pick @ Cycle " <<
Cycle <<
" ===\n";
564 dbgs() <<
" Reason: ";
592 if (
DAG->isTrackingPressure() &&
602 bool SameBoundary = Zone !=
nullptr;
609 Heurs.sortHWUIResources();
610 if (
Heurs.tryCriticalResource(TryCand, Cand, Zone)) {
615 if (
Heurs.tryCriticalResourceDependency(TryCand, Cand, Zone)) {
629 bool CandIsClusterSucc =
631 bool TryCandIsClusterSucc =
634 if (
tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
646 if (
DAG->isTrackingPressure() &&
655 !
Rem.IsAcyclicLatencyLimited &&
tryLatency(TryCand, Cand, *Zone))
676 unsigned Structural = 0;
678 unsigned Effective = 0;
682 auto GetStallCosts = [&](
SUnit *SU) {
683 unsigned ReadyCycle = Zone.
isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
685 Costs.Ready = ReadyCycle > CurrCycle ? ReadyCycle - CurrCycle : 0;
688 Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency});
692 StallCosts TryCosts = GetStallCosts(TryCand.
SU);
693 StallCosts CandCosts = GetStallCosts(Cand.
SU);
695 LLVM_DEBUG(
if (TryCosts.Effective || CandCosts.Effective) {
696 dbgs() <<
"Effective stalls: try=" << TryCosts.Effective
697 <<
" (ready=" << TryCosts.Ready <<
", struct=" << TryCosts.Structural
698 <<
", lat=" << TryCosts.Latency <<
") cand=" << CandCosts.Effective
699 <<
" (ready=" << CandCosts.Ready
700 <<
", struct=" << CandCosts.Structural
701 <<
", lat=" << CandCosts.Latency <<
")\n";
704 return tryLess(TryCosts.Effective, CandCosts.Effective, TryCand, Cand,
Stall);
709 LLVM_DEBUG(
dbgs() <<
"AMDGPU coexec preRA scheduler selected for "
710 <<
C->MF->getName() <<
'\n');
712 C, std::make_unique<AMDGPUCoExecSchedStrategy>(
C));
718 <<
C->MF->getName() <<
'\n');
719 return new GCNNoopPostScheduleDAG(
C);
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SUnit * pickOnlyChoice(SchedBoundary &Zone)
Coexecution-focused scheduling strategy for AMDGPU.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Register const TargetRegisterInfo * TRI
bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary &Zone) const
AMDGPU::AMDGPUSchedReason LastAMDGPUReason
void initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned NumRegionInstrs) override
Optionally override the per-region scheduling policy.
CandidateHeuristics Heurs
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &PickedPending, bool IsBottomUp)
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
AMDGPUCoExecSchedStrategy(const MachineSchedContext *C)
void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand)
bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone)
Represent a constant reference to an array (0 or more elements consecutively in memory),...
void updateForScheduling(SUnit *SU)
Update the state to reflect that SU is going to be scheduled.
HardwareUnitInfo * getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor)
Given a Flavor , find the corresponding HardwareUnit.
void sortHWUIResources()
Sort the HWUInfo vector.
bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const
Check for critical resource consumption.
bool tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const
Check for dependencies of instructions that use prioritized HardwareUnits.
SmallVector< HardwareUnitInfo, 8 > HWUInfo
const SIRegisterInfo * SRI
const TargetSchedModel * SchedModel
void collectHWUIPressure()
Walk over the region and collect total usage per HardwareUnit.
void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel, const TargetRegisterInfo *TRI)
unsigned getHWUICyclesForInst(SUnit *SU)
Compute the blocking cycles for the appropriate HardwareUnit given an SU.
GCNDownwardRPTracker DownwardTracker
bool useGCNTrackers() const
GCNSchedStrategy(const MachineSchedContext *C)
SmallVector< GCNSchedStageID, 4 > SchedStages
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
std::vector< unsigned > Pressure
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred)
unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const
Estimate how many cycles SU must wait due to structural hazards at the current boundary cycle.
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp)
MachineSchedPolicy RegionPolicy
const TargetSchedModel * SchedModel
static const char * getReasonStr(GenericSchedulerBase::CandReason Reason)
const TargetRegisterInfo * TRI
SchedCandidate TopCand
Candidate last picked from Top boundary.
HardwareUnitInfo is a wrapper class which maps to some real hardware resource.
void markScheduled(SUnit *SU, unsigned BlockingCycles)
Update the state for SU being scheduled by removing it from the AllSUs and reducing its BlockingCycle...
bool contains(SUnit *SU) const
SUnit * getNextTargetSU(bool LookDeep=false) const
void insert(SUnit *SU, unsigned BlockingCycles)
Insert the SU into AllSUs and account its BlockingCycles into the TotalCycles.
AMDGPU::InstructionFlavor getType() const
SUnit * getHigherPriority(SUnit *SU, SUnit *Other) const
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
MachineInstrBundleIterator< MachineInstr > iterator
Representation of each machine instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
virtual void initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned NumRegionInstrs)
Optionally override the per-region scheduling policy.
Helpers for implementing custom MachineSchedStrategy classes.
Track the current register pressure at some position in the instruction stream, and remember the high...
const std::vector< unsigned > & getRegSetPressureAtPos() const
Get the register set pressure at the current position, which may be less than the pressure across the...
static bool isDS(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isMFMAorWMMA(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
Scheduling unit. This is a node in the scheduling DAG.
unsigned TopReadyCycle
Cycle relative to start when node is ready.
unsigned NodeNum
Entry # of node in the node vector.
unsigned getHeight() const
Returns the height of this node, which is the length of the maximum path down to any node which has n...
unsigned getDepth() const
Returns the depth of this node, which is the length of the maximum path up to any node which has no p...
bool isScheduled
True once scheduled.
unsigned ParentClusterIdx
The parent cluster id.
bool isBottomReady() const
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Each Scheduling boundary is associated with ready queues.
LLVM_ABI unsigned getLatencyStallCycles(SUnit *SU)
Get the difference between the given SUnit's ready time and the current cycle.
LLVM_ABI SUnit * pickOnlyChoice()
Call this before applying any other heuristics to the Available queue.
unsigned getCurrCycle() const
Number of cycles to issue the instructions scheduled in this zone.
A ScheduleDAG for scheduling lists of MachineInstr.
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
Represent a constant reference to a string, i.e.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
const MCWriteProcResEntry * ProcResIter
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr StringRef getFlavorName(InstructionFlavor F)
constexpr StringRef getReasonName(AMDGPUSchedReason R)
InstructionFlavor classifyFlavor(const MachineInstr &MI, const SIInstrInfo &SII)
@ C
The default llvm calling convention, compatible with C.
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI int biasPhysReg(const SUnit *SU, bool isTop, bool BiasPRegsExtra=false)
Minimize physical register live ranges.
LLVM_ABI unsigned getWeakLeft(const SUnit *SU, bool isTop)
LLVM_ABI bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason, const TargetRegisterInfo *TRI, const MachineFunction &MF)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
ScheduleDAGInstrs * createGCNNoopPostMachineScheduler(MachineSchedContext *C)
LLVM_ABI bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone)
ScheduleDAGInstrs * createGCNCoExecMachineScheduler(MachineSchedContext *C)
bool isTheSameCluster(unsigned A, unsigned B)
Return whether the input cluster ID's are the same and valid.
LLVM_ABI bool tryGreater(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
LLVM_ABI bool tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
Return true if this heuristic determines order.
@ Enabled
Convert any .debug_str_offsets tables to DWARF64 if needed.
LLVM_ABI cl::opt< MISched::Direction > PreRADirection
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
void setBest(SchedCandidate &Best)
LLVM_ABI void initResourceDelta(const ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel)
SchedResourceDelta ResDelta
Status of an instruction's critical resource consumption.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
PressureChange CurrentMax