LLVM 23.0.0git
AMDGPUCoExecSchedStrategy.h
Go to the documentation of this file.
1//===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Coexecution-focused scheduling strategy for AMDGPU.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
16
17#include "GCNSchedStrategy.h"
19
20namespace llvm {
21
22namespace AMDGPU {
23
24//===----------------------------------------------------------------------===//
25// Instruction Flavor Classification
26//===----------------------------------------------------------------------===//
27
29 WMMA, // WMMA/MFMA matrix operations
30 SingleCycleVALU, // Single-cycle VALU (not TRANS32, not multi-cycle CVT)
31 TRANS, // Transcendental ops (v_exp, v_log, etc.)
32 MultiCycleVALU, // VALU instructions with repeat rate > 1
33 VMEM, // FLAT/GLOBAL memory operations
34 DS, // LDS/GDS operations
35 SALU, // Scalar ALU
36 DMA, // Tensor DMA operations
37 Fence, // Fences and waits
38 Other, // Everything else
40};
41
43 switch (F) {
45 return "WMMA";
47 return "VALU(1c)";
49 return "TRANS";
51 return "VALU(Nc)";
53 return "VMEM";
55 return "DS";
57 return "SALU";
59 return "DMA";
61 return "Fence";
63 return "Other";
65 llvm_unreachable("Unknown InstructionFlavor");
66 }
67 llvm_unreachable("Unknown InstructionFlavor");
68}
69
71 switch (F) {
73 return "W";
75 return "V";
77 return "T";
79 return "C";
81 return "M";
83 return "D";
85 return "S";
87 return "X";
89 return "F";
91 return "O";
93 llvm_unreachable("Unknown InstructionFlavor");
94 }
95 llvm_unreachable("Unknown InstructionFlavor");
96}
97
98InstructionFlavor classifyFlavor(const MachineInstr &MI,
99 const SIInstrInfo &SII);
100
102
103namespace FlavorGroups {
113inline FlavorGroup all() {
115 for (unsigned I = 0;
117 G.push_back(static_cast<InstructionFlavor>(I));
118 return G;
119}
120} // namespace FlavorGroups
121
122/// AMDGPU-specific scheduling decision reasons. These provide more granularity
123/// than the generic CandReason enum for debugging purposes.
126 CritResourceBalance, // tryCriticalResource chose based on resource pressure
127 CritResourceDep, // tryCriticalResourceDependency chose based on enabling
129};
130
132 switch (R) {
134 return "None";
136 return "CritResource";
138 return "CritResourceDep";
140 llvm_unreachable("Unknown AMDGPUSchedReason");
141 }
142 llvm_unreachable("Unknown AMDGPUSchedReason");
143}
144
145} // End namespace AMDGPU
146
147//===----------------------------------------------------------------------===//
148// Hardware Unit Information
149//===----------------------------------------------------------------------===//
150
151/// HardwareUnitInfo is a wrapper class which maps to some real hardware
152/// resource. This is used to model hardware resource pressure per region, and
153/// guide scheduling heuristics.
155private:
156 /// PrioritySUs maintains a list of the SUs we want to prioritize scheduling
157 /// for this HardwareUnit. This is used for agreement between
158 /// tryCriticalResourceDependency and tryCriticalResource: we schedule the
159 /// dependencies for an SU on the critical resource, then schedule that same
160 /// SU on the critical resource. This agreement results in shorter live
161 /// ranges and more regular HardwareUnit access patterns. SUs are prioritized
162 /// based on depth for top-down scheduling.
163 SmallSetVector<SUnit *, 16> PrioritySUs;
164 /// All the SUs in the region that consume this resource.
166 /// The total number of busy cycles for this HardwareUnit for a given region.
167 unsigned TotalCycles = 0;
168 /// InstructionFlavor mapping.
170 /// Whether or not instructions on this HardwareUnit may produce a window in
171 /// which instructions in other HardwareUnits can coexecute. For example, WMMA
172 /// / MFMA instructions may take multiple cycles, which may be overlapped with
173 /// instructions on other HardwareUnits.
174 bool ProducesCoexecWindow = false;
175
176public:
178
179 unsigned size() { return AllSUs.size(); }
180
181 unsigned getTotalCycles() { return TotalCycles; }
182
183 void setType(unsigned TheType) {
185 Type = (AMDGPU::InstructionFlavor)(TheType);
186 }
187
188 AMDGPU::InstructionFlavor getType() const { return Type; }
189
190 bool producesCoexecWindow() const { return ProducesCoexecWindow; }
191
192 void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; }
193
194 bool contains(SUnit *SU) const { return AllSUs.contains(SU); }
195
196 /// \returns the SUnit with higher priority, or nullptr if neither is
197 /// prioritized. This method walks the PrioritySUs in order and returns
198 /// whichever of the two SUs it encounters first. If neither is in the
199 /// PrioritySUs list, then neither has priority over the other.
201 for (SUnit *SUOrder : PrioritySUs) {
202 if (SUOrder == SU)
203 return SU;
204
205 if (SUOrder == Other)
206 return Other;
207 }
208 return nullptr;
209 }
210
211 void reset() {
212 AllSUs.clear();
213 PrioritySUs.clear();
214 TotalCycles = 0;
216 ProducesCoexecWindow = false;
217 }
218
219 /// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
220 /// set, we will look beyond the PrioritySUs (if all the PrioritySUs are
221 /// ready) to AllSUs to attempt to find a target SU. When looking through
222 /// AllSUs we pick the target SU by minimal depth for top-down
223 /// scheduling. getNextTargetSU is useful for determining which SU on this
224 /// HardwareUnit we are trying to schedule - this info helps us determine
225 /// which dependencies to schedule. LookDeep is useful if the dependencies are
226 /// long latency (e.g. memory instructions). If we have many long latency
227 /// dependencies, it is beneficial to enable SUs multiple levels ahead.
228 SUnit *getNextTargetSU(bool LookDeep = false) const;
229 /// Insert the \p SU into AllSUs and add its \p BlockingCycles to
230 /// the TotalCycles. This maintains the list of PrioritySUs.
231 void insert(SUnit *SU, unsigned BlockingCycles);
232 /// Update the state for \p SU being scheduled by removing it from AllSUs
233 /// and subtracting its \p BlockingCycles from the TotalCycles. This maintains
234 /// the list of PrioritySUs.
235 void markScheduled(SUnit *SU, unsigned BlockingCycles);
236};
237
238//===----------------------------------------------------------------------===//
239// Candidate Heuristics
240//===----------------------------------------------------------------------===//
241
242/// CandidateHeuristics contains state and implementations to facilitate making
243/// per instruction scheduling decisions; it contains methods used in
244/// tryCandidate to decide which instruction to schedule next.
246protected:
252
253 /// Walk over the region and collect total usage per HardwareUnit.
254 void collectHWUIPressure();
255
256 /// Compute the blocking cycles for the appropriate HardwareUnit given an \p
257 /// SU.
258 unsigned getHWUICyclesForInst(SUnit *SU);
259
260 /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
261 /// mapped HardwareUnit.
263
264public:
266
268 const TargetRegisterInfo *TRI);
269
270 /// Update the state to reflect that \p SU is going to be scheduled.
271 void updateForScheduling(SUnit *SU);
272
273 /// Sort the HWUInfo vector. After sorting, the HardwareUnits that are highest
274 /// priority are first. Priority is determined by maximizing coexecution and
275 /// keeping the critical HardwareUnit busy.
276 void sortHWUIResources();
277
278 /// Check for critical resource consumption. Prefer the candidate that uses
279 /// the most prioritized HardwareUnit. If both candidates use the same
280 /// HardwareUnit, prefer the candidate with higher priority on that
281 /// HardwareUnit.
284 SchedBoundary *Zone) const;
285
286 /// Check for dependencies of instructions that use prioritized HardwareUnits.
287 /// Prefer the candidate that is a dependency of an instruction that uses the
288 /// most prioritized HardwareUnit. If both candidates enable the same
289 /// HardwareUnit, prefer the candidate that enables the higher priority
290 /// instruction on that HardwareUnit.
291 bool
294 SchedBoundary *Zone) const;
295
296 void dumpRegionSummary();
297};
298
300protected:
302 SchedBoundary &Zone) const;
305
306#ifndef NDEBUG
307 void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand);
308#endif
309
311 SchedBoundary *Zone);
312 void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
313 const RegPressureTracker &RPTracker,
314 SchedCandidate &Cand, bool &PickedPending,
315 bool IsBottomUp);
316
317public:
319
322 unsigned NumRegionInstrs) override;
323 void initialize(ScheduleDAGMI *DAG) override;
324 SUnit *pickNode(bool &IsTopNode) override;
325 void schedNode(SUnit *SU, bool IsTopNode) override;
326};
327
330
331} // End namespace llvm
332
333#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Register const TargetRegisterInfo * TRI
bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary &Zone) const
void initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned NumRegionInstrs) override
Optionally override the per-region scheduling policy.
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &PickedPending, bool IsBottomUp)
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
AMDGPUCoExecSchedStrategy(const MachineSchedContext *C)
void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand)
bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone)
CandidateHeuristics contains state and implementations to facilitate making per instruction schedulin...
void updateForScheduling(SUnit *SU)
Update the state to reflect that SU is going to be scheduled.
HardwareUnitInfo * getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor)
Given a Flavor , find the corresponding HardwareUnit.
void sortHWUIResources()
Sort the HWUInfo vector.
bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const
Check for critical resource consumption.
bool tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const
Check for dependencies of instructions that use prioritized HardwareUnits.
SmallVector< HardwareUnitInfo, 8 > HWUInfo
const TargetSchedModel * SchedModel
void collectHWUIPressure()
Walk over the region and collect total usage per HardwareUnit.
void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel, const TargetRegisterInfo *TRI)
unsigned getHWUICyclesForInst(SUnit *SU)
Compute the blocking cycles for the appropriate HardwareUnit given an SU.
GCNSchedStrategy(const MachineSchedContext *C)
ScheduleDAGMILive * DAG
HardwareUnitInfo is a wrapper class which maps to some real hardware resource.
void markScheduled(SUnit *SU, unsigned BlockingCycles)
Update the state for SU being scheduled by removing it from the AllSUs and reducing its BlockingCycle...
SUnit * getNextTargetSU(bool LookDeep=false) const
void insert(SUnit *SU, unsigned BlockingCycles)
Insert the SU into AllSUs and account its BlockingCycles into the TotalCycles.
AMDGPU::InstructionFlavor getType() const
SUnit * getHigherPriority(SUnit *SU, SUnit *Other) const
MachineInstrBundleIterator< MachineInstr > iterator
Representation of each machine instruction.
Track the current register pressure at some position in the instruction stream, and remember the high...
Scheduling unit. This is a node in the scheduling DAG.
Each Scheduling boundary is associated with ready queues.
A ScheduleDAG for scheduling lists of MachineInstr.
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
FlavorGroup individual(InstructionFlavor F)
AMDGPUSchedReason
AMDGPU-specific scheduling decision reasons.
constexpr StringRef getFlavorName(InstructionFlavor F)
constexpr StringRef getReasonName(AMDGPUSchedReason R)
InstructionFlavor classifyFlavor(const MachineInstr &MI, const SIInstrInfo &SII)
SmallVector< InstructionFlavor, 4 > FlavorGroup
constexpr StringRef getFlavorShortName(InstructionFlavor F)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
This is an optimization pass for GlobalISel generic memory operations.
ScheduleDAGInstrs * createGCNNoopPostMachineScheduler(MachineSchedContext *C)
@ Other
Any other memory.
Definition ModRef.h:68
ScheduleDAGInstrs * createGCNCoExecMachineScheduler(MachineSchedContext *C)
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...