LLVM 23.0.0git
AMDGPUResourceUsageAnalysis.cpp
Go to the documentation of this file.
1//===- AMDGPUResourceUsageAnalysis.h ---- analysis of resources -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// \brief Analyzes how many registers and other resources are used by
11/// functions.
12///
13/// The results of this analysis are used to fill the register usage, flat
14/// usage, etc. into hardware registers.
15///
16//===----------------------------------------------------------------------===//
17
19#include "AMDGPU.h"
20#include "AMDGPUTargetMachine.h"
21#include "GCNSubtarget.h"
26#include "llvm/IR/GlobalValue.h"
28
29using namespace llvm;
30using namespace llvm::AMDGPU;
31
32#define DEBUG_TYPE "amdgpu-resource-usage"
33
37
// NOTE(review): this doxygen listing collapses the opening lines of both
// cl::opt declarations and of INITIALIZE_PASS; the cross-references at the
// bottom of the page show the full declarations
// (clAssumedStackSizeForExternalCall and
// clAssumedStackSizeForDynamicSizeObjects, both static cl::opt<uint32_t>).
38// In code object v4 and older, we need to tell the runtime some amount ahead of
39// time if we don't know the true stack size. Assume a smaller number if this is
40// only due to dynamic / non-entry block allocas.
// Assumed stack bytes charged for any call whose callee's usage is unknown.
42 "amdgpu-assume-external-call-stack-size",
43 cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
44 cl::init(16384));
45
// Assumed extra stack bytes when the frame contains variable-sized objects.
47 "amdgpu-assume-dynamic-stack-object-size",
48 cl::desc("Assumed extra stack use if there are any "
49 "variable sized objects (in bytes)"),
50 cl::Hidden, cl::init(4096));
51
// Legacy-PM pass registration (pass name/arg line is collapsed above).
53 "Function register usage analysis", true, true)
54
55static const Function *getCalleeFunction(const MachineOperand &Op) {
56 if (Op.isImm()) {
57 assert(Op.getImm() == 0);
58 return nullptr;
59 }
60 return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
61}
62
// hasAnyNonFlatUseOfReg: the first line of the signature (taking the
// MachineRegisterInfo) is collapsed by this listing; see the cross-reference
// at the bottom of the page for the full declaration.
//
// Returns true if \p Reg has any operand use that is NOT an implicit operand
// of a FLAT instruction — i.e. the register is genuinely needed and cannot be
// discounted as a FLAT-only implicit use.
64 const SIInstrInfo &TII, unsigned Reg) {
65 for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
// An explicit use, or any use on a non-FLAT instruction, is a real use.
66 if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
67 return true;
68 }
69
// Every use found was an implicit operand of a FLAT instruction.
70 return false;
71}
72
// Legacy-PM entry point (the function-name line and the line fetching the
// TargetPassConfig into TPC are collapsed by this listing). Computes the
// per-function resource info and stores it; returns false because the IR/MIR
// is not modified.
74 MachineFunction &MF) {
// Without a TargetPassConfig there is no TargetMachine to query; bail out.
76 if (!TPC)
77 return false;
78
79 const TargetMachine &TM = TPC->getTM<TargetMachine>();
80 const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
81
82 // By default, for code object v5 and later, track only the minimum scratch
83 // size
// NOTE(review): the initializer line for AssumedStackSizeForDynamicSizeObjects
// and the condition guarding this block (code-object version >= v5, per the
// comment above) are collapsed by this listing.
84 uint32_t AssumedStackSizeForDynamicSizeObjects =
86 uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
89 STI.getTargetTriple().getOS() == Triple::AMDPAL) {
// Only zero the assumptions when the user did not set the flags explicitly.
90 if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
91 AssumedStackSizeForDynamicSizeObjects = 0;
92 if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
93 AssumedStackSizeForExternalCall = 0;
94 }
95
// Delegate to the shared implementation (the call line is collapsed here).
97 MF, AssumedStackSizeForDynamicSizeObjects,
98 AssumedStackSizeForExternalCall);
99
100 return false;
101}
102
// Unique identity token for the new-PM analysis.
103AnalysisKey AMDGPUResourceUsageAnalysis::Key;
// New-PM analysis entry point (the `run` signature lines are collapsed by
// this listing; see the cross-reference: Result run(MachineFunction &MF,
// MachineFunctionAnalysisManager &MFAM)). Mirrors the legacy wrapper above.
107 const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
108
109 // By default, for code object v5 and later, track only the minimum scratch
110 // size
// NOTE(review): the initializer line and the guarding condition (code-object
// version / AMDPAL check, as in the legacy path) are collapsed here.
111 uint32_t AssumedStackSizeForDynamicSizeObjects =
113 uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
// Only zero the assumptions when the user did not set the flags explicitly.
117 if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
118 AssumedStackSizeForDynamicSizeObjects = 0;
119 if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
120 AssumedStackSizeForExternalCall = 0;
121 }
122
// Return the computed resource info (call line collapsed by the listing).
124 MF, AssumedStackSizeForDynamicSizeObjects,
125 AssumedStackSizeForExternalCall);
126}
127
// analyzeResourceUsage — compute this function's SIFunctionResourceInfo:
// explicit SGPR/VGPR/AGPR counts, private and callee stack segment sizes, and
// flags (UsesVCC, UsesFlatScratch, HasDynamicallySizedStack, HasRecursion,
// HasIndirectCall) plus the list of direct callees.
// NOTE(review): this doxygen listing collapses a few lines (128-129, 132,
// 134, 144, 283) — including the return-type/name lines, the Info/MFI
// declarations, and the condition before the `continue` at 284; comments
// below describe only the visible code.
130 const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
131 uint32_t AssumedStackSizeForExternalCall) const {
133
135 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
136 const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
137 const MachineRegisterInfo &MRI = MF.getRegInfo();
138 const SIInstrInfo *TII = ST.getInstrInfo();
139 const SIRegisterInfo &TRI = TII->getRegisterInfo();
140
// Flat scratch counts as used if either half of FLAT_SCR is touched, or the
// preloaded register (argument collapsed at line 144) is live in.
141 Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
142 MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
143 MRI.isLiveIn(MFI->getPreloadedReg(
145
146 Info.NumNamedBarrier = MFI->getNumNamedBarriers();
147
148 // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
149 // instructions aren't used to access the scratch buffer. Inline assembly may
150 // need it though.
151 //
152 // If we only have implicit uses of flat_scr on flat instructions, it is not
153 // really needed.
154 if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
155 (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
156 !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
157 !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
158 Info.UsesFlatScratch = false;
159 }
160
161 Info.PrivateSegmentSize = FrameInfo.getStackSize();
162
163 // Assume a big number if there are any unknown sized objects.
164 Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
165 if (Info.HasDynamicallySizedStack)
166 Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
167
// Stack realignment may consume up to MaxAlign extra bytes of scratch.
168 if (MFI->isStackRealigned())
169 Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
170
171 Info.UsesVCC =
172 MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
173 Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass,
174 /*IncludeCalls=*/false);
// AGPRs are only counted on subtargets that have MAI instructions.
175 if (ST.hasMAIInsts())
176 Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass,
177 /*IncludeCalls=*/false);
178
179 // If there are no calls, MachineRegisterInfo can tell us the used register
180 // count easily.
181 // A tail call isn't considered a call for MachineFrameInfo's purposes.
182 if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
183 Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
184 /*IncludeCalls=*/false);
185 return Info;
186 }
187
// With calls present, scan every instruction operand to find the highest
// VGPR index actually touched, and accumulate callee stack assumptions.
188 int32_t MaxVGPR = -1;
189 Info.CalleeSegmentSize = 0;
190
191 for (const MachineBasicBlock &MBB : MF) {
192 for (const MachineInstr &MI : MBB) {
193 for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
194 const MachineOperand &MO = MI.getOperand(I);
195
196 if (!MO.isReg())
197 continue;
198
199 Register Reg = MO.getReg();
200 switch (Reg) {
// NoRegister is only legal on debug instructions; skip it.
201 case AMDGPU::NoRegister:
202 assert(MI.isDebugInstr() &&
203 "Instruction uses invalid noreg register");
204 continue;
205
// The registers below must never survive to this point in generated code;
// hitting one indicates a selection/legalization bug upstream.
206 case AMDGPU::XNACK_MASK:
207 case AMDGPU::XNACK_MASK_LO:
208 case AMDGPU::XNACK_MASK_HI:
209 llvm_unreachable("xnack_mask registers should not be used");
210
211 case AMDGPU::LDS_DIRECT:
212 llvm_unreachable("lds_direct register should not be used");
213
214 case AMDGPU::TBA:
215 case AMDGPU::TBA_LO:
216 case AMDGPU::TBA_HI:
217 case AMDGPU::TMA:
218 case AMDGPU::TMA_LO:
219 case AMDGPU::TMA_HI:
220 llvm_unreachable("trap handler registers should not be used");
221
222 case AMDGPU::SRC_VCCZ:
223 llvm_unreachable("src_vccz register should not be used");
224
225 case AMDGPU::SRC_EXECZ:
226 llvm_unreachable("src_execz register should not be used");
227
228 case AMDGPU::SRC_SCC:
229 llvm_unreachable("src_scc register should not be used");
230
231 default:
232 break;
233 }
234
235 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);
236 assert((!RC || TRI.isVGPRClass(RC) || TRI.isSGPRClass(RC) ||
237 TRI.isAGPRClass(RC) || AMDGPU::TTMP_32RegClass.contains(Reg) ||
238 AMDGPU::TTMP_64RegClass.contains(Reg) ||
239 AMDGPU::TTMP_128RegClass.contains(Reg) ||
240 AMDGPU::TTMP_256RegClass.contains(Reg) ||
241 AMDGPU::TTMP_512RegClass.contains(Reg)) &&
242 "Unknown register class");
243
// Only VGPR operands contribute to the max-VGPR scan below.
244 if (!RC || !TRI.isVGPRClass(RC))
245 continue;
246
247 if (MI.isCall() || MI.isMetaInstruction())
248 continue;
249
// A multi-register operand of Width 32-bit lanes occupies HW registers
// [HWReg, HWReg + Width - 1]; track the highest index seen.
250 unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32);
251 unsigned HWReg = TRI.getHWRegIndex(Reg);
252 int MaxUsed = HWReg + Width - 1;
253 MaxVGPR = std::max(MaxUsed, MaxVGPR);
254 }
255
256 if (MI.isCall()) {
257 // Pseudo used just to encode the underlying global. Is there a better
258 // way to track this?
259
260 // TODO: Some of the generic call-like pseudos do not encode the callee,
261 // so we overly conservatively treat this as an indirect call.
262 const MachineOperand *CalleeOp =
263 TII->getNamedOperand(MI, AMDGPU::OpName::callee);
264
265 const Function *Callee =
266 CalleeOp ? getCalleeFunction(*CalleeOp) : nullptr;
267
268 auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
269 return F == &MF.getFunction();
270 };
271
// Record direct callees other than self-recursion.
272 if (Callee && !isSameFunction(MF, Callee))
273 Info.Callees.push_back(Callee);
274
// A missing callee or a call to an external declaration is treated as
// indirect for resource-propagation purposes.
275 bool IsIndirect = !Callee || Callee->isDeclaration();
276 Info.HasIndirectCall |= IsIndirect;
277
278 // In object linking mode the linker has the full cross-TU view. It
279 // propagates resource usage across both direct calls to external
280 // declarations and true indirect calls. Skip the compile-time
281 // conservative assumptions so that the locally emitted metadata
282 // describes this function's own usage only.
// NOTE(review): the condition guarding this `continue` (line 283) is
// collapsed by this listing.
284 continue;
285
286 // FIXME: Call site could have norecurse on it
287 if (!Callee || !Callee->doesNotRecurse()) {
288 Info.HasRecursion = true;
289
290 // TODO: If we happen to know there is no stack usage in the
291 // callgraph, we don't need to assume an infinitely growing stack.
292 if (!MI.isReturn()) {
293 // We don't need to assume an unknown stack size for tail calls.
294
295 // FIXME: This only benefits in the case where the kernel does not
296 // directly call the tail called function. If a kernel directly
297 // calls a tail recursive function, we'll assume maximum stack size
298 // based on the regular call instruction.
299 Info.CalleeSegmentSize = std::max(
300 Info.CalleeSegmentSize,
301 static_cast<uint64_t>(AssumedStackSizeForExternalCall));
302 }
303 }
304
305 if (IsIndirect) {
// Assume the external-call stack size and conservatively set the flags
// an unknown callee could require.
306 Info.CalleeSegmentSize =
307 std::max(Info.CalleeSegmentSize,
308 static_cast<uint64_t>(AssumedStackSizeForExternalCall));
309
310 // Register usage of indirect calls gets handled later
311 Info.UsesVCC = true;
312 Info.UsesFlatScratch = ST.hasFlatAddressSpace();
313 Info.HasDynamicallySizedStack = true;
314 }
315 }
316 }
317 }
318
// Convert the highest VGPR index seen into a count (-1 => 0 VGPRs).
319 Info.NumVGPR = MaxVGPR + 1;
320
321 return Info;
322}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
static cl::opt< uint32_t > clAssumedStackSizeForDynamicSizeObjects("amdgpu-assume-dynamic-stack-object-size", cl::desc("Assumed extra stack use if there are any " "variable sized objects (in bytes)"), cl::Hidden, cl::init(4096))
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, const SIInstrInfo &TII, unsigned Reg)
static cl::opt< uint32_t > clAssumedStackSizeForExternalCall("amdgpu-assume-external-call-stack-size", cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden, cl::init(16384))
Analyzes how many registers and other resources are used by functions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
Target-Independent Code Generator Pass Configuration Options pass.
Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
AMDGPUResourceUsageAnalysisImpl::SIFunctionResourceInfo Result
Module * getParent()
Get the module that this global value is contained inside of...
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
Representation of each machine instruction.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
iterator_range< reg_iterator > reg_operands(Register Reg) const
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
AnalysisType * getAnalysisIfAvailable() const
getAnalysisIfAvailable<AnalysisType>() - Subclasses use this function to get analysis information that might be around, for example to update it.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
MCRegister getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const
Primary interface to the complete machine description for the target machine.
const MCSubtargetInfo * getMCSubtargetInfo() const
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:438
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getAMDHSACodeObjectVersion(const Module &M)
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
char & AMDGPUResourceUsageAnalysisID
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects, uint32_t AssumedStackSizeForExternalCall) const
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition Analysis.h:29