41#define DEBUG_TYPE "amdgpu-resource-usage"
50 "amdgpu-assume-external-call-stack-size",
55 "amdgpu-assume-dynamic-stack-object-size",
56 cl::desc(
"Assumed extra stack use if there are any "
57 "variable sized objects (in bytes)"),
61 "Function register usage analysis",
true,
true)
68 if (
auto *GA = dyn_cast<GlobalAlias>(
Op.getGlobal()))
69 return cast<Function>(GA->getOperand(0));
70 return cast<Function>(
Op.getGlobal());
76 if (!UseOp.isImplicit() || !
TII.isFLAT(*UseOp.getParent()))
87 ST.getTargetID().isXnackOnOrAny());
91 const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
const {
101 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
108 bool HasIndirectCall =
false;
115 uint32_t AssumedStackSizeForDynamicSizeObjects =
121 AssumedStackSizeForDynamicSizeObjects = 0;
123 AssumedStackSizeForExternalCall = 0;
128 if (!
F ||
F->isDeclaration())
132 assert(MF &&
"function must have been generated already");
137 assert(CI.second &&
"should only be called once per function");
138 Info = analyzeResourceUsage(*MF,
TM, AssumedStackSizeForDynamicSizeObjects,
139 AssumedStackSizeForExternalCall);
140 HasIndirectCall |=
Info.HasIndirectCall;
146 for (
const auto &
IT : CG) {
148 if (!
F ||
F->isDeclaration())
158 assert(MF &&
"function must have been generated already");
159 Info = analyzeResourceUsage(*MF,
TM, AssumedStackSizeForDynamicSizeObjects,
160 AssumedStackSizeForExternalCall);
161 HasIndirectCall |=
Info.HasIndirectCall;
165 propagateIndirectCallRegisterUsage();
171AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
173 uint32_t AssumedStackSizeForDynamicSizeObjects,
174 uint32_t AssumedStackSizeForExternalCall)
const {
175 SIFunctionResourceInfo
Info;
184 Info.UsesFlatScratch =
MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
185 MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
199 Info.UsesFlatScratch =
false;
202 Info.PrivateSegmentSize = FrameInfo.getStackSize();
205 Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
206 if (
Info.HasDynamicallySizedStack)
207 Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
210 Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
213 MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
MRI.isPhysRegUsed(AMDGPU::VCC_HI);
218 if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
219 MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
221 if (
MRI.isPhysRegUsed(Reg)) {
222 HighestVGPRReg = Reg;
227 if (
ST.hasMAIInsts()) {
228 MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
230 if (
MRI.isPhysRegUsed(Reg)) {
231 HighestAGPRReg =
Reg;
235 Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
237 :
TRI.getHWRegIndex(HighestAGPRReg) + 1;
240 MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
242 if (
MRI.isPhysRegUsed(Reg)) {
243 HighestSGPRReg =
Reg;
250 Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
252 :
TRI.getHWRegIndex(HighestVGPRReg) + 1;
253 Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
255 :
TRI.getHWRegIndex(HighestSGPRReg) + 1;
260 int32_t MaxVGPR = -1;
261 int32_t MaxAGPR = -1;
262 int32_t MaxSGPR = -1;
279 case AMDGPU::EXEC_LO:
280 case AMDGPU::EXEC_HI:
283 case AMDGPU::M0_LO16:
284 case AMDGPU::M0_HI16:
285 case AMDGPU::SRC_SHARED_BASE_LO:
286 case AMDGPU::SRC_SHARED_BASE:
287 case AMDGPU::SRC_SHARED_LIMIT_LO:
288 case AMDGPU::SRC_SHARED_LIMIT:
289 case AMDGPU::SRC_PRIVATE_BASE_LO:
290 case AMDGPU::SRC_PRIVATE_BASE:
291 case AMDGPU::SRC_PRIVATE_LIMIT_LO:
292 case AMDGPU::SRC_PRIVATE_LIMIT:
293 case AMDGPU::SGPR_NULL:
294 case AMDGPU::SGPR_NULL64:
298 case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
301 case AMDGPU::NoRegister:
303 "Instruction uses invalid noreg register");
309 case AMDGPU::VCC_LO_LO16:
310 case AMDGPU::VCC_LO_HI16:
311 case AMDGPU::VCC_HI_LO16:
312 case AMDGPU::VCC_HI_HI16:
316 case AMDGPU::FLAT_SCR:
317 case AMDGPU::FLAT_SCR_LO:
318 case AMDGPU::FLAT_SCR_HI:
321 case AMDGPU::XNACK_MASK:
322 case AMDGPU::XNACK_MASK_LO:
323 case AMDGPU::XNACK_MASK_HI:
326 case AMDGPU::LDS_DIRECT:
337 case AMDGPU::SRC_VCCZ:
340 case AMDGPU::SRC_EXECZ:
343 case AMDGPU::SRC_SCC:
350 if (AMDGPU::SGPR_32RegClass.
contains(Reg) ||
351 AMDGPU::SGPR_LO16RegClass.
contains(Reg) ||
352 AMDGPU::SGPR_HI16RegClass.
contains(Reg)) {
355 }
else if (AMDGPU::VGPR_32RegClass.
contains(Reg) ||
356 AMDGPU::VGPR_16RegClass.
contains(Reg)) {
359 }
else if (AMDGPU::AGPR_32RegClass.
contains(Reg) ||
360 AMDGPU::AGPR_LO16RegClass.
contains(Reg)) {
364 }
else if (AMDGPU::SGPR_64RegClass.
contains(Reg)) {
367 }
else if (AMDGPU::VReg_64RegClass.
contains(Reg)) {
370 }
else if (AMDGPU::AReg_64RegClass.
contains(Reg)) {
374 }
else if (AMDGPU::VReg_96RegClass.
contains(Reg)) {
377 }
else if (AMDGPU::SReg_96RegClass.
contains(Reg)) {
380 }
else if (AMDGPU::AReg_96RegClass.
contains(Reg)) {
384 }
else if (AMDGPU::SGPR_128RegClass.
contains(Reg)) {
387 }
else if (AMDGPU::VReg_128RegClass.
contains(Reg)) {
390 }
else if (AMDGPU::AReg_128RegClass.
contains(Reg)) {
394 }
else if (AMDGPU::VReg_160RegClass.
contains(Reg)) {
397 }
else if (AMDGPU::SReg_160RegClass.
contains(Reg)) {
400 }
else if (AMDGPU::AReg_160RegClass.
contains(Reg)) {
404 }
else if (AMDGPU::VReg_192RegClass.
contains(Reg)) {
407 }
else if (AMDGPU::SReg_192RegClass.
contains(Reg)) {
410 }
else if (AMDGPU::AReg_192RegClass.
contains(Reg)) {
414 }
else if (AMDGPU::VReg_224RegClass.
contains(Reg)) {
417 }
else if (AMDGPU::SReg_224RegClass.
contains(Reg)) {
420 }
else if (AMDGPU::AReg_224RegClass.
contains(Reg)) {
424 }
else if (AMDGPU::SReg_256RegClass.
contains(Reg)) {
427 }
else if (AMDGPU::VReg_256RegClass.
contains(Reg)) {
430 }
else if (AMDGPU::AReg_256RegClass.
contains(Reg)) {
434 }
else if (AMDGPU::VReg_288RegClass.
contains(Reg)) {
437 }
else if (AMDGPU::SReg_288RegClass.
contains(Reg)) {
440 }
else if (AMDGPU::AReg_288RegClass.
contains(Reg)) {
444 }
else if (AMDGPU::VReg_320RegClass.
contains(Reg)) {
447 }
else if (AMDGPU::SReg_320RegClass.
contains(Reg)) {
450 }
else if (AMDGPU::AReg_320RegClass.
contains(Reg)) {
454 }
else if (AMDGPU::VReg_352RegClass.
contains(Reg)) {
457 }
else if (AMDGPU::SReg_352RegClass.
contains(Reg)) {
460 }
else if (AMDGPU::AReg_352RegClass.
contains(Reg)) {
464 }
else if (AMDGPU::VReg_384RegClass.
contains(Reg)) {
467 }
else if (AMDGPU::SReg_384RegClass.
contains(Reg)) {
470 }
else if (AMDGPU::AReg_384RegClass.
contains(Reg)) {
474 }
else if (AMDGPU::SReg_512RegClass.
contains(Reg)) {
477 }
else if (AMDGPU::VReg_512RegClass.
contains(Reg)) {
480 }
else if (AMDGPU::AReg_512RegClass.
contains(Reg)) {
484 }
else if (AMDGPU::SReg_1024RegClass.
contains(Reg)) {
487 }
else if (AMDGPU::VReg_1024RegClass.
contains(Reg)) {
490 }
else if (AMDGPU::AReg_1024RegClass.
contains(Reg)) {
498 AMDGPU::TTMP_64RegClass.
contains(Reg) ||
499 AMDGPU::TTMP_128RegClass.
contains(Reg) ||
500 AMDGPU::TTMP_256RegClass.
contains(Reg) ||
501 AMDGPU::TTMP_512RegClass.
contains(Reg) ||
502 !
TRI.getPhysRegBaseClass(Reg)) &&
503 "Unknown register class");
505 unsigned HWReg =
TRI.getHWRegIndex(Reg);
506 int MaxUsed = HWReg + Width - 1;
508 MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
510 MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
512 MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
521 TII->getNamedOperand(
MI, AMDGPU::OpName::callee);
525 CallGraphResourceInfo.end();
536 I = CallGraphResourceInfo.find(Callee);
539 if (!Callee || !
Callee->doesNotRecurse()) {
540 Info.HasRecursion =
true;
544 if (!
MI.isReturn()) {
551 CalleeFrameSize = std::max(
553 static_cast<uint64_t>(AssumedStackSizeForExternalCall));
557 if (IsIndirect ||
I == CallGraphResourceInfo.end()) {
559 std::max(CalleeFrameSize,
560 static_cast<uint64_t>(AssumedStackSizeForExternalCall));
564 Info.UsesFlatScratch =
ST.hasFlatAddressSpace();
565 Info.HasDynamicallySizedStack =
true;
566 Info.HasIndirectCall =
true;
570 MaxSGPR = std::max(
I->second.NumExplicitSGPR - 1, MaxSGPR);
571 MaxVGPR = std::max(
I->second.NumVGPR - 1, MaxVGPR);
572 MaxAGPR = std::max(
I->second.NumAGPR - 1, MaxAGPR);
574 std::max(
I->second.PrivateSegmentSize, CalleeFrameSize);
575 Info.UsesVCC |=
I->second.UsesVCC;
576 Info.UsesFlatScratch |=
I->second.UsesFlatScratch;
577 Info.HasDynamicallySizedStack |=
I->second.HasDynamicallySizedStack;
578 Info.HasRecursion |=
I->second.HasRecursion;
579 Info.HasIndirectCall |=
I->second.HasIndirectCall;
585 Info.NumExplicitSGPR = MaxSGPR + 1;
586 Info.NumVGPR = MaxVGPR + 1;
587 Info.NumAGPR = MaxAGPR + 1;
588 Info.PrivateSegmentSize += CalleeFrameSize;
593void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
596 int32_t NonKernelMaxSGPRs = 0;
597 int32_t NonKernelMaxVGPRs = 0;
598 int32_t NonKernelMaxAGPRs = 0;
600 for (
const auto &
I : CallGraphResourceInfo) {
602 auto &
Info =
I.getSecond();
603 NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs,
Info.NumExplicitSGPR);
604 NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs,
Info.NumVGPR);
605 NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs,
Info.NumAGPR);
612 for (
auto &
I : CallGraphResourceInfo) {
613 auto &
Info =
I.getSecond();
614 if (
Info.HasIndirectCall) {
615 Info.NumExplicitSGPR = std::max(
Info.NumExplicitSGPR, NonKernelMaxSGPRs);
616 Info.NumVGPR = std::max(
Info.NumVGPR, NonKernelMaxVGPRs);
617 Info.NumAGPR = std::max(
Info.NumAGPR, NonKernelMaxAGPRs);
unsigned const MachineRegisterInfo * MRI
static cl::opt< uint32_t > clAssumedStackSizeForDynamicSizeObjects("amdgpu-assume-dynamic-stack-object-size", cl::desc("Assumed extra stack use if there are any " "variable sized objects (in bytes)"), cl::Hidden, cl::init(4096))
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, const SIInstrInfo &TII, unsigned Reg)
static cl::opt< uint32_t > clAssumedStackSizeForExternalCall("amdgpu-assume-external-call-stack-size", cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden, cl::init(16384))
Analyzes how many registers and other resources are used by functions.
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Analysis containing CSE Info
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
unsigned const TargetRegisterInfo * TRI
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Target-Independent Code Generator Pass Configuration Options pass.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
The basic data container for the call graph of a Module of IR.
This class represents an Operation in the Expression.
bool hasFlatScratchInit() const
Generic base class for all target subtargets.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
This class contains meta information specific to a module.
MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated to IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
A Module instance is used to store all the information related to an LLVM module.
Wrapper class representing virtual and physical registers.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
bool isStackRealigned() const
MCRegister getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const
Primary interface to the complete machine description for the target machine.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
bool isEntryFunctionCC(CallingConv::ID CC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
char & AMDGPUResourceUsageAnalysisID
po_iterator< T > po_begin(const T &G)
auto reverse(ContainerTy &&C)
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
po_iterator< T > po_end(const T &G)
int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const
int32_t getTotalNumVGPRs(const GCNSubtarget &ST, int32_t NumAGPR, int32_t NumVGPR) const
bool runOnModule(Module &M) override
runOnModule - Virtual method overridden by subclasses to process the module being operated on.