#define DEBUG_TYPE "amdgpu-resource-usage"
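// Worst-case stack sizes assumed when a callee's actual stack usage cannot be
// computed; both values can be overridden on the command line.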
50 "amdgpu-assume-external-call-stack-size",
55 "amdgpu-assume-dynamic-stack-object-size",
56 cl::desc(
"Assumed extra stack use if there are any "
57 "variable sized objects (in bytes)"),
61 "Function register usage analysis",
true,
true)
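// getCalleeFunction: resolve the callee of a call operand, looking through
// global aliases to the aliased function.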
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
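// hasAnyNonFlatUseOfReg: returns true if Reg has a use that is not just an
// implicit operand of a FLAT instruction.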
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
                                   ST.getTargetID().isXnackOnOrAny());
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  bool HasIndirectCall = false;
    if (!F || F->isDeclaration())
      continue;
    assert(MF && "function must have been generated already");
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();
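// Compute the register counts, stack usage, and related flags for a single
// machine function, folding in resource info from direct callees that have
// already been analyzed.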
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;
  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
    Info.UsesFlatScratch = false;
  Info.PrivateSegmentSize = FrameInfo.getStackSize();
  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
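  // If there are no calls, MachineRegisterInfo alone can tell us the highest
  // register of each kind that is actually used.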
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }
    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }
    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;
    return Info;
  }
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;
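  // With calls present, walk every instruction operand and track the highest
  // SGPR, VGPR, and AGPR indices actually referenced. Special registers are
  // filtered out by the switch below before counting.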
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }
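        // Any remaining register is classified by its register class to
        // determine which register file it lives in and how many 32-bit
        // registers wide it is.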
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // Only TTMP registers or registers without a register class are
          // expected here.
          assert((AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR)
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        else if (IsAGPR)
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        else
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
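        // Calls: fold the callee's resource usage into this function's.
        // Unknown or indirect callees fall back to the assumed worst-case
        // stack size from the options above.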
        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);
        const Function *Callee = getCalleeFunction(*CalleeOp);
        auto I = CallGraphResourceInfo.end();
        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;
          if (!MI.isReturn())
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
        }
        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize = std::max(
              CalleeFrameSize,
              static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // Direct call with known resource info: fold the callee's usage
          // into ours.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}
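// Functions containing indirect calls could reach any non-kernel function in
// the module, so assume the maximum register usage seen across those
// functions.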
void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;
  for (const auto &I : CallGraphResourceInfo) {
    if (AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv()))
      continue;
    auto &Info = I.getSecond();
    NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
    NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
    NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
  }
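  // Apply those maxima to every function that makes an indirect call.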
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}