#define DEBUG_TYPE "amdgpu-resource-usage"
50 "amdgpu-assume-external-call-stack-size",
55 "amdgpu-assume-dynamic-stack-object-size",
56 cl::desc(
"Assumed extra stack use if there are any "
57 "variable sized objects (in bytes)"),
61 "Function register usage analysis",
true,
true)
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  // Look through global aliases to the underlying function.
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
}
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    // Any explicit use, or any use outside a FLAT instruction, counts as a
    // real use of the register.
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }
  return false;
}
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}
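// The total is the explicit count plus whatever getNumExtraSGPRs() reserves
// at the end of the SGPR file for the VCC, FLAT_SCRATCH and XNACK_MASK
// registers the function uses; the exact number of extra SGPRs is
// subtarget-dependent.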
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);
  // Walk the callgraph in post order so that callees are analyzed before
  // their callers.
  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }
  // Functions that are unreachable from any kernel are not visited by the
  // post-order traversal above; still analyze them so every generated
  // function has resource counts to report.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Skip functions visited in the first pass.
      continue;

    SIFunctionResourceInfo &Info = CI.first->second;
    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }
  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}
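// Note the ordering guarantee exploited above: because the callgraph is
// walked in post order, every direct callee already has a finished entry in
// CallGraphResourceInfo when its callers are analyzed, so callee counts can
// be merged directly without a fixpoint iteration. Only indirect calls need
// the separate propagation step afterwards.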
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // FLAT instructions always implicitly use FLAT_SCR. If those implicit uses
  // are the only uses and no flat scratch init is required, the register is
  // not really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
      !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
      !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))
    Info.UsesFlatScratch = false;
  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
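// Worked example with the default flag values (hypothetical frame): a
// realigned frame with getStackSize() == 64, one dynamic alloca and
// getMaxAlign() == 16 is reported as 64 + 4096 + 16 = 4176 bytes of private
// segment, before any callee frame sizes are added below.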
  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily. A tail call isn't considered a call for MachineFrameInfo's
  // purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. Registers start at 0, so add one
    // to get the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }
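// On subtargets without MAI instructions the AGPR scan is skipped entirely,
// so Info.NumAGPR keeps its default of zero on this fast path.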
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMPs to be used; anything else should have a
          // known base register class.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }
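      // Example: an operand in VReg_128 based at v10 covers v[10:13], so
      // HWReg = 10, Width = 4 and MaxUsed = 13. The running maxima only ever
      // grow; narrower uses seen later cannot lower them.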
513 TII->getNamedOperand(
MI, AMDGPU::OpName::callee);
517 CallGraphResourceInfo.end();
528 I = CallGraphResourceInfo.find(
Callee);
532 Info.HasRecursion =
true;
536 if (!
MI.isReturn()) {
544 std::max(CalleeFrameSize,
549 if (IsIndirect ||
I == CallGraphResourceInfo.end()) {
551 std::max(CalleeFrameSize,
556 Info.UsesFlatScratch =
ST.hasFlatAddressSpace();
557 Info.HasDynamicallySizedStack =
true;
558 Info.HasIndirectCall =
true;
562 MaxSGPR = std::max(
I->second.NumExplicitSGPR - 1, MaxSGPR);
563 MaxVGPR = std::max(
I->second.NumVGPR - 1, MaxVGPR);
564 MaxAGPR = std::max(
I->second.NumAGPR - 1, MaxAGPR);
566 std::max(
I->second.PrivateSegmentSize, CalleeFrameSize);
567 Info.UsesVCC |=
I->second.UsesVCC;
568 Info.UsesFlatScratch |=
I->second.UsesFlatScratch;
569 Info.HasDynamicallySizedStack |=
I->second.HasDynamicallySizedStack;
570 Info.HasRecursion |=
I->second.HasRecursion;
571 Info.HasIndirectCall |=
I->second.HasIndirectCall;
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}
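// For functions containing indirect calls the counts computed above are only
// lower bounds; propagateIndirectCallRegisterUsage() below raises them once
// the whole module has been analyzed.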
void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum register usage over all non-kernel functions; any of
  // them is a potential target of an indirect call.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Functions with an indirect call conservatively inherit that maximum.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}
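// Downstream consumers (the AMDGPUAsmPrinter, for instance) read these
// results when emitting kernel descriptors and metadata, so an indirect call
// anywhere in a kernel's callgraph inflates that kernel's reported register
// counts to the conservative module-wide non-kernel maximum.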