#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64),
                      cl::Hidden,
                      cl::desc("Large stride memory access threshold"));
STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
61 "Analysis if a function is memory bound",
true,
true)
struct AMDGPUPerfHint {
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}
  struct MemAccessInfo {
    const Value *V;
    const Value *Base;
    int64_t Offset;
    MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };
  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info.
  bool isIndirectAccess(const Instruction *Inst) const;
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isConstantAddr(const Value *V) const;

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;
  const DataLayout *DL;
  const TargetLowering *TLI;
};
static std::pair<const Value *, const Type *>
getMemoryInstrPtrAndType(const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst))
    return {LI->getPointerOperand(), LI->getType()};
  if (auto SI = dyn_cast<StoreInst>(Inst))
    return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst))
    return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst))
    return {AI->getPointerOperand(), AI->getValOperand()->getType()};
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst))
    return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};

  return {nullptr, nullptr};
}
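// Illustrative note (added, not in the upstream source): for a store such as
//   store i32 %v, ptr addrspace(1) %p
// the helper above yields {%p, i32}; for an instruction that does not touch
// memory it yields {nullptr, nullptr}, which the callers below treat as
// "not a memory access".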
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M))
        return true;
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }
  }

  return false;
}
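// Illustrative example (added, not in the upstream source): the walk above
// flags loads whose address is itself loaded from global memory, e.g.
//   %p = load ptr addrspace(1), ptr addrspace(1) %pp ; address read from memory
//   %v = load i32, ptr addrspace(1) %p               ; counted as indirect access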
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    for (auto &I : B) {
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
          FI.LSMInstCost += Size;
        FI.MemInstCost += Size;
        FI.InstCost += Size;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCost;
          continue;
        }

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCost += Loc->second.MemInstCost;
        FI.InstCost += Loc->second.InstCost;
        FI.IAMInstCost += Loc->second.IAMInstCost;
        FI.LSMInstCost += Loc->second.LSMInstCost;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into the load or store.
          continue;
        ++FI.InstCost;
      } else {
        ++FI.InstCost;
      }
    }
  }

  return &FI;
}
256 DL = &
M.getDataLayout();
258 if (
F.hasFnAttribute(
"amdgpu-wave-limiter") &&
259 F.hasFnAttribute(
"amdgpu-memory-bound"))
266 <<
" IAMInst cost: " <<
Info->IAMInstCost <<
'\n'
267 <<
" LSMInst cost: " <<
Info->LSMInstCost <<
'\n'
268 <<
" TotalInst cost: " <<
Info->InstCost <<
'\n');
270 bool Changed =
false;
272 if (isMemBound(*
Info)) {
275 F.addFnAttr(
"amdgpu-memory-bound",
"true");
282 F.addFnAttr(
"amdgpu-wave-limiter",
"true");
bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat very likely points to global memory too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}
bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}
bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}
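// Illustrative pattern (added, paraphrasing the pass's intent): consecutive
// accesses off the same base whose offsets differ by more than
// LargeStrideThresh are counted as large stride, e.g.
//   x = a[i];        // establishes the reference access
//   y = a[i + 1000]; // large stride relative to the previous access
//   z = a[i + 2000]; // large stride again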
AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtrAndType(Inst).first;
  if (isLocalAddr(MO)) // Local accesses are not large-stride candidates.
    return MAI;
  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}
bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    return As == AMDGPUAS::CONSTANT_ADDRESS ||
           As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  }
  return false;
}
bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {
  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;

  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
} // namespace
bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;
  const TargetMachine &TM = TPC->getTM<TargetMachine>();

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;
    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }
  return Changed;
}
bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}
bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}
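// Illustrative usage sketch (added, assumed caller, not part of this file): a
// legacy-pass-manager backend pass that requires AMDGPUPerfHintAnalysis could
// query the per-function results like this:
//
//   const auto &PHA = getAnalysis<AMDGPUPerfHintAnalysis>();
//   if (PHA.isMemoryBound(&F) || PHA.needsWaveLimiter(&F)) {
//     // e.g. request fewer waves per EU to reduce cache thrashing
//   }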