//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

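// Note: these are hidden cl::opt knobs, so the thresholds and weights can be
// tuned from the command line without rebuilding, e.g. (illustrative):
//   llc -mtriple=amdgcn -amdgpu-membound-threshold=30 kernel.ll
// or through clang with -mllvm -amdgpu-membound-threshold=30.
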
STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

char llvm::AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V = nullptr;
    const Value *Base = nullptr;
    int64_t Offset = 0;
    MemAccessInfo() = default;
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is a large-stride memory access.
  /// The purpose is to identify memory access patterns like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory accesses will be
  /// marked as large-stride accesses.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isGlobalLoadUsedInBB(const Instruction &) const;
};

static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
    const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst))
    return {LI->getPointerOperand(), LI->getType()};
  if (auto SI = dyn_cast<StoreInst>(Inst))
    return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst))
    return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst))
    return {AI->getPointerOperand(), AI->getValOperand()->getType()};
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst))
    return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};

  return {nullptr, nullptr};
}

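// An access is considered "indirect" when the address it loads from or stores
// to is itself (transitively) produced by a load from global memory, i.e. a
// pointer-chasing pattern. Illustrative IR shape this is meant to catch:
//   %p = load ptr addrspace(1), ptr addrspace(1) %table  ; load a pointer
//   %v = load i32, ptr addrspace(1) %p                   ; use it as an address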
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << " check: " << *V << '\n');

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M)) {
        LLVM_DEBUG(dbgs() << " is IA\n");
        return true;
      }
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    LLVM_DEBUG(dbgs() << " dropped\n");
  }

  LLVM_DEBUG(dbgs() << " is not IA\n");
  return false;
}

// Returns true if the global load `I` is used in its own basic block.
bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
  const auto *Ld = dyn_cast<LoadInst>(&I);
  if (!Ld)
    return false;
  if (!isGlobalAddr(Ld->getPointerOperand()))
    return false;

  for (const User *Usr : Ld->users()) {
    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
      if (UsrInst->getParent() == I.getParent())
        return true;
    }
  }

  return false;
}

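// Walks every basic block of F and accumulates the per-function costs used by
// the heuristics below: InstCost approximates the total instruction count,
// MemInstCost the memory instructions (in 32-bit units), and IAMInstCost /
// LSMInstCost the indirect-access and large-stride subsets. Costs of callees
// already recorded in FIM are folded into the caller.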
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    unsigned UsedGlobalLoadsInBB = 0;
    for (auto &I : B) {
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
        // TODO: Check if the global load and its user are close to each other
        // instead (Or do this analysis in GCNSchedStrategy?).
        if (isGlobalLoadUsedInBB(I))
          UsedGlobalLoadsInBB += Size;
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
          FI.LSMInstCost += Size;
        FI.MemInstCost += Size;
        FI.InstCost += Size;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCost;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCost += Loc->second.MemInstCost;
        FI.InstCost += Loc->second.InstCost;
        FI.IAMInstCost += Loc->second.IAMInstCost;
        FI.LSMInstCost += Loc->second.LSMInstCost;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCost;
      } else {
        ++FI.InstCost;
      }
    }

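    // A block counts as having dense global memory access once the 32-bit
    // units loaded from global memory and consumed in the same block exceed
    // half of the block's instruction count. Illustrative: 20 such units in a
    // 32-instruction block gives 20 * 100 / 32 = 62% > 50%.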
    if (!FI.HasDenseGlobalMemAcc) {
      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
      if (GlobalMemAccPercentage > 50) {
        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
                          << B.getName() << " has " << GlobalMemAccPercentage
                          << "% global memory access\n");
        FI.HasDenseGlobalMemAcc = true;
      }
    }
  }

  return &FI;
}

bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
                    << '\n'
                    << " IAMInst cost: " << Info->IAMInstCost << '\n'
                    << " LSMInst cost: " << Info->LSMInstCost << '\n'
                    << " TotalInst cost: " << Info->InstCost << '\n');

  bool Changed = false;

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
    Changed = true;
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
    Changed = true;
  }

  return Changed;
}

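// For reference, a kernel flagged by both heuristics above ends up carrying
// string attributes along the lines of (illustrative):
//   attributes #0 = { "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" }
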
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  // Giving up an optimal schedule in favour of higher occupancy can hurt
  // performance for basic blocks with dense global memory access.
  if (FI.HasDenseGlobalMemAcc)
    return true;

  return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}

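// With the default weights (IAWeight = LSWeight = 1000, threshold 50%), even a
// handful of indirect or large-stride accesses dominates the score.
// Illustrative: InstCost = 100, MemInstCost = 10, IAMInstCost = 2,
// LSMInstCost = 0 gives (10 + 2 * 1000 + 0) * 100 / 100 = 2010 > 50, so the
// kernel gets wave-limited.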
bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
           FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
}

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtrAndType(Inst).first;

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

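// Two accesses compare as "large stride" only when they share the same base
// pointer and their constant offsets differ by more than the threshold.
// Illustrative with the default threshold of 64: offsets 0 and 4000 from the
// same base give |4000 - 0| = 4000 > 64, so the later access is flagged.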
bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
} // namespace

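// Runs as a CallGraphSCC pass: SCCs are visited bottom-up, so by the time a
// caller is processed its callees' costs are already in FIM and visit() can
// fold them into the caller's totals.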
bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}

bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}