//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64),
                      cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

char AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V = nullptr;
    const Value *Base = nullptr;
    int64_t Offset = 0;
    MemAccessInfo() = default;
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is a large stride access.
  /// The purpose is to identify memory access patterns like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory accesses will be
  /// marked as large stride memory accesses.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isGlobalLoadUsedInBB(const Instruction &) const;
};
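
// Returns the pointer operand and the type of the accessed data for loads,
// stores, atomics and memory intrinsics; returns {nullptr, nullptr} for
// instructions that do not access memory.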
static std::pair<const Value *, const Type *>
getMemoryInstrPtrAndType(const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst))
    return {LI->getPointerOperand(), LI->getType()};
  if (auto SI = dyn_cast<StoreInst>(Inst))
    return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst))
    return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst))
    return {AI->getPointerOperand(), AI->getValOperand()->getType()};
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst))
    return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};

  return {nullptr, nullptr};
}
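
// Returns true if the address used by this memory instruction is itself
// produced by a load from global (or flat) memory, i.e. the access is
// indirect. The pointer is traced backwards through GEPs, casts, binary
// operators, selects and extractelement instructions using a small worklist.
// Illustrative IR sketch (not from the original source):
//   %p = load ptr addrspace(1), ptr addrspace(1) %table
//   %v = load i32, ptr addrspace(1) %p   ; counted as an indirect access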
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << "  check: " << *V << '\n');

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M)) {
        LLVM_DEBUG(dbgs() << "    is IA\n");
        return true;
      }
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    LLVM_DEBUG(dbgs() << "    dropped\n");
  }

  LLVM_DEBUG(dbgs() << "  is not IA\n");
  return false;
}

// Returns true if the global load `I` is used in its own basic block.
bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
  const auto *Ld = dyn_cast<LoadInst>(&I);
  if (!Ld)
    return false;
  if (!isGlobalAddr(Ld->getPointerOperand()))
    return false;

  for (const User *Usr : Ld->users()) {
    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
      if (UsrInst->getParent() == I.getParent())
        return true;
    }
  }

  return false;
}

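// Walks every instruction of F and accumulates the cost counters in FIM[&F]:
// total instruction cost, memory instruction cost, and the extra costs for
// indirect-access and large-stride memory instructions. Costs of callees
// already present in FIM are folded in at call sites. If global loads that
// are used within their own basic block account for more than half of that
// block, the function is flagged as having dense global memory access.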
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    unsigned UsedGlobalLoadsInBB = 0;
    for (auto &I : B) {
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
        // TODO: Check if the global load and its user are close to each other
        // instead (Or do this analysis in GCNSchedStrategy?).
        if (isGlobalLoadUsedInBB(I))
          UsedGlobalLoadsInBB += Size;
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
          FI.LSMInstCost += Size;
        FI.MemInstCost += Size;
        FI.InstCost += Size;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCost;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCost += Loc->second.MemInstCost;
        FI.InstCost += Loc->second.InstCost;
        FI.IAMInstCost += Loc->second.IAMInstCost;
        FI.LSMInstCost += Loc->second.LSMInstCost;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCost;
      } else {
        ++FI.InstCost;
      }
    }

    if (!FI.HasDenseGlobalMemAcc) {
      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
      if (GlobalMemAccPercentage > 50) {
        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
                          << B.getName() << " has " << GlobalMemAccPercentage
                          << "% global memory access\n");
        FI.HasDenseGlobalMemAcc = true;
      }
    }
  }

  return &FI;
}

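// Analyzes a single function and, when the thresholds are exceeded, marks it
// with the "amdgpu-memory-bound" and/or "amdgpu-wave-limiter" attributes (the
// latter only for entry functions). Returns true if an attribute was added.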
bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
                    << '\n'
                    << " IAMInst cost: " << Info->IAMInstCost << '\n'
                    << " LSMInst cost: " << Info->LSMInstCost << '\n'
                    << " TotalInst cost: " << Info->InstCost << '\n');

  bool Changed = false;

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
    Changed = true;
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
    Changed = true;
  }

  return Changed;
}

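// A function is considered memory bound if any basic block has dense global
// memory access, or if memory instructions account for more than
// MemBoundThresh percent of the total instruction cost.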
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  // Reverting optimal scheduling in favour of occupancy with basic block(s)
  // having dense global memory access can potentially hurt performance.
  if (FI.HasDenseGlobalMemAcc)
    return true;

  return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}

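// Wave limiting is suggested when the weighted memory cost, with indirect and
// large-stride accesses scaled by IAWeight and LSWeight respectively, exceeds
// LimitWaveThresh percent of the total instruction cost.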
bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
           FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
}

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

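// Compares this access against the previous one seen in the same basic block
// (LastAccess, reset at the start of each block in visit()) and, if the
// current access has a resolvable base, records it as the new reference.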
bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

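// Decomposes the pointer operand of Inst into a base pointer plus constant
// offset. Accesses to local (LDS) memory are returned empty so that they are
// never treated as large stride.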
AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtrAndType(Inst).first;

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

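// Two accesses form a large stride when they share the same base pointer and
// their constant offsets differ by more than LargeStrideThresh.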
bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
} // namespace

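// Pass entry point: runs the analysis on every defined function in the SCC.
// Results are cached in FIM, so the costs of callees processed in earlier
// SCCs can be reused when visiting their callers.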
bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}

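// Query helpers for the rest of the backend; both simply look up the cached
// FuncInfo computed by runOnSCC.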
bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}