//===- AMDGPULDSUtils.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// AMDGPU LDS related helper utility functions.
//
//===----------------------------------------------------------------------===//

13 #include "AMDGPULDSUtils.h"
14 #include "Utils/AMDGPUBaseInfo.h"
16 #include "llvm/ADT/SetVector.h"
18 #include "llvm/IR/Constants.h"
20 
using namespace llvm;

namespace llvm {

namespace AMDGPU {

// A helper class for collecting all reachable callees for each kernel defined
// within the module.
class CollectReachableCallees {
30  Module &M;
31  CallGraph CG;
32  SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions;
33 
34  // Collect all address taken functions within the module.
35  void collectAddressTakenFunctions() {
36  auto *ECNode = CG.getExternalCallingNode();
37 
38  for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) {
39  auto *CGN = GI->second;
40  auto *F = CGN->getFunction();
41  if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F))
42  continue;
43  AddressTakenFunctions.insert(CGN);
44  }
45  }

  // For a given kernel, collect all of its reachable non-kernel functions.
  SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) {
    SmallPtrSet<Function *, 8> ReachableCallees;

    // Call graph node which represents this kernel.
    auto *KCGN = CG[K];

    // Go through all call graph nodes reachable from the node representing
    // this kernel and visit all their call sites. If a call site is direct,
    // add the corresponding callee to the reachable-callee set. If it is
    // indirect, resolve it to the set of potentially reachable callees, add
    // those to the reachable-callee set, and repeat the process for the newly
    // added potential callee nodes.
    //
    // FIXME: Need to handle bit-casted function pointers.
    //
    SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN));
    SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
    while (!CGNStack.empty()) {
      auto *CGN = CGNStack.pop_back_val();

      if (!VisitedCGNodes.insert(CGN).second)
        continue;

      for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
        auto *RCB = cast<CallBase>(GI->first.getValue());
        auto *RCGN = GI->second;

        if (auto *DCallee = RCGN->getFunction()) {
          ReachableCallees.insert(DCallee);
        } else if (RCB->isIndirectCall()) {
          auto *RCBFTy = RCB->getFunctionType();
          for (auto *ACGN : AddressTakenFunctions) {
            auto *ACallee = ACGN->getFunction();
            if (ACallee->getFunctionType() == RCBFTy) {
              ReachableCallees.insert(ACallee);
              CGNStack.append(df_begin(ACGN), df_end(ACGN));
            }
          }
        }
      }
    }

    return ReachableCallees;
  }

public:
  explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) {
    // Collect address taken functions.
    collectAddressTakenFunctions();
  }

  void collectReachableCallees(
      DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
    // Collect reachable callee set for each kernel defined in the module.
    for (Function &F : M.functions()) {
      if (!AMDGPU::isKernelCC(&F))
        continue;
      Function *K = &F;
      KernelToCallees[K] = collectReachableCallees(K);
    }
  }
};

// Collect reachable callees for each kernel defined in the module M and
// return them in KernelToCallees.
void collectReachableCallees(
    Module &M,
    DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
  CollectReachableCallees CRC{M};
  CRC.collectReachableCallees(KernelToCallees);
}
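
// Illustrative sketch (hypothetical helper, not one of the utilities above):
// a client that only needs to know whether a non-kernel function is reachable
// from some kernel could build the map once and query it as follows.
static bool isReachableFromAnyKernel(Module &M, const Function *Callee) {
  DenseMap<Function *, SmallPtrSet<Function *, 8>> KernelToCallees;
  collectReachableCallees(M, KernelToCallees);
  for (const auto &KV : KernelToCallees)
    if (KV.second.count(Callee))
      return true;
  return false;
}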

// For the given LDS global GV, visit all of its users and collect the
// non-kernel functions within which GV is used.
SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) {
  SmallPtrSet<Function *, 8> LDSAccessors;
  SmallVector<User *, 8> UserStack(GV->users());
  SmallPtrSet<User *, 8> VisitedUsers;

  while (!UserStack.empty()) {
    auto *U = UserStack.pop_back_val();

    // `U` is already visited? Continue to the next one.
    if (!VisitedUsers.insert(U).second)
      continue;

    // `U` is a global variable which is initialized with LDS. Ignore LDS.
    if (isa<GlobalValue>(U))
      return SmallPtrSet<Function *, 8>();

    // Recursively explore constant users.
    if (isa<Constant>(U)) {
      append_range(UserStack, U->users());
      continue;
    }

    // `U` should be an instruction; if it belongs to a non-kernel function F,
    // then collect F.
    Function *F = cast<Instruction>(U)->getFunction();
    if (!AMDGPU::isKernelCC(F))
      LDSAccessors.insert(F);
  }

  return LDSAccessors;
}
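
// Illustrative sketch (hypothetical helper): an LDS global only needs the more
// involved lowering treatment if some non-kernel function can reach it, which
// the utility above answers directly.
static bool isAccessedOutsideKernels(GlobalVariable *GV) {
  return !collectNonKernelAccessorsOfLDS(GV).empty();
}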

// Collect the instructions that use `U` (or `U` itself when it is already an
// instruction), looking through nested constant-expression users, and group
// them by their parent function. If CollectKernelInsts is true, only
// instructions inside kernels are kept; otherwise only instructions inside
// non-kernel functions are kept.
DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
getFunctionToInstsMap(User *U, bool CollectKernelInsts) {
  DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts;
  SmallVector<User *, 8> UserStack;
  SmallPtrSet<User *, 8> VisitedUsers;

  UserStack.push_back(U);

  while (!UserStack.empty()) {
    auto *UU = UserStack.pop_back_val();

    if (!VisitedUsers.insert(UU).second)
      continue;

    if (isa<GlobalValue>(UU))
      continue;

    if (isa<Constant>(UU)) {
      append_range(UserStack, UU->users());
      continue;
    }

    auto *I = cast<Instruction>(UU);
    Function *F = I->getFunction();
    if (CollectKernelInsts) {
      if (!AMDGPU::isKernelCC(F)) {
        continue;
      }
    } else {
      if (AMDGPU::isKernelCC(F)) {
        continue;
      }
    }

    FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>()));
    FunctionToInsts[F].insert(I);
  }

  return FunctionToInsts;
}
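
// Illustrative sketch (hypothetical helper): when rewriting the uses of an LDS
// global inside one particular kernel, a pass needs the kernel-side
// instructions that reach a given user of that global. This shows how the map
// produced above might be consumed.
static SmallPtrSet<Instruction *, 8> getInstsInKernel(User *U,
                                                      Function *Kernel) {
  auto FunctionToInsts = getFunctionToInstsMap(U, /*CollectKernelInsts=*/true);
  auto It = FunctionToInsts.find(Kernel);
  if (It == FunctionToInsts.end())
    return SmallPtrSet<Instruction *, 8>();
  return It->second;
}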

bool isKernelCC(const Function *Func) {
  return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
}

Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}
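
// Illustrative sketch (hypothetical helper): getAlign() falls back to the ABI
// alignment of the value type when the global carries no explicit alignment,
// so a caller laying variables out at byte offsets can rely on a meaningful,
// non-zero alignment.
static bool isSuitableOffsetFor(const Module &M, const GlobalVariable *GV,
                                uint64_t Offset) {
  return isAligned(getAlign(M.getDataLayout(), GV), Offset);
}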

static void collectFunctionUses(User *U, const Function *F,
                                SetVector<Instruction *> &InstUsers) {
  SmallVector<User *> Stack{U};

  while (!Stack.empty()) {
    U = Stack.pop_back_val();

    if (auto *I = dyn_cast<Instruction>(U)) {
      if (I->getFunction() == F)
        InstUsers.insert(I);
      continue;
    }

    if (!isa<ConstantExpr>(U))
      continue;

    append_range(Stack, U->users());
  }
}

// Replace all uses of constant C within function F with instructions.
void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
  SetVector<Instruction *> InstUsers;

  collectFunctionUses(C, F, InstUsers);
  for (Instruction *I : InstUsers) {
    convertConstantExprsToInstructions(I, C);
  }
}
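
// Illustrative sketch (hypothetical helper): before rewriting the uses of an
// LDS global within a kernel, a lowering pass can first expand every
// constant-expression user of the global into equivalent instructions inside
// that kernel, so that a later replaceAllUsesWith only has to patch
// instructions. The user list is copied first because the expansion mutates
// the use lists being walked.
static void expandConstantUsersIn(GlobalVariable *GV, const Function *Kernel) {
  SmallVector<User *, 8> Users(GV->users());
  for (User *U : Users)
    if (auto *CE = dyn_cast<ConstantExpr>(U))
      replaceConstantUsesInFunction(CE, Kernel);
}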

// Returns true if GV has at least one user instruction, looking through
// nested users such as constant expressions.
bool hasUserInstruction(const GlobalValue *GV) {
  SmallPtrSet<const User *, 8> Visited;
  SmallVector<const User *, 16> Stack(GV->users());

  while (!Stack.empty()) {
    const User *U = Stack.pop_back_val();

    if (!Visited.insert(U).second)
      continue;

    if (isa<Instruction>(U))
      return true;

    append_range(Stack, U->users());
  }

  return false;
}

// Returns true if the given LDS global GV should be lowered into a struct
// member. When F is a kernel, the query is for kernel LDS lowering of that
// kernel; when F is null, the query is for module LDS lowering.
bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
  // We are not interested in kernel LDS lowering for module LDS itself.
  if (F && GV.getName() == "llvm.amdgcn.module.lds")
    return false;

  bool Ret = false;
  SmallPtrSet<const User *, 8> Visited;
  SmallVector<const User *, 16> Stack(GV.users());
  SmallPtrSet<const GlobalValue *, 8> GlobalUsers;

  assert(!F || isKernelCC(F));

  while (!Stack.empty()) {
    const User *V = Stack.pop_back_val();
    Visited.insert(V);

    if (auto *G = dyn_cast<GlobalValue>(V)) {
      StringRef GName = G->getName();
      if (F && GName != "llvm.used" && GName != "llvm.compiler.used") {
        // For kernel LDS lowering, if G is not the llvm.used or
        // llvm.compiler.used list, then we cannot lower the LDS GV since we
        // cannot replace the use of GV within G.
        return false;
      }
      GlobalUsers.insert(G);
      continue;
    }

    if (auto *I = dyn_cast<Instruction>(V)) {
      const Function *UF = I->getFunction();
      if (UF == F) {
        // Used from this kernel, we want to put it into the structure.
        Ret = true;
      } else if (!F) {
        // For module LDS lowering, lowering is required if the user
        // instruction is from a non-kernel function.
        Ret |= !isKernelCC(UF);
      }
      continue;
    }

    // User V should be a constant, recursively visit users of V.
    assert(isa<Constant>(V) && "Expected a constant.");
    append_range(Stack, V->users());
  }

  if (!F && !Ret) {
    // For module LDS lowering, we have not yet decided whether we should lower
    // GV or not. Explore all global users of GV, and check if at least one of
    // these global users appears as a use within an instruction (possibly a
    // nested use via a constant expression); if so, conservatively lower LDS.
    for (auto *G : GlobalUsers)
      Ret |= hasUserInstruction(G);
  }

  return Ret;
}

std::vector<GlobalVariable *> findVariablesToLower(Module &M,
                                                   const Function *F) {
  std::vector<llvm::GlobalVariable *> LocalVars;
  for (auto &GV : M.globals()) {
    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
      continue;
    }
    if (!GV.hasInitializer()) {
      // addrspace(3) without an initializer implies cuda/hip extern
      // __shared__. The semantics for such a variable appear to be that all
      // extern __shared__ variables alias one another, in which case this
      // transform is not required.
      continue;
    }
    if (!isa<UndefValue>(GV.getInitializer())) {
      // Initializers are unimplemented for local address space.
      // Leave such variables in place for consistent error reporting.
      continue;
    }
    if (GV.isConstant()) {
      // A constant undef variable can't be written to, and any load is
      // undef, so it should be eliminated by the optimizer. It could be
      // dropped by the back end if not. This pass skips over it.
      continue;
    }
    if (!shouldLowerLDSToStruct(GV, F)) {
      continue;
    }
    LocalVars.push_back(&GV);
  }
  return LocalVars;
}
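
// Illustrative sketch (hypothetical helper): given the variables selected
// above, a caller could estimate the size of the struct the lowering pass is
// about to build by laying the variables out at their preferred alignment.
// The real layout is done by the LDS lowering pass; this only shows how
// findVariablesToLower() and getAlign() fit together.
static uint64_t estimateLoweredStructSize(Module &M, const Function *F) {
  const DataLayout &DL = M.getDataLayout();
  uint64_t Offset = 0;
  for (GlobalVariable *GV : findVariablesToLower(M, F)) {
    Offset = alignTo(Offset, getAlign(DL, GV));
    Offset += DL.getTypeAllocSize(GV->getValueType()).getFixedSize();
  }
  return Offset;
}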

// Collect the globals named in llvm.used and llvm.compiler.used.
SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
  SmallPtrSet<GlobalValue *, 32> UsedList;

  SmallVector<GlobalValue *, 32> TmpVec;
  collectUsedGlobalVariables(M, TmpVec, true);
  UsedList.insert(TmpVec.begin(), TmpVec.end());

  TmpVec.clear();
  collectUsedGlobalVariables(M, TmpVec, false);
  UsedList.insert(TmpVec.begin(), TmpVec.end());

  return UsedList;
}
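
// Illustrative sketch (hypothetical helper): before erasing an LDS global that
// has been folded into the lowered struct, a pass can check that the global is
// not named in llvm.used or llvm.compiler.used. The used list is rebuilt on
// each call here purely for brevity.
static bool isSafeToEraseLDS(Module &M, GlobalValue *GV) {
  return GV->use_empty() && !getUsedList(M).count(GV);
}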

} // end namespace AMDGPU

} // end namespace llvm