// LLVM 13.0.0git — AMDGPUAnnotateKernelFeatures.cpp
// (doxygen-extracted copy; navigation text removed)
1 //===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This pass adds target attributes to functions which use intrinsics
10 /// which will impact calling convention lowering.
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"

23 #define DEBUG_TYPE "amdgpu-annotate-kernel-features"
24 
25 using namespace llvm;
26 
27 namespace {
28 static constexpr StringLiteral ImplicitAttrNames[] = {
29  // X ids unnecessarily propagated to kernels.
30  "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
31  "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
32  "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
33  "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
34  "amdgpu-queue-ptr", "amdgpu-implicitarg-ptr"};
35 
36 class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
37 private:
38  const TargetMachine *TM = nullptr;
40 
41  bool addFeatureAttributes(Function &F);
42  bool processUniformWorkGroupAttribute();
43  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
44 
45 public:
46  static char ID;
47 
48  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
49 
50  bool doInitialization(CallGraph &CG) override;
51  bool runOnSCC(CallGraphSCC &SCC) override;
52 
53  StringRef getPassName() const override {
54  return "AMDGPU Annotate Kernel Features";
55  }
56 
57  void getAnalysisUsage(AnalysisUsage &AU) const override {
58  AU.setPreservesAll();
60  }
61 
62  static bool visitConstantExpr(const ConstantExpr *CE);
63  static bool visitConstantExprsRecursively(
64  const Constant *EntryC,
65  SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
66  bool HasApertureRegs);
67 };
68 
69 } // end anonymous namespace
70 
72 
74 
75 INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
76  "Add AMDGPU function attributes", false, false)
77 
78 
79 // The queue ptr is only needed when casting to flat, not from it.
80 static bool castRequiresQueuePtr(unsigned SrcAS) {
81  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
82 }
83 
84 static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
86 }
87 
88 static bool isDSAddress(const Constant *C) {
89  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
90  if (!GV)
91  return false;
92  unsigned AS = GV->getAddressSpace();
94 }
95 
96 bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
97  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
98  unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
99  return castRequiresQueuePtr(SrcAS);
100  }
101 
102  return false;
103 }
104 
105 bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
106  const Constant *EntryC,
107  SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
108  bool IsFunc, bool HasApertureRegs) {
109 
110  if (!ConstantExprVisited.insert(EntryC).second)
111  return false;
112 
114  Stack.push_back(EntryC);
115 
116  while (!Stack.empty()) {
117  const Constant *C = Stack.pop_back_val();
118 
119  // We need to trap on DS globals in non-entry functions.
120  if (IsFunc && isDSAddress(C))
121  return true;
122 
123  // Check this constant expression.
124  if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
125  if (!HasApertureRegs && visitConstantExpr(CE))
126  return true;
127  }
128 
129  // Visit all sub-expressions.
130  for (const Use &U : C->operands()) {
131  const auto *OpC = dyn_cast<Constant>(U);
132  if (!OpC)
133  continue;
134 
135  if (!ConstantExprVisited.insert(OpC).second)
136  continue;
137 
138  Stack.push_back(OpC);
139  }
140  }
141 
142  return false;
143 }
144 
145 // We do not need to note the x workitem or workgroup id because they are always
146 // initialized.
147 //
148 // TODO: We should not add the attributes if the known compile time workgroup
149 // size is 1 for y/z.
151  bool &NonKernelOnly,
152  bool &IsQueuePtr) {
153  switch (ID) {
154  case Intrinsic::amdgcn_workitem_id_x:
155  NonKernelOnly = true;
156  return "amdgpu-work-item-id-x";
157  case Intrinsic::amdgcn_workgroup_id_x:
158  NonKernelOnly = true;
159  return "amdgpu-work-group-id-x";
160  case Intrinsic::amdgcn_workitem_id_y:
161  case Intrinsic::r600_read_tidig_y:
162  return "amdgpu-work-item-id-y";
163  case Intrinsic::amdgcn_workitem_id_z:
164  case Intrinsic::r600_read_tidig_z:
165  return "amdgpu-work-item-id-z";
166  case Intrinsic::amdgcn_workgroup_id_y:
167  case Intrinsic::r600_read_tgid_y:
168  return "amdgpu-work-group-id-y";
169  case Intrinsic::amdgcn_workgroup_id_z:
170  case Intrinsic::r600_read_tgid_z:
171  return "amdgpu-work-group-id-z";
172  case Intrinsic::amdgcn_dispatch_ptr:
173  return "amdgpu-dispatch-ptr";
174  case Intrinsic::amdgcn_dispatch_id:
175  return "amdgpu-dispatch-id";
176  case Intrinsic::amdgcn_kernarg_segment_ptr:
177  return "amdgpu-kernarg-segment-ptr";
178  case Intrinsic::amdgcn_implicitarg_ptr:
179  return "amdgpu-implicitarg-ptr";
180  case Intrinsic::amdgcn_queue_ptr:
181  case Intrinsic::amdgcn_is_shared:
182  case Intrinsic::amdgcn_is_private:
183  // TODO: Does not require queue ptr on gfx9+
184  case Intrinsic::trap:
185  case Intrinsic::debugtrap:
186  IsQueuePtr = true;
187  return "amdgpu-queue-ptr";
188  default:
189  return "";
190  }
191 }
192 
193 static bool handleAttr(Function &Parent, const Function &Callee,
194  StringRef Name) {
195  if (Callee.hasFnAttribute(Name)) {
196  Parent.addFnAttr(Name);
197  return true;
198  }
199  return false;
200 }
201 
202 static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
203  bool &NeedQueuePtr) {
204  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
205  NeedQueuePtr = true;
206 
207  for (StringRef AttrName : ImplicitAttrNames)
208  handleAttr(Parent, Callee, AttrName);
209 }
210 
211 bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
212  bool Changed = false;
213 
214  for (auto *Node : reverse(NodeList)) {
215  Function *Caller = Node->getFunction();
216 
217  for (auto I : *Node) {
218  Function *Callee = std::get<1>(I)->getFunction();
219  if (Callee)
220  Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
221  }
222  }
223 
224  return Changed;
225 }
226 
227 bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
228  Function &Caller, Function &Callee) {
229 
230  // Check for externally defined function
231  if (!Callee.hasExactDefinition()) {
232  Callee.addFnAttr("uniform-work-group-size", "false");
233  if (!Caller.hasFnAttribute("uniform-work-group-size"))
234  Caller.addFnAttr("uniform-work-group-size", "false");
235 
236  return true;
237  }
238  // Check if the Caller has the attribute
239  if (Caller.hasFnAttribute("uniform-work-group-size")) {
240  // Check if the value of the attribute is true
241  if (Caller.getFnAttribute("uniform-work-group-size")
242  .getValueAsString().equals("true")) {
243  // Propagate the attribute to the Callee, if it does not have it
244  if (!Callee.hasFnAttribute("uniform-work-group-size")) {
245  Callee.addFnAttr("uniform-work-group-size", "true");
246  return true;
247  }
248  } else {
249  Callee.addFnAttr("uniform-work-group-size", "false");
250  return true;
251  }
252  } else {
253  // If the attribute is absent, set it as false
254  Caller.addFnAttr("uniform-work-group-size", "false");
255  Callee.addFnAttr("uniform-work-group-size", "false");
256  return true;
257  }
258  return false;
259 }
260 
/// Scan \p F and add the feature attributes it requires: implicit-argument
/// attributes copied from callees or implied by intrinsics, plus
/// "amdgpu-queue-ptr", "amdgpu-calls" and "amdgpu-stack-objects".
/// Returns true if any attribute was added.
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  // With aperture registers, is_shared/is_private style checks do not need
  // the queue pointer.
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool HaveStackObjects = false;
  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool HasIndirectCall = false;
  // "Func" here means a non-entry (callable) function, as opposed to a kernel.
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
  CallingConv::ID CC = F.getCallingConv();
  bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);

  // If this function hasAddressTaken() = true
  // then add all attributes corresponding to the implicit args.
  if (CallingConvSupportsAllImplicits &&
      F.hasAddressTaken(nullptr, true, true, true)) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      // Any alloca means the function has stack objects.
      if (isa<AllocaInst>(I)) {
        HaveStackObjects = true;
        continue;
      }

      if (auto *CB = dyn_cast<CallBase>(&I)) {
        const Function *Callee =
            dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());

        // Note the occurrence of indirect call (inline asm is not a call for
        // this purpose).
        if (!Callee) {
          if (!CB->isInlineAsm()) {
            HasIndirectCall = true;
            HaveCall = true;
          }
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          // Real (non-intrinsic) call: inherit the callee's requirements.
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;

          // Kernels get the kernarg segment pointer directly; everything
          // else goes through the intrinsic -> attribute mapping.
          if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
            F.addFnAttr("amdgpu-kernarg-segment-ptr");
          } else {
            StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
                                                     NeedQueuePtr);
            // NonKernelOnly attrs (the x ids) are skipped on kernels, where
            // those values are always initialized.
            if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
              F.addFnAttr(AttrName);
              Changed = true;
            }
          }
        }
      }

      // Once the queue ptr is known to be needed (or unneeded for a kernel
      // with aperture registers), skip the per-instruction cast analysis.
      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      // Constant operands may hide addrspacecast expressions or DS globals.
      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
                                          HasApertureRegs)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (!IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-calls");
    Changed = true;
  }

  if (HaveStackObjects) {
    F.addFnAttr("amdgpu-stack-objects");
    Changed = true;
  }

  // This pass cannot copy attributes from callees to callers
  // if there is an indirect call and in thus such cases,
  // hasAddressTaken() would be false for kernels and functions
  // making an indirect call (if they are themselves not indirectly called).
  // We must tag all such kernels/functions with all implicits attributes
  // for correctness.
  // e.g.
  // 1. Kernel K1 makes an indirect call to function F1.
  //    Without detecting an indirect call in K1, this pass will not
  //    add all implicit args to K1 (which is incorrect).
  // 2. Kernel K1 makes direct call to F1 which makes indirect call to function
  //    F2.
  //    Without detecting an indirect call in F1 (whose hasAddressTaken() is
  //    false), the pass will not add all implicit args to F1 (which is
  //    essential for correctness).
  if (CallingConvSupportsAllImplicits && HasIndirectCall) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  return Changed;
}
392 
393 bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
394  bool Changed = false;
395 
396  for (CallGraphNode *I : SCC) {
397  // Build a list of CallGraphNodes from most number of uses to least
398  if (I->getNumReferences())
399  NodeList.push_back(I);
400  else {
401  processUniformWorkGroupAttribute();
402  NodeList.clear();
403  }
404 
405  Function *F = I->getFunction();
406  // Ignore functions with graphics calling conventions, these are currently
407  // not allowed to have kernel arguments.
408  if (!F || F->isDeclaration() || AMDGPU::isGraphics(F->getCallingConv()))
409  continue;
410  // Add feature attributes
411  Changed |= addFeatureAttributes(*F);
412  }
413 
414  return Changed;
415 }
416 
417 bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
418  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
419  if (!TPC)
420  report_fatal_error("TargetMachine is required");
421 
422  TM = &TPC->getTM<TargetMachine>();
423  return false;
424 }
425 
427  return new AMDGPUAnnotateKernelFeatures();
428 }
llvm
Definition: AllocatorList.h:23
llvm::StringRef::empty
LLVM_NODISCARD bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:156
llvm::SystemZISD::TM
@ TM
Definition: SystemZISelLowering.h:65
llvm::AddrSpaceCastInst::getSrcAddressSpace
unsigned getSrcAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:5227
llvm::Function
Definition: Function.h:61
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::CallGraph
The basic data container for the call graph of a Module of IR.
Definition: CallGraph.h:73
llvm::reverse
auto reverse(ContainerTy &&C, std::enable_if_t< has_rbegin< ContainerTy >::value > *=nullptr)
Definition: STLExtras.h:329
llvm::rdf::NodeList
SmallVector< NodeAddr< NodeBase * >, 4 > NodeList
Definition: RDFGraph.h:512
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::GCNSubtarget
Definition: GCNSubtarget.h:38
llvm::Intrinsic::not_intrinsic
@ not_intrinsic
Definition: Intrinsics.h:45
llvm::Function::addFnAttr
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.h:255
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::StringLiteral
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:872
llvm::CallGraphSCC
CallGraphSCC - This is a single SCC that a CallGraphSCCPass is run on.
Definition: CallGraphSCCPass.h:87
TargetMachine.h
llvm::AddrSpaceCastInst
This class represents a conversion between pointers from one address space to another.
Definition: Instructions.h:5178
castRequiresQueuePtr
static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC)
Definition: AMDGPUAnnotateKernelFeatures.cpp:84
GCNSubtarget.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
NodeList
Definition: MicrosoftDemangle.cpp:37
llvm::pdb::PDB_SymType::Caller
@ Caller
llvm::Instruction
Definition: Instruction.h:45
llvm::report_fatal_error
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:140
llvm::CallingConv::AMDGPU_Gfx
@ AMDGPU_Gfx
Calling convention used for AMD graphics targets.
Definition: CallingConv.h:245
llvm::CallGraphNode
A node in the call graph for a module.
Definition: CallGraph.h:167
llvm::createAMDGPUAnnotateKernelFeaturesPass
Pass * createAMDGPUAnnotateKernelFeaturesPass()
Definition: AMDGPUAnnotateKernelFeatures.cpp:426
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:361
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::AMDGPU::isEntryFunctionCC
bool isEntryFunctionCC(CallingConv::ID CC)
Definition: AMDGPUBaseInfo.cpp:1336
llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:365
llvm::GlobalValue
Definition: GlobalValue.h:44
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::CallGraphSCCPass::getAnalysisUsage
void getAnalysisUsage(AnalysisUsage &Info) const override
getAnalysisUsage - For this class, we declare that we require and preserve the call graph.
Definition: CallGraphSCCPass.cpp:657
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
I
#define I(x, y, z)
Definition: MD5.cpp:59
TargetPassConfig.h
llvm::pdb::PDB_MemoryType::Stack
@ Stack
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
llvm::WinEH::EncodingType::CE
@ CE
Windows NT (Windows on ARM)
copyFeaturesToFunction
static void copyFeaturesToFunction(Function &Parent, const Function &Callee, bool &NeedQueuePtr)
Definition: AMDGPUAnnotateKernelFeatures.cpp:202
llvm::AMDGPU::CPol::SCC
@ SCC
Definition: SIDefines.h:285
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:57
AMDGPU.h
llvm::AMDGPU::isGraphics
bool isGraphics(CallingConv::ID cc)
Definition: AMDGPUBaseInfo.cpp:1328
CallGraphSCCPass.h
intrinsicToAttrName
static StringRef intrinsicToAttrName(Intrinsic::ID ID, bool &NonKernelOnly, bool &IsQueuePtr)
Definition: AMDGPUAnnotateKernelFeatures.cpp:150
Callee
amdgpu Simplify well known AMD library false FunctionCallee Callee
Definition: AMDGPULibCalls.cpp:205
llvm::ConstantExpr
A constant value that is initialized with an expression using other constant values.
Definition: Constants.h:931
llvm::GraphProgram::Name
Name
Definition: GraphWriter.h:52
llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130
DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPUAnnotateKernelFeatures.cpp:23
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:364
llvm::CallGraphSCCPass
Definition: CallGraphSCCPass.h:34
llvm::GlobalValue::getAddressSpace
unsigned getAddressSpace() const
Definition: Globals.cpp:112
CallGraph.h
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
isDSAddress
static bool isDSAddress(const Constant *C)
Definition: AMDGPUAnnotateKernelFeatures.cpp:88
handleAttr
static bool handleAttr(Function &Parent, const Function &Callee, StringRef Name)
Definition: AMDGPUAnnotateKernelFeatures.cpp:193
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::AMDGPUAnnotateKernelFeaturesID
char & AMDGPUAnnotateKernelFeaturesID
Definition: AMDGPUAnnotateKernelFeatures.cpp:73
INITIALIZE_PASS
INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, "Add AMDGPU function attributes", false, false) static bool castRequiresQueuePtr(unsigned SrcAS)
Definition: AMDGPUAnnotateKernelFeatures.cpp:75
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:44
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38