LLVM  16.0.0git
AMDGPUOpenCLEnqueuedBlockLowering.cpp
Go to the documentation of this file.
1 //===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This post-linking pass replaces the function pointer of enqueued
11 // block kernel with a global variable (runtime handle) and adds
12 // "runtime-handle" attribute to the enqueued block kernel.
13 //
14 // In LLVM CodeGen the runtime-handle metadata will be translated to
15 // RuntimeHandle metadata in code object. Runtime allocates a global buffer
16 // for each kernel with RuntimeHandle metadata and saves the kernel address
17 // required for the AQL packet into the buffer. The __enqueue_kernel function
18 // in the device library knows that the invoke function pointer in the block
19 // literal is actually a runtime handle, loads the kernel address from it,
20 // and puts it into the AQL packet for dispatching.
21 //
22 // This cannot be done in FE since FE cannot create a unique global variable
23 // with external linkage across LLVM modules. The global variable with internal
24 // linkage does not work since optimization passes will try to replace loads
25 // of the global variable with its initialization value.
26 //
27 // It also identifies the kernels that directly or indirectly enqueue kernels
28 // and adds the "calls-enqueue-kernel" function attribute to them, which is
29 // used to determine whether to emit runtime metadata for the hidden kernel
30 // arguments related to kernel enqueue.
31 //
32 //===----------------------------------------------------------------------===//
33 
34 #include "AMDGPU.h"
35 #include "llvm/ADT/DenseSet.h"
36 #include "llvm/ADT/SmallString.h"
37 #include "llvm/IR/Constants.h"
38 #include "llvm/IR/Instructions.h"
39 #include "llvm/IR/Mangler.h"
40 #include "llvm/IR/Module.h"
41 #include "llvm/Pass.h"
42 #include "llvm/Support/Debug.h"
43 
44 #define DEBUG_TYPE "amdgpu-lower-enqueued-block"
45 
46 using namespace llvm;
47 
48 namespace {
49 
50 /// Lower enqueued blocks.
51 class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
52 public:
53  static char ID;
54 
55  explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}
56 
57 private:
58  bool runOnModule(Module &M) override;
59 };
60 
61 } // end anonymous namespace
62 
64 
67 
68 INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
69  "Lower OpenCL enqueued blocks", false, false)
70 
72  return new AMDGPUOpenCLEnqueuedBlockLowering();
73 }
74 
75 /// Collect direct or indirect callers of \p F and save them
76 /// to \p Callers.
77 static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
78  for (auto *U : F->users()) {
79  if (auto *CI = dyn_cast<CallInst>(&*U)) {
80  auto *Caller = CI->getParent()->getParent();
81  if (Callers.insert(Caller).second)
82  collectCallers(Caller, Callers);
83  }
84  }
85 }
86 
87 /// If \p U is instruction or constant, collect functions which directly or
88 /// indirectly use it.
90  if (auto *I = dyn_cast<Instruction>(U)) {
91  auto *F = I->getParent()->getParent();
92  if (Funcs.insert(F).second)
93  collectCallers(F, Funcs);
94  return;
95  }
96  if (!isa<Constant>(U))
97  return;
98  for (auto *UU : U->users())
99  collectFunctionUsers(&*UU, Funcs);
100 }
101 
102 bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
103  DenseSet<Function *> Callers;
104  auto &C = M.getContext();
105  bool Changed = false;
106  for (auto &F : M.functions()) {
107  if (F.hasFnAttribute("enqueued-block")) {
108  if (!F.hasName()) {
110  Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel",
111  M.getDataLayout());
112  F.setName(Name);
113  }
114  LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n');
115  auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
116  auto T = ArrayType::get(Type::getInt64Ty(C), 2);
117  auto *GV = new GlobalVariable(
118  M, T,
119  /*isConstant=*/false, GlobalValue::ExternalLinkage,
120  /*Initializer=*/Constant::getNullValue(T), RuntimeHandle,
121  /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
123  /*isExternallyInitialized=*/false);
124  LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
125 
126  for (auto *U : F.users()) {
127  auto *UU = &*U;
128  if (!isa<ConstantExpr>(UU))
129  continue;
130  collectFunctionUsers(UU, Callers);
131  auto *BitCast = cast<ConstantExpr>(UU);
132  auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType());
133  BitCast->replaceAllUsesWith(NewPtr);
134  F.addFnAttr("runtime-handle", RuntimeHandle);
135  F.setLinkage(GlobalValue::ExternalLinkage);
136  Changed = true;
137  }
138  }
139  }
140 
141  for (auto *F : Callers) {
142  if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
143  continue;
144  F->addFnAttr("calls-enqueue-kernel");
145  LLVM_DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n');
146  }
147  return Changed;
148 }
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::AArch64PACKey::ID
ID
Definition: AArch64BaseInfo.h:818
llvm::ModulePass
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:248
T
llvm::Function
Definition: Function.h:60
Pass.h
llvm::GlobalValue::NotThreadLocal
@ NotThreadLocal
Definition: GlobalValue.h:192
llvm::GlobalVariable
Definition: GlobalVariable.h:39
Module.h
llvm::detail::DenseSetImpl::insert
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:372
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::ConstantExpr::getPointerCast
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2014
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
Constants.h
SmallString.h
llvm::User
Definition: User.h:44
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
DenseSet.h
llvm::Mangler::getNameWithPrefix
void getNameWithPrefix(raw_ostream &OS, const GlobalValue *GV, bool CannotUsePrivateLabel) const
Print the appropriate prefix and the specified global variable's name.
Definition: Mangler.cpp:119
llvm::AMDGPUOpenCLEnqueuedBlockLoweringID
char & AMDGPUOpenCLEnqueuedBlockLoweringID
Definition: AMDGPUOpenCLEnqueuedBlockLowering.cpp:65
collectCallers
static void collectCallers(Function *F, DenseSet< Function * > &Callers)
Collect direct or indirect callers of F and save them to Callers.
Definition: AMDGPUOpenCLEnqueuedBlockLowering.cpp:77
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::SmallString< 64 >
llvm::DenseSet< Function * >
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::ArrayType::get
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:638
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass
ModulePass * createAMDGPUOpenCLEnqueuedBlockLoweringPass()
INITIALIZE_PASS
INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE, "Lower OpenCL enqueued blocks", false, false) ModulePass *llvm
Definition: AMDGPUOpenCLEnqueuedBlockLowering.cpp:68
Mangler.h
AMDGPU.h
llvm::CallingConv::AMDGPU_KERNEL
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:201
DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPUOpenCLEnqueuedBlockLowering.cpp:44
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:240
llvm::GraphProgram::Name
Name
Definition: GraphWriter.h:50
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:350
llvm::AMDGPU::HSAMD::Kernel::Attrs::Key::RuntimeHandle
constexpr char RuntimeHandle[]
Key for Kernel::Attr::Metadata::mRuntimeHandle.
Definition: AMDGPUMetadata.h:134
llvm::GlobalValue::ExternalLinkage
@ ExternalLinkage
Externally visible function.
Definition: GlobalValue.h:48
Instructions.h
Debug.h
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421
collectFunctionUsers
static void collectFunctionUsers(User *U, DenseSet< Function * > &Funcs)
If U is instruction or constant, collect functions which directly or indirectly use it.
Definition: AMDGPUOpenCLEnqueuedBlockLowering.cpp:89