LLVM 23.0.0git
AMDGPULowerExecSync.cpp
Go to the documentation of this file.
1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Lower LDS global variables with target extension type "amdgpu.named.barrier"
10// that require specialized address assignment. It assigns a unique
11// barrier identifier to each named-barrier LDS variable and encodes
12// this identifier within the !absolute_symbol metadata of that global.
13// This encoding ensures that subsequent LDS lowering passes can process these
14// barriers correctly without conflicts.
15//
16//===----------------------------------------------------------------------===//
17
18#include "AMDGPU.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
21#include "llvm/ADT/DenseMap.h"
24#include "llvm/IR/Constants.h"
28#include "llvm/Pass.h"
30
31#include <algorithm>
32
33#define DEBUG_TYPE "amdgpu-lower-exec-sync"
34
35using namespace llvm;
36using namespace AMDGPU;
37
38namespace {
39
40// If GV is also used directly by other kernels, create a new GV
41// used only by this kernel and its function.
//
// Parameters:
//   M  - module that receives the clone, if one is created.
//   GV - the global variable to make private to kernel KF.
//   KF - the kernel that should end up with its own copy.
//
// Returns GV unchanged when no instruction user of GV lives in a kernel other
// than KF. Otherwise returns a new global named "<GV name>.<KF name>"; the
// instruction uses located in KF or in any non-kernel function are redirected
// to it, while uses inside other kernels keep referring to GV. Users that are
// not instructions are ignored by both loops.
42static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
43 Function *KF) {
44 bool NeedsReplacement = false;
// Detection pass: one instruction use inside a different kernel is enough.
45 for (Use &U : GV->uses()) {
46 if (auto *I = dyn_cast<Instruction>(U.getUser())) {
47 Function *F = I->getFunction();
48 if (isKernel(*F) && F != KF) {
49 NeedsReplacement = true;
50 break;
51 }
52 }
53 }
54 if (!NeedsReplacement)
55 return GV;
56 // Create a new GV used only by this kernel and its function
// The clone reuses GV's value type, constness, linkage and initializer.
// NOTE(review): the remainder of this constructor call (the arguments after
// `nullptr,` and the closing parenthesis) is not visible in this listing.
57 GlobalVariable *NewGV = new GlobalVariable(
58 M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
59 GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
61 NewGV->copyAttributesFrom(GV);
// Rewrite pass: the early-increment range is used because the loop body
// removes uses from GV's use list while walking it.
62 for (Use &U : make_early_inc_range(GV->uses())) {
63 if (auto *I = dyn_cast<Instruction>(U.getUser())) {
64 Function *F = I->getFunction();
// Complement of the detection condition: KF itself and all non-kernel
// functions switch over to the clone.
65 if (!isKernel(*F) || F == KF) {
// NOTE(review): replaceUsesOfWith operates on the whole user, not only on
// U; if one instruction references GV through several operands this may
// also rewrite the use the iterator has already advanced to - confirm
// this is safe here.
66 U.getUser()->replaceUsesOfWith(GV, NewGV);
67 }
68 }
69 }
70 return NewGV;
71}
72
73// Write the specified address into metadata where it can be retrieved by
74// the assembler. Format is a half open range, [Address Address+1)
75static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
76 uint32_t Address) {
77 LLVMContext &Ctx = M->getContext();
78 auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
79 auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
80 auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
81 GV->setMetadata(LLVMContext::MD_absolute_symbol,
82 MDNode::get(Ctx, {MinC, MaxC}));
83}
84
85template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
86 sort(V, [](const auto *L, const auto *R) {
87 return L->getName() < R->getName();
88 });
89 return {std::move(V)};
90}
91
92// Main utility function for special LDS variables lowering.
//
// Assigns a barrier ID to every named-barrier global (as identified by
// isNamedBarrier) and records it through recordLDSAbsoluteAddress, in two
// rounds:
//   1. Named barriers reached indirectly by more than one kernel get a
//      module-wide ("absolute") ID. Those reached indirectly by exactly one
//      kernel are moved into that kernel's direct_access set instead.
//   2. For every kernel in direct_access, the remaining named barriers get an
//      ID relative to that kernel, numbered after the module-wide ones.
// Names are sorted before numbering so the assignment does not depend on
// container iteration order.
//
// Parameters:
//   M           - the module being lowered.
//   LDSUsesInfo - per-kernel direct/indirect use sets; named barriers are
//                 removed from both as they are processed.
//   LDSToKernelsThatNeedToAccessItIndirectly
//               - for each variable, the kernels reaching it indirectly;
//                 named-barrier entries are erased.
//
// Returns true only when uniquifyGVPerKernel had to create a new global.
// NOTE(review): attaching the metadata alone does not set Changed - confirm
// that this is intended.
93static bool lowerExecSyncGlobalVariables(
94 Module &M, LDSUsesInfoTy &LDSUsesInfo,
95 VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
96 bool Changed = false;
97 const DataLayout &DL = M.getDataLayout();
98 // The 1st round: give module-absolute assignments
99 int NumAbsolutes = 0;
// NOTE(review): the declaration of OrderedGVs is not visible in this listing.
// NOTE(review): the loop erases from the map it is iterating over (see the
// erase call at the end of the body) - confirm the container permits this.
101 for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
102 GlobalVariable *GV = K.first;
103 if (!isNamedBarrier(*GV))
104 continue;
105 // give a module-absolute assignment if it is indirectly accessed by
106 // multiple kernels. This is not precise, but we don't want to duplicate
107 // a function when it is called by multiple kernels.
108 if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
109 OrderedGVs.push_back(GV);
110 } else {
111 // leave it to the 2nd round, which will give a kernel-relative
112 // assignment if it is only indirectly accessed by one kernel
113 LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
114 }
115 LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
116 }
117 OrderedGVs = sortByName(std::move(OrderedGVs));
118 for (GlobalVariable *GV : OrderedGVs) {
119 unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
// IDs start at 1; a variable covering several barriers consumes BarCnt
// consecutive IDs.
120 unsigned BarId = NumAbsolutes + 1;
// Presumably each named barrier occupies 16 bytes of the global - TODO confirm.
121 unsigned BarCnt = GV->getGlobalSize(DL) / 16;
122 NumAbsolutes += BarCnt;
123
124 // 4 bits for alignment, 5 bits for the barrier num,
125 // 3 bits for the barrier scope
// NOTE(review): the meaning of the 0x802000 base is not visible here.
126 unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
127 recordLDSAbsoluteAddress(&M, GV, Offset);
128 }
129 OrderedGVs.clear();
130
131 // The 2nd round: give a kernel-relative assignment to each GV that is
132 // either only indirectly accessed by a single kernel or only directly
133 // accessed by multiple kernels.
134 SmallVector<Function *> OrderedKernels;
135 for (auto &K : LDSUsesInfo.direct_access) {
136 Function *F = K.first;
137 assert(isKernel(*F));
138 OrderedKernels.push_back(F);
139 }
140 OrderedKernels = sortByName(std::move(OrderedKernels));
141
// NOTE(review): the declaration of Kernel2BarId is not visible in this listing.
143 for (Function *F : OrderedKernels) {
// NOTE(review): elements are erased from the set while it is being iterated -
// confirm the container permits this.
144 for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
145 if (!isNamedBarrier(*GV))
146 continue;
147
148 LDSUsesInfo.direct_access[F].erase(GV);
149 if (GV->isAbsoluteSymbolRef()) {
150 // already assigned
151 continue;
152 }
153 OrderedGVs.push_back(GV);
154 }
155 OrderedGVs = sortByName(std::move(OrderedGVs));
156 for (GlobalVariable *GV : OrderedGVs) {
157 // GV could also be used directly by other kernels. If so, we need to
158 // create a new GV used only by this kernel and its function.
159 auto NewGV = uniquifyGVPerKernel(M, GV, F);
160 Changed |= (NewGV != GV);
161 unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
// Kernel-relative IDs are placed after all module-absolute ones.
162 unsigned BarId = Kernel2BarId[F];
163 BarId += NumAbsolutes + 1;
164 unsigned BarCnt = GV->getGlobalSize(DL) / 16;
165 Kernel2BarId[F] += BarCnt;
// Same encoding as in the first round.
166 unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
167 recordLDSAbsoluteAddress(&M, NewGV, Offset);
168 }
169 OrderedGVs.clear();
170 }
171 // Also erase those special LDS variables from indirect_access.
// NOTE(review): as above, K.second is modified while it is being iterated.
172 for (auto &K : LDSUsesInfo.indirect_access) {
173 assert(isKernel(*K.first));
174 for (GlobalVariable *GV : K.second) {
175 if (isNamedBarrier(*GV))
176 K.second.erase(GV);
177 }
178 }
179 return Changed;
180}
181
182// With object linking, barrier ID assignment is deferred to the linker.
183// Externalize named barrier globals and emit self-contained metadata so the
184// AsmPrinter can generate the callgraph entries the linker needs.
//
// For every named-barrier global that has uses, the functions containing its
// instruction users are collected. Each such global is then renamed, stripped
// of its initializer and given external linkage, and one operand is appended
// to the "amdgpu.named_barrier.uses" named metadata listing the global
// followed by the functions that use it.
//
// Returns false (leaving the module untouched) when no named barrier with an
// instruction user exists, true otherwise.
185static bool handleNamedBarriersForObjectLinking(Module &M) {
// NOTE(review): the declaration of BarrierToFuncs is not visible in this
// listing.
187 for (GlobalVariable &GV : M.globals()) {
188 if (!isNamedBarrier(GV) || GV.use_empty())
189 continue;
// Only instruction users contribute; other kinds of users are skipped.
190 for (User *U : GV.users()) {
191 if (auto *I = dyn_cast<Instruction>(U))
192 BarrierToFuncs[&GV].insert(I->getFunction());
193 }
194 }
195 if (BarrierToFuncs.empty())
196 return false;
197
198 LLVMContext &Ctx = M.getContext();
199 NamedMDNode *BarMD = M.getOrInsertNamedMetadata("amdgpu.named_barrier.uses");
200
201 std::string ModuleId;
202 ModuleId = getUniqueModuleId(&M);
203 assert(!ModuleId.empty() &&
204 "modules with named barriers should have a unique ID");
205 for (auto &[V, Funcs] : BarrierToFuncs) {
// Local symbols additionally get the module ID appended to their name;
// non-local ones only receive the prefix, and only if they do not already
// carry it.
206 if (V->hasLocalLinkage())
207 V->setName("__amdgpu_named_barrier." + V->getName() + ModuleId);
208 else if (!V->getName().starts_with("__amdgpu_named_barrier"))
209 V->setName("__amdgpu_named_barrier." + V->getName());
// Turn the global into an external declaration.
210 V->setInitializer(nullptr);
211 V->setLinkage(GlobalValue::ExternalLinkage);
212
// NOTE(review): the declaration of Ops is not visible in this listing.
// Operand layout: the barrier global first, then each using function.
214 Ops.push_back(ValueAsMetadata::get(V));
215 for (Function *F : Funcs)
216 Ops.push_back(ValueAsMetadata::get(F));
217 BarMD->addOperand(MDNode::get(Ctx, Ops));
218 }
219 return true;
220}
221
// Shared driver for the pass: either hands the module to
// handleNamedBarriersForObjectLinking, or builds the transitive LDS use
// information and lowers the special LDS variables in place.
// Returns the "changed" result of whichever path was taken.
222static bool runLowerExecSyncGlobals(Module &M) {
// NOTE(review): the line preceding this return is not visible in this
// listing; as shown the return is unconditional and everything below it is
// unreachable, so presumably a guarding condition belongs here - confirm.
224 return handleNamedBarriersForObjectLinking(M);
225
226 CallGraph CG = CallGraph(M);
227 bool Changed = false;
// NOTE(review): one source line is missing from the listing at this point.
229
230 // For each kernel, what variables does it access directly or through
231 // callees
232 LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
233
234 // For each variable accessed through callees, which kernels access it
// This inverts indirect_access (kernel -> variables) into
// variable -> kernels.
235 VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
236 for (auto &K : LDSUsesInfo.indirect_access) {
237 Function *F = K.first;
238 assert(isKernel(*F));
239 for (GlobalVariable *GV : K.second) {
240 LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
241 }
242 }
243
244 if (LDSUsesInfo.HasSpecialGVs) {
245 // Special LDS variables need special address assignment
246 Changed |= lowerExecSyncGlobalVariables(
247 M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
248 }
249 return Changed;
250}
251
252class AMDGPULowerExecSyncLegacy : public ModulePass {
253public:
254 static char ID;
255 AMDGPULowerExecSyncLegacy() : ModulePass(ID) {}
256 bool runOnModule(Module &M) override;
257};
258
259} // namespace
260
// Storage for the legacy pass ID, plus a reference in namespace llvm that
// aliases it.
261char AMDGPULowerExecSyncLegacy::ID = 0;
262char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID;
263
// Pass registration under the DEBUG_TYPE argument ("amdgpu-lower-exec-sync").
// The two trailing `false` values fill the macros' cfg and analysis
// parameters.
264INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
265 "AMDGPU lowering of execution synchronization", false,
266 false)
// NOTE(review): one source line between BEGIN and END is not visible in this
// listing.
268INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
269 "AMDGPU lowering of execution synchronization", false,
270 false)
271
272bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) {
273 return runLowerExecSyncGlobals(M);
274}
275
277 return new AMDGPULowerExecSyncLegacy();
278}
279
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
#define DEBUG_TYPE
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Target-Independent Code Generator Pass Configuration Options pass.
The basic data container for the call graph of a Module of IR.
Definition CallGraph.h:72
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
bool empty() const
Definition DenseMap.h:109
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set a particular kind of metadata attachment.
LinkageTypes getLinkage() const
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:455
ThreadLocalMode getThreadLocalMode() const
PointerType * getType() const
Global values are always pointers.
@ ExternalLinkage
Externally visible function.
Definition GlobalValue.h:53
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void copyAttributesFrom(const GlobalVariable *Src)
copyAttributesFrom - copy all additional attributes (those not needed to create a GlobalVariable) fro...
Definition Globals.cpp:576
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
bool isConstant() const
If the value is a global constant, its value is immutable throughout the runtime execution of the pro...
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition Pass.h:255
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
A tuple of MDNodes.
Definition Metadata.h:1760
LLVM_ABI void addOperand(MDNode *M)
unsigned getAddressSpace() const
Return the address space of the Pointer type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Target-Independent Code Generator Pass Configuration Options.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
static LLVM_ABI ValueAsMetadata * get(Value *V)
Definition Metadata.cpp:509
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
Changed
@ LOCAL_ADDRESS
Address space for local memory.
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M)
DenseMap< GlobalVariable *, DenseSet< Function * > > VariableFunctionMap
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
char & AMDGPULowerExecSyncLegacyPassID
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
LLVM_ABI std::string getUniqueModuleId(Module *M)
Produce a unique identifier for this module by taking the MD5 sum of the names of the module's strong...
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
ModulePass * createAMDGPULowerExecSyncLegacyPass()
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
FunctionVariableMap direct_access
FunctionVariableMap indirect_access