LLVM 23.0.0git
AMDGPULowerExecSync.cpp
Go to the documentation of this file.
1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Lower LDS global variables with target extension type "amdgpu.named.barrier"
10// that require specialized address assignment. It assigns a unique
11// barrier identifier to each named-barrier LDS variable and encodes
12// this identifier within the !absolute_symbol metadata of that global.
13// This encoding ensures that subsequent LDS lowering passes can process these
14// barriers correctly without conflicts.
15//
16//===----------------------------------------------------------------------===//
17
18#include "AMDGPU.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
21#include "llvm/ADT/DenseMap.h"
24#include "llvm/IR/Constants.h"
28#include "llvm/Pass.h"
30
31#include <algorithm>
32
33#define DEBUG_TYPE "amdgpu-lower-exec-sync"
34
35using namespace llvm;
36using namespace AMDGPU;
37
38namespace {
39
40// Write the specified address into metadata where it can be retrieved by
41// the assembler. Format is a half open range, [Address Address+1)
42static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
43 uint32_t Address) {
44 LLVMContext &Ctx = M->getContext();
45 auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
46 auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
47 auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
48 GV->setMetadata(LLVMContext::MD_absolute_symbol,
49 MDNode::get(Ctx, {MinC, MaxC}));
50}
51
52/// Get next available ID for sync object. The ID allocation is tracked in \p
53/// MaxNumGroup groups by \p NextAvailableIDTracker. Each call of the function
54/// will ask for \p IDCnt against all the \p Kernels, it will return the
55/// maximum of the available ones and update the ID tracker.
56template <typename T>
57unsigned allocateExecSyncID(T &NextAvailableIDTracker,
58 ArrayRef<Function *> Kernels, unsigned GroupID,
59 unsigned MaxNumGroup, unsigned IDCnt) {
60 constexpr unsigned InitialVal = 1;
61 unsigned NextID = InitialVal;
62 for (Function *F : Kernels) {
63 const SmallVectorImpl<unsigned> &NextAvailableID =
64 NextAvailableIDTracker.lookup(F);
65 unsigned ID = InitialVal;
66 if (!NextAvailableID.empty())
67 ID = NextAvailableID[GroupID];
68
69 if (ID > NextID)
70 NextID = ID;
71 }
72
73 // Bump the next available id for the kernels.
74 for (Function *F : Kernels) {
75 auto Inserted = NextAvailableIDTracker.try_emplace(F);
76 // Initialize on first insertion.
77 if (Inserted.second)
78 Inserted.first->second.assign(MaxNumGroup, InitialVal);
79 // Update the available ID.
80 Inserted.first->second[GroupID] = NextID + IDCnt;
81 }
82 return NextID;
83}
84
85// Main utility function for special LDS variables lowering.
86static bool lowerExecSyncGlobalVariables(Module &M,
87 LDSUsesInfoTy &LDSUsesInfo) {
88 bool Changed = false;
89 const DataLayout &DL = M.getDataLayout();
90
91 constexpr unsigned NumBarScopes = 1;
94
95 for (auto &[F, GVs] : LDSUsesInfo.indirect_access) {
96 for (auto *GV : GVs) {
97 if (!isNamedBarrier(*GV) || GV->isAbsoluteSymbolRef())
98 continue;
99 auto Iter = AllocationQ.find(GV);
100 if (Iter == AllocationQ.end())
101 AllocationQ.insert({GV, {F}});
102 else
103 Iter->second.push_back(F);
104 }
105 }
106
107 for (auto &[F, GVs] : LDSUsesInfo.direct_access) {
108 for (auto *GV : GVs) {
109 if (!isNamedBarrier(*GV) || GV->isAbsoluteSymbolRef())
110 continue;
111 auto Iter = AllocationQ.find(GV);
112 if (Iter == AllocationQ.end())
113 AllocationQ.insert({GV, {F}});
114 else
115 Iter->second.push_back(F);
116 }
117 }
118
119 sort(AllocationQ, [](std::pair<GlobalVariable *, SmallVector<Function *>> A,
121 // First order by number of kernels that access the GlobalVariable.
122 if (A.second.size() != B.second.size())
123 return A.second.size() > B.second.size();
124
125 // Then order by their names so we always get a deterministic order.
126 return A.first->getName() < B.first->getName();
127 });
128
129 for (auto &[GV, Kernels] : AllocationQ) {
130 unsigned Offset;
131 if (TargetExtType *ExtTy = isNamedBarrier(*GV)) {
132 unsigned BarrierScope = ExtTy->getIntParameter(0);
133 unsigned BarCnt = GV->getGlobalSize(DL) / 16;
134
135 unsigned BarID = allocateExecSyncID(KernelBarrierIDs, Kernels,
136 BarrierScope, NumBarScopes, BarCnt);
137
138 LLVM_DEBUG(GV->printAsOperand(dbgs(), false);
139 dbgs() << " was assigned barrier id: " << BarID
140 << " id-count: " << BarCnt << "\n");
141 // 4 bits for alignment, 5 bits for the barrier num,
142 // 3 bits for the barrier scope
143 Offset = 0x802000u | BarrierScope << 9 | BarID << 4;
144 } else {
145 llvm_unreachable("Unhandled special variable type.");
146 }
147
148 recordLDSAbsoluteAddress(&M, GV, Offset);
149 }
150
151 // Also erase those special LDS variables from indirect_access.
152 for (auto &K : LDSUsesInfo.indirect_access) {
153 assert(isKernel(*K.first));
154 K.second.remove_if([](GlobalVariable *GV) { return isNamedBarrier(*GV); });
155 }
156 return Changed;
157}
158
159// With object linking, barrier ID assignment is deferred to the linker.
160// Externalize named barrier globals and emit self-contained metadata so the
161// AsmPrinter can generate the callgraph entries the linker needs.
// Returns false, leaving the module untouched, when no named-barrier global
// has an instruction user; returns true otherwise.
// NOTE(review): BarrierToFuncs and Ops are used below but their declarations
// are not visible in this listing (original lines 163 and 190 are absent) -
// confirm their container types against the upstream file.
162static bool handleNamedBarriersForObjectLinking(Module &M) {
// Collect, for every used named-barrier global, the functions that contain an
// instruction using it. Users that are not instructions are ignored here.
164 for (GlobalVariable &GV : M.globals()) {
165 if (!isNamedBarrier(GV) || GV.use_empty())
166 continue;
167 for (User *U : GV.users()) {
168 if (auto *I = dyn_cast<Instruction>(U))
169 BarrierToFuncs[&GV].insert(I->getFunction());
170 }
171 }
172 if (BarrierToFuncs.empty())
173 return false;
174
175 LLVMContext &Ctx = M.getContext();
176 NamedMDNode *BarMD = M.getOrInsertNamedMetadata("amdgpu.named_barrier.uses");
177
// The module ID is appended to the names of local-linkage barriers below.
178 std::string ModuleId;
179 ModuleId = getUniqueModuleId(&M);
180 assert(!ModuleId.empty() &&
181 "modules with named barriers should have a unique ID");
182 for (auto &[V, Funcs] : BarrierToFuncs) {
// Local barriers get the prefix plus the module ID as a suffix; other
// barriers only get the prefix, unless their name already starts with it.
183 if (V->hasLocalLinkage())
184 V->setName("__amdgpu_named_barrier." + V->getName() + ModuleId);
185 else if (!V->getName().starts_with("__amdgpu_named_barrier"))
186 V->setName("__amdgpu_named_barrier." + V->getName());
// Drop the initializer and make the global external.
187 V->setInitializer(nullptr);
188 V->setLinkage(GlobalValue::ExternalLinkage);
189
// Emit one "amdgpu.named_barrier.uses" operand per barrier: the barrier
// itself followed by each function recorded for it.
191 Ops.push_back(ValueAsMetadata::get(V));
192 for (Function *F : Funcs)
193 Ops.push_back(ValueAsMetadata::get(F));
194 BarMD->addOperand(MDNode::get(Ctx, Ops));
195 }
196 return true;
197}
198
// Module-level driver invoked by AMDGPULowerExecSyncLegacy::runOnModule.
// Returns whether the module was changed.
// NOTE(review): as shown, the first return is unconditional and everything
// after it is unreachable. Original line 200 is absent from this listing -
// presumably it holds the condition selecting the object-linking path;
// confirm against the upstream file. Original line 205 is absent as well.
199static bool runLowerExecSyncGlobals(Module &M) {
201 return handleNamedBarriersForObjectLinking(M);
202
// The call graph is needed to compute transitive LDS uses below.
203 CallGraph CG = CallGraph(M);
204 bool Changed = false;
206
207 // For each kernel, what variables does it access directly or through
208 // callees
209 LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
210
211 if (LDSUsesInfo.HasSpecialGVs) {
212 // Special LDS variables need special address assignment
213 Changed |= lowerExecSyncGlobalVariables(M, LDSUsesInfo);
214 }
215 return Changed;
216}
217
218class AMDGPULowerExecSyncLegacy : public ModulePass {
219public:
220 static char ID;
221 AMDGPULowerExecSyncLegacy() : ModulePass(ID) {}
222 bool runOnModule(Module &M) override;
223};
224
225} // namespace
226
// Storage for the legacy pass identifier, and a reference to it exported from
// the llvm namespace.
227char AMDGPULowerExecSyncLegacy::ID = 0;
228char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID;
229
// Register the legacy pass under DEBUG_TYPE ("amdgpu-lower-exec-sync"). The
// two trailing arguments are the macro's cfg and analysis flags, both false.
// NOTE(review): original line 233, between the BEGIN and END macros, is absent
// from this listing - confirm against the upstream file.
230INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
231 "AMDGPU lowering of execution synchronization", false,
232 false)
234INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
235 "AMDGPU lowering of execution synchronization", false,
236 false)
237
238bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) {
239 return runLowerExecSyncGlobals(M);
240}
241
// Factory body: returns a newly allocated legacy pass instance.
// NOTE(review): the signature line (original line 242) is absent from this
// listing - presumably llvm::createAMDGPULowerExecSyncLegacyPass(); confirm
// against the upstream file.
243 return new AMDGPULowerExecSyncLegacy();
244}
245
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
#define DEBUG_TYPE
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
#define LLVM_DEBUG(...)
Definition Debug.h:119
Target-Independent Code Generator Pass Configuration Options pass.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
The basic data container for the call graph of a Module of IR.
Definition CallGraph.h:72
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
bool empty() const
Definition DenseMap.h:113
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:239
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set a particular kind of metadata attachment.
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:510
@ ExternalLinkage
Externally visible function.
Definition GlobalValue.h:53
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:624
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
iterator find(const KeyT &Key)
Definition MapVector.h:156
iterator end()
Definition MapVector.h:69
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition MapVector.h:126
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition Pass.h:255
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:68
A tuple of MDNodes.
Definition Metadata.h:1760
LLVM_ABI void addOperand(MDNode *M)
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Class to represent target extensions types, which are generally unintrospectable from target-independ...
Target-Independent Code Generator Pass Configuration Options.
static LLVM_ABI ValueAsMetadata * get(Value *V)
Definition Metadata.cpp:509
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
bool use_empty() const
Definition Value.h:346
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
char & AMDGPULowerExecSyncLegacyPassID
LLVM_ABI std::string getUniqueModuleId(Module *M)
Produce a unique identifier for this module by taking the MD5 sum of the names of the module's strong...
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
ModulePass * createAMDGPULowerExecSyncLegacyPass()
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
FunctionVariableMap direct_access
FunctionVariableMap indirect_access