LLVM 22.0.0git
AMDGPUPreloadKernelArguments.cpp
Go to the documentation of this file.
1//===- AMDGPUPreloadKernelArguments.cpp - Preload Kernel Arguments --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass preloads kernel arguments into user_data SGPRs before kernel
10/// execution begins. The number of registers available for preloading depends
11/// on the number of free user SGPRs, up to the hardware's maximum limit.
12/// Implicit arguments enabled in the kernel descriptor are allocated first,
13/// followed by SGPRs used for preloaded kernel arguments. (Reference:
14/// https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state)
15/// Additionally, hidden kernel arguments may be preloaded, in which case they
16/// are appended to the kernel signature after explicit arguments. Preloaded
17/// arguments will be marked with `inreg`.
18//
19//===----------------------------------------------------------------------===//
20
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
31
// Name used for -debug-only= filtering and for pass registration below.
#define DEBUG_TYPE "amdgpu-preload-kernel-arguments"

using namespace llvm;
35
37 "amdgpu-kernarg-preload-count",
38 cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
39
// Command-line toggle (default on) for preloading kernel arguments into SGPRs.
static cl::opt<bool>
    EnableKernargPreload("amdgpu-kernarg-preload",
                         cl::desc("Enable preload kernel arguments to SGPRs"),
                         cl::init(true));
44
45namespace {
46
// Legacy pass manager wrapper. The actual work happens in the file-local
// markKernelArgsAsInreg(); this class only carries the target machine into
// runOnModule.
class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {
  // May be null when the pass is default-constructed (e.g. via opt); in that
  // case runOnModule is a no-op.
  const GCNTargetMachine *TM;

public:
  static char ID;
  explicit AMDGPUPreloadKernelArgumentsLegacy(
      const GCNTargetMachine *TM = nullptr);

  StringRef getPassName() const override {
    return "AMDGPU Preload Kernel Arguments";
  }

  bool runOnModule(Module &M) override;
};
61
62class PreloadKernelArgInfo {
63private:
64 Function &F;
65 const GCNSubtarget &ST;
66 unsigned NumFreeUserSGPRs;
67
68 enum HiddenArg : unsigned {
69 HIDDEN_BLOCK_COUNT_X,
70 HIDDEN_BLOCK_COUNT_Y,
71 HIDDEN_BLOCK_COUNT_Z,
72 HIDDEN_GROUP_SIZE_X,
73 HIDDEN_GROUP_SIZE_Y,
74 HIDDEN_GROUP_SIZE_Z,
75 HIDDEN_REMAINDER_X,
76 HIDDEN_REMAINDER_Y,
77 HIDDEN_REMAINDER_Z,
78 END_HIDDEN_ARGS
79 };
80
81 // Stores information about a specific hidden argument.
82 struct HiddenArgInfo {
83 // Offset in bytes from the location in the kernearg segment pointed to by
84 // the implicitarg pointer.
85 uint8_t Offset;
86 // The size of the hidden argument in bytes.
87 uint8_t Size;
88 // The name of the hidden argument in the kernel signature.
89 const char *Name;
90 };
91
92 static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
93 {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
94 {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
95 {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
96 {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
97 {22, 2, "_hidden_remainder_z"}};
98
99 static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
100 for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
101 if (HiddenArgs[I].Offset == Offset)
102 return static_cast<HiddenArg>(I);
103
104 return END_HIDDEN_ARGS;
105 }
106
107 static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
108 if (HA < END_HIDDEN_ARGS)
109 return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
110
111 llvm_unreachable("Unexpected hidden argument.");
112 }
113
114 static const char *getHiddenArgName(HiddenArg HA) {
115 if (HA < END_HIDDEN_ARGS)
116 return HiddenArgs[HA].Name;
117
118 llvm_unreachable("Unexpected hidden argument.");
119 }
120
121 // Clones the function after adding implicit arguments to the argument list
122 // and returns the new updated function. Preloaded implicit arguments are
123 // added up to and including the last one that will be preloaded, indicated by
124 // LastPreloadIndex. Currently preloading is only performed on the totality of
125 // sequential data from the kernarg segment including implicit (hidden)
126 // arguments. This means that all arguments up to the last preloaded argument
127 // will also be preloaded even if that data is unused.
128 Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
129 FunctionType *FT = F.getFunctionType();
130 LLVMContext &Ctx = F.getParent()->getContext();
131 SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
132 for (unsigned I = 0; I <= LastPreloadIndex; ++I)
133 FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
134
135 FunctionType *NFT =
136 FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
137 Function *NF =
138 Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
139
140 NF->copyAttributesFrom(&F);
141 NF->copyMetadata(&F, 0);
142
143 F.getParent()->getFunctionList().insert(F.getIterator(), NF);
144 NF->takeName(&F);
145 NF->splice(NF->begin(), &F);
146
147 Function::arg_iterator NFArg = NF->arg_begin();
148 for (Argument &Arg : F.args()) {
149 Arg.replaceAllUsesWith(&*NFArg);
150 NFArg->takeName(&Arg);
151 ++NFArg;
152 }
153
154 AttrBuilder AB(Ctx);
155 AB.addAttribute(Attribute::InReg);
156 AB.addAttribute("amdgpu-hidden-argument");
157 AttributeList AL = NF->getAttributes();
158 for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
159 AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
160 NFArg++->setName(getHiddenArgName(HiddenArg(I)));
161 }
162
163 NF->setAttributes(AL);
164 F.replaceAllUsesWith(NF);
165
166 return NF;
167 }
168
169public:
170 PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
171 setInitialFreeUserSGPRsCount();
172 }
173
174 // Returns the maximum number of user SGPRs that we have available to preload
175 // arguments.
176 void setInitialFreeUserSGPRsCount() {
177 GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
178 NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
179 }
180
181 bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
182 return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
183 }
184
185 // Try to allocate SGPRs to preload hidden kernel arguments.
186 void
187 tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
188 SmallVectorImpl<Function *> &FunctionsToErase) {
190 F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
191 if (!ImplicitArgPtr)
192 return;
193
194 const DataLayout &DL = F.getParent()->getDataLayout();
195 // Pair is the load and the load offset.
197 for (auto *U : ImplicitArgPtr->users()) {
199 if (!CI || CI->getParent()->getParent() != &F)
200 continue;
201
202 for (auto *U : CI->users()) {
203 int64_t Offset = 0;
204 auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
205 if (!Load) {
207 continue;
208
209 Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
210 }
211
212 if (!Load || !Load->isSimple())
213 continue;
214
215 // FIXME: Expand handle merged loads.
216 LLVMContext &Ctx = F.getParent()->getContext();
217 Type *LoadTy = Load->getType();
218 HiddenArg HA = getHiddenArgFromOffset(Offset);
219 if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
220 continue;
221
222 ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
223 }
224 }
225
226 if (ImplicitArgLoads.empty())
227 return;
228
229 // Allocate loads in order of offset. We need to be sure that the implicit
230 // argument can actually be preloaded.
231 std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
232
233 // If we fail to preload any implicit argument we know we don't have SGPRs
234 // to preload any subsequent ones with larger offsets. Find the first
235 // argument that we cannot preload.
236 auto *PreloadEnd = llvm::find_if(
237 ImplicitArgLoads, [&](const std::pair<LoadInst *, unsigned> &Load) {
238 unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
239 unsigned LoadOffset = Load.second;
240 if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
241 ImplicitArgsBaseOffset))
242 return true;
243
244 return false;
245 });
246
247 if (PreloadEnd == ImplicitArgLoads.begin())
248 return;
249
250 unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
251 Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
252 assert(NF);
253 FunctionsToErase.push_back(&F);
254 for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
255 LoadInst *LoadInst = I->first;
256 unsigned LoadOffset = I->second;
257 unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
258 unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
259 Argument *Arg = NF->getArg(Index);
260 LoadInst->replaceAllUsesWith(Arg);
261 }
262 }
263};
264
265} // end anonymous namespace
266
// Unique identifier for the legacy pass; its address is what matters.
char AMDGPUPreloadKernelArgumentsLegacy::ID = 0;

// Register the legacy pass with the PassRegistry (no CFG-only or
// analysis-only properties).
INITIALIZE_PASS(AMDGPUPreloadKernelArgumentsLegacy, DEBUG_TYPE,
                "AMDGPU Preload Kernel Arguments", false, false)
271
274 return new AMDGPUPreloadKernelArgumentsLegacy(
275 static_cast<const GCNTargetMachine *>(TM));
276}
277
// Constructor simply stashes the target machine; a null TM (default) makes
// runOnModule a no-op.
AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(
    const GCNTargetMachine *TM)
    : ModulePass(ID), TM(TM) {}
281
282static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) {
284 return false;
285
286 SmallVector<Function *, 4> FunctionsToErase;
287 bool Changed = false;
288 for (auto &F : M) {
289 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
290 if (!ST.hasKernargPreload() ||
291 F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
292 continue;
293
294 PreloadKernelArgInfo PreloadInfo(F, ST);
295 uint64_t ExplicitArgOffset = 0;
296 const DataLayout &DL = F.getDataLayout();
297 const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
298 unsigned NumPreloadsRequested = KernargPreloadCount;
299 unsigned NumPreloadedExplicitArgs = 0;
300 for (Argument &Arg : F.args()) {
301 // Avoid incompatible attributes and guard against running this pass
302 // twice.
303 //
304 // TODO: Preload byref kernel arguments
305 if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
306 Arg.hasAttribute("amdgpu-hidden-argument"))
307 break;
308
309 // Inreg may be pre-existing on some arguments, try to preload these.
310 if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
311 break;
312
313 // FIXME: Preload aggregates.
314 if (Arg.getType()->isAggregateType())
315 break;
316
317 Type *ArgTy = Arg.getType();
318 Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
319 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
320 ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
321
322 if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
323 break;
324
325 Arg.addAttr(Attribute::InReg);
326 NumPreloadedExplicitArgs++;
327 if (NumPreloadsRequested > 0)
328 NumPreloadsRequested--;
329 }
330
331 // Only try preloading hidden arguments if we can successfully preload the
332 // last explicit argument.
333 if (NumPreloadedExplicitArgs == F.arg_size()) {
334 uint64_t ImplicitArgsBaseOffset =
335 alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
336 BaseOffset;
337 PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
338 FunctionsToErase);
339 }
340
341 Changed |= NumPreloadedExplicitArgs > 0;
342 }
343
344 Changed |= !FunctionsToErase.empty();
345 // Erase cloned functions if we needed to update the kernel signature to
346 // support preloading hidden kernel arguments.
347 for (auto *F : FunctionsToErase)
348 F->eraseFromParent();
349
350 return Changed;
351}
352
353bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) {
354 if (skipModule(M) || !TM)
355 return false;
356
357 return markKernelArgsAsInreg(M, *TM);
358}
359
360PreservedAnalyses
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM)
static cl::opt< unsigned > KernargPreloadCount("amdgpu-kernarg-preload-count", cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0))
static cl::opt< bool > EnableKernargPreload("amdgpu-kernarg-preload", cl::desc("Enable preload kernel arguments to SGPRs"), cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define DEBUG_TYPE
Module.h This file contains the declarations for the Module class.
This header defines various interfaces for pass management in LLVM.
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Machine Check Debug Module
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
void splice(Function::iterator ToIt, Function *FromF)
Transfer all blocks from FromF to this function at ToIt.
Definition Function.h:759
Argument * arg_iterator
Definition Function.h:72
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
size_t arg_size() const
Definition Function.h:899
Argument * getArg(unsigned i) const
Definition Function.h:884
void copyAttributesFrom(const Function *Src)
copyAttributesFrom - copy all additional attributes (those not needed to create a Function) from the ...
Definition Function.cpp:856
LLVM_ABI void copyMetadata(const GlobalObject *Src, unsigned Offset)
Copy metadata from Src, adjusting offsets by Offset.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition Pass.h:255
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Primary interface to the complete machine description for the target machine.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
const ParentTy * getParent() const
Definition ilist_node.h:34
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
initializer< Ty > init(const Ty &Val)
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:477
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
ModulePass * createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *)
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39