#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/CommandLine.h"

#define DEBUG_TYPE "amdgpu-preload-kernel-arguments"

using namespace llvm;

static cl::opt<unsigned> KernargPreloadCount(
    "amdgpu-kernarg-preload-count",
    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));

static cl::opt<bool> EnableKernargPreload(
    "amdgpu-kernarg-preload",
    cl::desc("Enable preload kernel arguments to SGPRs"), cl::init(true));
namespace {

class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {
public:
  static char ID;
  const GCNTargetMachine *TM;

  explicit AMDGPUPreloadKernelArgumentsLegacy(
      const GCNTargetMachine *TM = nullptr);

  StringRef getPassName() const override {
    return "AMDGPU Preload Kernel Arguments";
  }

  bool runOnModule(Module &M) override;
};
class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  unsigned NumFreeUserSGPRs;

  // Hidden (implicit) kernel arguments that can be preloaded, in the order in
  // which they appear in the kernarg segment.
  enum HiddenArg : unsigned {
    HIDDEN_BLOCK_COUNT_X, HIDDEN_BLOCK_COUNT_Y, HIDDEN_BLOCK_COUNT_Z,
    HIDDEN_GROUP_SIZE_X, HIDDEN_GROUP_SIZE_Y, HIDDEN_GROUP_SIZE_Z,
    HIDDEN_REMAINDER_X, HIDDEN_REMAINDER_Y, HIDDEN_REMAINDER_Z,
    END_HIDDEN_ARGS
  };
  // Offset (in bytes from the implicitarg pointer), size in bytes, and
  // kernel-signature name of a hidden argument.
  struct HiddenArgInfo {
    uint8_t Offset;
    uint8_t Size;
    const char *Name;
  };

  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
      {0, 4, "_hidden_block_count_x"},  {4, 4, "_hidden_block_count_y"},
      {8, 4, "_hidden_block_count_z"},  {12, 2, "_hidden_group_size_x"},
      {14, 2, "_hidden_group_size_y"},  {16, 2, "_hidden_group_size_z"},
      {18, 2, "_hidden_remainder_x"},   {20, 2, "_hidden_remainder_y"},
      {22, 2, "_hidden_remainder_z"}};
  // Maps a byte offset from the implicitarg pointer back to the corresponding
  // hidden argument, or END_HIDDEN_ARGS if the offset does not start one.
  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
      if (HiddenArgs[I].Offset == Offset)
        return static_cast<HiddenArg>(I);

    return END_HIDDEN_ARGS;
  }
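  // E.g. getHiddenArgFromOffset(8) yields HIDDEN_BLOCK_COUNT_Z, whereas an
  // offset that does not start a hidden argument (say 9) yields
  // END_HIDDEN_ARGS and the load is not considered for preloading.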
  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

    llvm_unreachable("Unexpected hidden argument.");
  }

  static const char *getHiddenArgName(HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return HiddenArgs[HA].Name;

    llvm_unreachable("Unexpected hidden argument.");
  }
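  // E.g. getHiddenArgType(Ctx, HIDDEN_GROUP_SIZE_X) is i16 (2 bytes * 8),
  // matching the type a well-formed load of that hidden argument should have.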
  // Clones the kernel with the preloaded hidden arguments appended to its
  // argument list (up to and including LastPreloadIndex) and returns the new
  // function. Existing arguments and the body are transferred to the clone.
  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
    FunctionType *FT = F.getFunctionType();
    LLVMContext &Ctx = F.getParent()->getContext();
    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

    FunctionType *NFT =
        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
    Function *NF = Function::Create(NFT, F.getLinkage(), F.getAddressSpace(),
                                    F.getName());

    NF->copyAttributesFrom(&F);
    NF->copyMetadata(&F, 0);

    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
    NF->takeName(&F);
    NF->splice(NF->begin(), &F);

    Function::arg_iterator NFArg = NF->arg_begin();
    for (Argument &Arg : F.args()) {
      Arg.replaceAllUsesWith(&*NFArg);
      NFArg->takeName(&Arg);
      ++NFArg;
    }

    // Mark the appended hidden arguments as preloaded into SGPRs and give them
    // their canonical names.
    AttrBuilder AB(Ctx);
    AB.addAttribute(Attribute::InReg);
    AB.addAttribute("amdgpu-hidden-argument");
    AttributeList AL = NF->getAttributes();
    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
    }

    NF->setAttributes(AL);
    F.replaceAllUsesWith(NF);
    return NF;
  }
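  // Illustrative effect (a sketch, not taken from an actual test): a kernel
  //   define amdgpu_kernel void @k(i32 inreg %x)
  // that loads the X block count through
  //   %p  = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  //   %bc = load i32, ptr addrspace(4) %p
  // is cloned into
  //   define amdgpu_kernel void @k(i32 inreg %x,
  //       i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x)
  // and the load is later replaced by the new argument (see
  // tryAllocHiddenArgPreloadSGPRs below).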
public:
  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Queries how many user SGPRs are still available for preloading arguments.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
  }

  // Each user SGPR holds 4 bytes of kernarg data, so an argument ending at
  // ExplicitArgOffset can be preloaded only if it fits in the free SGPRs.
  bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
    return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
  }
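  // Example: with 14 free user SGPRs this allows at most 14 * 4 = 56 bytes of
  // kernarg data (explicit arguments plus any hidden ones) to be preloaded.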
  // Try to allocate SGPRs to preload hidden kernel arguments. Candidate loads
  // through the amdgcn.implicitarg.ptr intrinsic are collected, and the kernel
  // is cloned with the matching hidden arguments appended to its signature.
  void
  tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                SmallVectorImpl<Function *> &FunctionsToErase) {
    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
    if (!ImplicitArgPtr)
      return;

    const DataLayout &DL = F.getParent()->getDataLayout();
    // Pairs of loads from the implicitarg pointer and their byte offsets.
    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
    for (auto *U : ImplicitArgPtr->users()) {
      Instruction *CI = dyn_cast<Instruction>(U);
      if (!CI || CI->getParent()->getParent() != &F)
        continue;

      for (auto *U : CI->users()) {
        int64_t Offset = 0;
        auto *Load = dyn_cast<LoadInst>(U); // Load from the pointer itself?
        if (!Load) {
          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
            continue;

          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from a GEP?
        }

        if (!Load || !Load->isSimple())
          continue;

        LLVMContext &Ctx = F.getParent()->getContext();
        Type *LoadTy = Load->getType();
        HiddenArg HA = getHiddenArgFromOffset(Offset);
        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
          continue;

        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
      }
    }

    if (ImplicitArgLoads.empty())
      return;

    // Allocate loads in order of offset. We need to be sure that the implicit
    // argument can actually be preloaded.
    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

    // If preloading fails for one offset it also fails for every larger
    // offset, so find the first load that no longer fits in the free SGPRs.
    auto *PreloadEnd = find_if(
        ImplicitArgLoads, [&](const std::pair<LoadInst *, unsigned> &Load) {
          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
          unsigned LoadOffset = Load.second;
          if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
                                         ImplicitArgsBaseOffset))
            return true;

          return false;
        });

    if (PreloadEnd == ImplicitArgLoads.begin())
      return;

    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
    FunctionsToErase.push_back(&F);
    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
      LoadInst *LoadInst = I->first;
      unsigned LoadOffset = I->second;
      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
      Argument *Arg = NF->getArg(Index);
      LoadInst->replaceAllUsesWith(Arg);
    }
  }
};

} // end anonymous namespace
char AMDGPUPreloadKernelArgumentsLegacy::ID = 0;

INITIALIZE_PASS(AMDGPUPreloadKernelArgumentsLegacy, DEBUG_TYPE,
                "AMDGPU Preload Kernel Arguments", false, false)

ModulePass *
llvm::createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *TM) {
  return new AMDGPUPreloadKernelArgumentsLegacy(
      static_cast<const GCNTargetMachine *>(TM));
}

AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(
    const GCNTargetMachine *TM)
    : ModulePass(ID), TM(TM) {}
static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) {
  SmallVector<Function *, 4> FunctionsToErase;
  bool Changed = false;
  for (auto &F : M) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    if (!ST.hasKernargPreload() ||
        F.getCallingConv() != CallingConv::AMDGPU_KERNEL || F.arg_empty())
      continue;

    PreloadKernelArgInfo PreloadInfo(F, ST);
    uint64_t ExplicitArgOffset = 0;
    const DataLayout &DL = F.getParent()->getDataLayout();
    const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
    unsigned NumPreloadsRequested = KernargPreloadCount;
    unsigned NumPreloadedExplicitArgs = 0;
    for (Argument &Arg : F.args()) {
      // Avoid incompatible attributes and guard against running the pass twice.
      if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
          Arg.hasAttribute("amdgpu-hidden-argument"))
        break;

      // Inreg may be pre-existing on some arguments, try to preload these.
      if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
        break;

      if (Arg.getType()->isAggregateType())
        break;

      Type *ArgTy = Arg.getType();
      Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
      uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

      if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
        break;

      Arg.addAttr(Attribute::InReg);
      NumPreloadedExplicitArgs++;
      if (NumPreloadsRequested > 0)
        NumPreloadsRequested--;
    }

    // Only try preloading hidden arguments if every explicit argument was
    // successfully preloaded.
    if (NumPreloadedExplicitArgs == F.arg_size()) {
      uint64_t ImplicitArgsBaseOffset =
          alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
          BaseOffset;
      PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
                                                FunctionsToErase);
    }

    Changed |= NumPreloadedExplicitArgs > 0;
  }

  // Erase the original kernels that were cloned with an updated signature.
  for (auto *F : FunctionsToErase)
    F->eraseFromParent();

  return Changed;
}
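// A plausible way to exercise this through opt (assuming the new-PM pass is
// registered under the DEBUG_TYPE name; the registration is not part of this
// file):
//   opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 \
//       -passes=amdgpu-preload-kernel-arguments \
//       -amdgpu-kernarg-preload-count=4 -S kernel.ll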
bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) {
  if (skipModule(M) || !TM)
    return false;

  return markKernelArgsAsInreg(M, *TM);
}

PreservedAnalyses
AMDGPUPreloadKernelArgumentsPass::run(Module &M, ModuleAnalysisManager &AM) {
  bool Changed = markKernelArgsAsInreg(M, TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}