LLVM  14.0.0git
AMDGPULowerKernelArguments.cpp
Go to the documentation of this file.
1 //===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This pass replaces accesses to kernel arguments with loads from
10 /// offsets from the kernarg base pointer.
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"
21 #define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
22 
23 using namespace llvm;
24 
25 namespace {
26 
27 class AMDGPULowerKernelArguments : public FunctionPass{
28 public:
29  static char ID;
30 
31  AMDGPULowerKernelArguments() : FunctionPass(ID) {}
32 
33  bool runOnFunction(Function &F) override;
34 
35  void getAnalysisUsage(AnalysisUsage &AU) const override {
37  AU.setPreservesAll();
38  }
39 };
40 
41 } // end anonymous namespace
42 
43 // skip allocas
45  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
46  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
47  AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
48 
49  // If this is a dynamic alloca, the value may depend on the loaded kernargs,
50  // so loads will need to be inserted before it.
51  if (!AI || !AI->isStaticAlloca())
52  break;
53  }
54 
55  return InsPt;
56 }
57 
59  CallingConv::ID CC = F.getCallingConv();
60  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
61  return false;
62 
63  auto &TPC = getAnalysis<TargetPassConfig>();
64 
65  const TargetMachine &TM = TPC.getTM<TargetMachine>();
66  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
67  LLVMContext &Ctx = F.getParent()->getContext();
68  const DataLayout &DL = F.getParent()->getDataLayout();
69  BasicBlock &EntryBlock = *F.begin();
70  IRBuilder<> Builder(&*getInsertPt(EntryBlock));
71 
72  const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
73  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
74 
75  Align MaxAlign;
76  // FIXME: Alignment is broken broken with explicit arg offset.;
77  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
78  if (TotalKernArgSize == 0)
79  return false;
80 
81  CallInst *KernArgSegment =
82  Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
83  nullptr, F.getName() + ".kernarg.segment");
84 
85  KernArgSegment->addRetAttr(Attribute::NonNull);
86  KernArgSegment->addRetAttr(
87  Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
88 
89  unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
90  uint64_t ExplicitArgOffset = 0;
91 
92  for (Argument &Arg : F.args()) {
93  const bool IsByRef = Arg.hasByRefAttr();
94  Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
95  MaybeAlign ABITypeAlign = IsByRef ? Arg.getParamAlign() : None;
96  if (!ABITypeAlign)
97  ABITypeAlign = DL.getABITypeAlign(ArgTy);
98 
99  uint64_t Size = DL.getTypeSizeInBits(ArgTy);
100  uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
101 
102  uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
103  ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
104 
105  if (Arg.use_empty())
106  continue;
107 
108  // If this is byval, the loads are already explicit in the function. We just
109  // need to rewrite the pointer values.
110  if (IsByRef) {
111  Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
112  Builder.getInt8Ty(), KernArgSegment, EltOffset,
113  Arg.getName() + ".byval.kernarg.offset");
114 
115  Value *CastOffsetPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
116  ArgOffsetPtr, Arg.getType());
117  Arg.replaceAllUsesWith(CastOffsetPtr);
118  continue;
119  }
120 
121  if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
122  // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
123  // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
124  // can't represent this with range metadata because it's only allowed for
125  // integer types.
126  if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
127  PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
128  !ST.hasUsableDSOffset())
129  continue;
130 
131  // FIXME: We can replace this with equivalent alias.scope/noalias
132  // metadata, but this appears to be a lot of work.
133  if (Arg.hasNoAliasAttr())
134  continue;
135  }
136 
137  auto *VT = dyn_cast<FixedVectorType>(ArgTy);
138  bool IsV3 = VT && VT->getNumElements() == 3;
139  bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
140 
141  VectorType *V4Ty = nullptr;
142 
143  int64_t AlignDownOffset = alignDown(EltOffset, 4);
144  int64_t OffsetDiff = EltOffset - AlignDownOffset;
145  Align AdjustedAlign = commonAlignment(
146  KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);
147 
148  Value *ArgPtr;
149  Type *AdjustedArgTy;
150  if (DoShiftOpt) { // FIXME: Handle aggregate types
151  // Since we don't have sub-dword scalar loads, avoid doing an extload by
152  // loading earlier than the argument address, and extracting the relevant
153  // bits.
154  //
155  // Additionally widen any sub-dword load to i32 even if suitably aligned,
156  // so that CSE between different argument loads works easily.
157  ArgPtr = Builder.CreateConstInBoundsGEP1_64(
158  Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
159  Arg.getName() + ".kernarg.offset.align.down");
160  AdjustedArgTy = Builder.getInt32Ty();
161  } else {
162  ArgPtr = Builder.CreateConstInBoundsGEP1_64(
163  Builder.getInt8Ty(), KernArgSegment, EltOffset,
164  Arg.getName() + ".kernarg.offset");
165  AdjustedArgTy = ArgTy;
166  }
167 
168  if (IsV3 && Size >= 32) {
169  V4Ty = FixedVectorType::get(VT->getElementType(), 4);
170  // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
171  AdjustedArgTy = V4Ty;
172  }
173 
174  ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
175  ArgPtr->getName() + ".cast");
176  LoadInst *Load =
177  Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
178  Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
179 
180  MDBuilder MDB(Ctx);
181 
182  if (isa<PointerType>(ArgTy)) {
183  if (Arg.hasNonNullAttr())
184  Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
185 
186  uint64_t DerefBytes = Arg.getDereferenceableBytes();
187  if (DerefBytes != 0) {
188  Load->setMetadata(
189  LLVMContext::MD_dereferenceable,
190  MDNode::get(Ctx,
191  MDB.createConstant(
192  ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
193  }
194 
195  uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
196  if (DerefOrNullBytes != 0) {
197  Load->setMetadata(
198  LLVMContext::MD_dereferenceable_or_null,
199  MDNode::get(Ctx,
200  MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
201  DerefOrNullBytes))));
202  }
203 
204  unsigned ParamAlign = Arg.getParamAlignment();
205  if (ParamAlign != 0) {
206  Load->setMetadata(
207  LLVMContext::MD_align,
208  MDNode::get(Ctx,
209  MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
210  ParamAlign))));
211  }
212  }
213 
214  // TODO: Convert noalias arg to !noalias
215 
216  if (DoShiftOpt) {
217  Value *ExtractBits = OffsetDiff == 0 ?
218  Load : Builder.CreateLShr(Load, OffsetDiff * 8);
219 
220  IntegerType *ArgIntTy = Builder.getIntNTy(Size);
221  Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
222  Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
223  Arg.getName() + ".load");
224  Arg.replaceAllUsesWith(NewVal);
225  } else if (IsV3) {
226  Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
227  Arg.getName() + ".load");
228  Arg.replaceAllUsesWith(Shuf);
229  } else {
230  Load->setName(Arg.getName() + ".load");
231  Arg.replaceAllUsesWith(Load);
232  }
233  }
234 
235  KernArgSegment->addRetAttr(
236  Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
237 
238  return true;
239 }
240 
241 INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
242  "AMDGPU Lower Kernel Arguments", false, false)
243 INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
245 
246 char AMDGPULowerKernelArguments::ID = 0;
247 
249  return new AMDGPULowerKernelArguments();
250 }
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
llvm::alignTo
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:148
llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:29
llvm
---------------------— PointerInfo ------------------------------------—
Definition: AllocatorList.h:23
DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPULowerKernelArguments.cpp:21
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:112
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
llvm::Function
Definition: Function.h:61
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:734
llvm::IRBuilder<>
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1208
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::CallingConv::AMDGPU_KERNEL
@ AMDGPU_KERNEL
Calling convention for AMDGPU code object kernels.
Definition: CallingConv.h:216
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:206
llvm::AllocaInst::isStaticAlloca
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Definition: Instructions.cpp:1397
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments", false, false) INITIALIZE_PASS_END(AMDGPULowerKernelArguments
TargetMachine.h
GCNSubtarget.h
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
false
Definition: StackSlotColoring.cpp:142
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
llvm::alignDown
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:753
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
MDBuilder.h
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:900
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:648
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::None
const NoneType None
Definition: None.h:23
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition: TargetPassConfig.h:84
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
uint64_t
llvm::omp::Kernel
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition: OpenMPOpt.h:21
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::Attribute::getWithDereferenceableBytes
static Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes)
Definition: Attributes.cpp:176
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:632
llvm::CallBase::addRetAttr
void addRetAttr(Attribute::AttrKind Kind)
Adds the attribute to the return value.
Definition: InstrTypes.h:1510
TargetPassConfig.h
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:354
IRBuilder.h
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:79
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:651
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:351
llvm::ArrayRef< int >
AMDGPU.h
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:175
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::Attribute::getWithAlignment
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:166
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
llvm::commonAlignment
Align commonAlignment(Align A, Align B)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:211
llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130
llvm::createAMDGPULowerKernelArgumentsPass
FunctionPass * createAMDGPULowerKernelArgumentsPass()
Definition: AMDGPULowerKernelArguments.cpp:248
Arguments
AMDGPU Lower Kernel Arguments
Definition: AMDGPULowerKernelArguments.cpp:243
llvm::Type::getPointerTo
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:738
llvm::MDBuilder
Definition: MDBuilder.h:35
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
getInsertPt
static BasicBlock::iterator getInsertPt(BasicBlock &BB)
Definition: AMDGPULowerKernelArguments.cpp:44
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1475
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::Type::isAggregateType
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition: Type.h:267
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::AllocaInst
an instruction to allocate memory on the stack
Definition: Instructions.h:62
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37