LLVM  10.0.0svn
AMDGPULowerKernelArguments.cpp
Go to the documentation of this file.
1 //===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This pass replaces accesses to kernel arguments with loads from
10 /// offsets from the kernarg base pointer.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPUSubtarget.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/Analysis/Loads.h"
19 #include "llvm/CodeGen/Passes.h"
21 #include "llvm/IR/Attributes.h"
22 #include "llvm/IR/BasicBlock.h"
23 #include "llvm/IR/Constants.h"
24 #include "llvm/IR/DerivedTypes.h"
25 #include "llvm/IR/Function.h"
26 #include "llvm/IR/IRBuilder.h"
27 #include "llvm/IR/InstrTypes.h"
28 #include "llvm/IR/Instruction.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/IR/LLVMContext.h"
31 #include "llvm/IR/MDBuilder.h"
32 #include "llvm/IR/Metadata.h"
33 #include "llvm/IR/Operator.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/IR/Value.h"
36 #include "llvm/Pass.h"
37 #include "llvm/Support/Casting.h"
38 
39 #define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
40 
41 using namespace llvm;
42 
43 namespace {
44 
/// FunctionPass subclass for the kernel-argument lowering described in this
/// file's header comment. Only declarations and the analysis-usage hook live
/// in the class; runOnFunction is declared here without a body.
45 class AMDGPULowerKernelArguments : public FunctionPass{
46 public:
 // Pass identifier forwarded to the FunctionPass base constructor; it is
 // defined (as 0) near the end of this file.
47  static char ID;
48 
 // Default constructor: does nothing except hand ID to FunctionPass.
49  AMDGPULowerKernelArguments() : FunctionPass(ID) {}
50 
 // Per-function entry point (overrides the FunctionPass hook); the bool
 // result is produced by the out-of-class definition.
51  bool runOnFunction(Function &F) override;
52 
 // Fills in AU for this pass.
 // NOTE(review): this listing jumps from original line 53 to 55, so one
 // statement was lost in extraction. runOnFunction calls
 // getAnalysis<TargetPassConfig>(), so the missing line is presumably the
 // matching AU.addRequired<TargetPassConfig>() -- confirm against upstream.
53  void getAnalysisUsage(AnalysisUsage &AU) const override {
 // Mark every analysis as preserved by this pass.
55  AU.setPreservesAll();
56  }
57 };
58 
59 } // end anonymous namespace
60 
63  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
64  return false;
65 
66  auto &TPC = getAnalysis<TargetPassConfig>();
67 
68  const TargetMachine &TM = TPC.getTM<TargetMachine>();
69  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
70  LLVMContext &Ctx = F.getParent()->getContext();
71  const DataLayout &DL = F.getParent()->getDataLayout();
72  BasicBlock &EntryBlock = *F.begin();
73  IRBuilder<> Builder(&*EntryBlock.begin());
74 
75  const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
76  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
77 
78  unsigned MaxAlign;
79  // FIXME: Alignment is broken with explicit arg offset.
80  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
81  if (TotalKernArgSize == 0)
82  return false;
83 
84  CallInst *KernArgSegment =
85  Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
86  nullptr, F.getName() + ".kernarg.segment");
87 
88  KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
90  Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
91 
92  unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
93  uint64_t ExplicitArgOffset = 0;
94 
95  for (Argument &Arg : F.args()) {
96  Type *ArgTy = Arg.getType();
97  unsigned Align = DL.getABITypeAlignment(ArgTy);
98  unsigned Size = DL.getTypeSizeInBits(ArgTy);
99  unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
100 
101  uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
102  ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
103 
104  if (Arg.use_empty())
105  continue;
106 
107  if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
108  // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
109  // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
110  // can't represent this with range metadata because it's only allowed for
111  // integer types.
112  if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
113  PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
114  !ST.hasUsableDSOffset())
115  continue;
116 
117  // FIXME: We can replace this with equivalent alias.scope/noalias
118  // metadata, but this appears to be a lot of work.
119  if (Arg.hasNoAliasAttr())
120  continue;
121  }
122 
123  VectorType *VT = dyn_cast<VectorType>(ArgTy);
124  bool IsV3 = VT && VT->getNumElements() == 3;
125  bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
126 
127  VectorType *V4Ty = nullptr;
128 
129  int64_t AlignDownOffset = alignDown(EltOffset, 4);
130  int64_t OffsetDiff = EltOffset - AlignDownOffset;
131  unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset,
132  KernArgBaseAlign);
133 
134  Value *ArgPtr;
135  Type *AdjustedArgTy;
136  if (DoShiftOpt) { // FIXME: Handle aggregate types
137  // Since we don't have sub-dword scalar loads, avoid doing an extload by
138  // loading earlier than the argument address, and extracting the relevant
139  // bits.
140  //
141  // Additionally widen any sub-dword load to i32 even if suitably aligned,
142  // so that CSE between different argument loads works easily.
143  ArgPtr = Builder.CreateConstInBoundsGEP1_64(
144  Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
145  Arg.getName() + ".kernarg.offset.align.down");
146  AdjustedArgTy = Builder.getInt32Ty();
147  } else {
148  ArgPtr = Builder.CreateConstInBoundsGEP1_64(
149  Builder.getInt8Ty(), KernArgSegment, EltOffset,
150  Arg.getName() + ".kernarg.offset");
151  AdjustedArgTy = ArgTy;
152  }
153 
154  if (IsV3 && Size >= 32) {
155  V4Ty = VectorType::get(VT->getVectorElementType(), 4);
156  // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
157  AdjustedArgTy = V4Ty;
158  }
159 
160  ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
161  ArgPtr->getName() + ".cast");
162  LoadInst *Load =
163  Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
165 
166  MDBuilder MDB(Ctx);
167 
168  if (isa<PointerType>(ArgTy)) {
169  if (Arg.hasNonNullAttr())
171 
172  uint64_t DerefBytes = Arg.getDereferenceableBytes();
173  if (DerefBytes != 0) {
174  Load->setMetadata(
176  MDNode::get(Ctx,
177  MDB.createConstant(
178  ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
179  }
180 
181  uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
182  if (DerefOrNullBytes != 0) {
183  Load->setMetadata(
185  MDNode::get(Ctx,
186  MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
187  DerefOrNullBytes))));
188  }
189 
190  unsigned ParamAlign = Arg.getParamAlignment();
191  if (ParamAlign != 0) {
192  Load->setMetadata(
194  MDNode::get(Ctx,
195  MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
196  ParamAlign))));
197  }
198  }
199 
200  // TODO: Convert noalias arg to !noalias
201 
202  if (DoShiftOpt) {
203  Value *ExtractBits = OffsetDiff == 0 ?
204  Load : Builder.CreateLShr(Load, OffsetDiff * 8);
205 
206  IntegerType *ArgIntTy = Builder.getIntNTy(Size);
207  Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
208  Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
209  Arg.getName() + ".load");
210  Arg.replaceAllUsesWith(NewVal);
211  } else if (IsV3) {
212  Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
213  {0, 1, 2},
214  Arg.getName() + ".load");
215  Arg.replaceAllUsesWith(Shuf);
216  } else {
217  Load->setName(Arg.getName() + ".load");
218  Arg.replaceAllUsesWith(Load);
219  }
220  }
221 
222  KernArgSegment->addAttribute(
224  Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
225 
226  return true;
227 }
228 
229 INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
230  "AMDGPU Lower Kernel Arguments", false, false)
232  false, false)
233 
234 char AMDGPULowerKernelArguments::ID = 0;
235 
237  return new AMDGPULowerKernelArguments();
238 }
Type * getVectorElementType() const
Definition: Type.h:371
A parsed version of the target data layout string, and methods for querying it. ...
Definition: DataLayout.h:110
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
This class represents an incoming formal argument to a Function.
Definition: Argument.h:29
AMDGPU specific subclass of TargetSubtarget.
This class represents lattice values for constants.
Definition: AllocatorList.h:23
ConstantAsMetadata * createConstant(Constant *C)
Return the given constant as metadata.
Definition: MDBuilder.cpp:24
static Attribute getWithAlignment(LLVMContext &Context, uint64_t Align)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:145
static Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes)
Definition: Attributes.cpp:158
This class represents a function call, abstracting a target machine's calling convention.
This file contains the declarations for metadata subclasses.
#define DEBUG_TYPE
F(f)
uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the next integer (mod 2**64) that is greater than or equal to Value and is a multiple of Alig...
Definition: MathExtras.h:689
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:580
An instruction for reading from memory.
Definition: Instructions.h:167
void addAttribute(unsigned i, Attribute::AttrKind Kind)
adds the attribute to the list of attributes.
Definition: InstrTypes.h:1383
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:268
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:722
AnalysisUsage & addRequired()
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:369
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:244
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:654
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:779
This file contains the simple types necessary to represent the attributes associated with functions a...
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:285
uint64_t getNumElements() const
For scalable vectors, this will return the minimum number of elements in the vector.
Definition: DerivedTypes.h:393
Target-Independent Code Generator Pass Configuration Options.
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:244
bool arg_empty() const
Definition: Function.h:723
iterator begin()
Definition: Function.h:680
Class to represent pointers.
Definition: DerivedTypes.h:544
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition: MathExtras.h:614
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata *> MDs)
Definition: Metadata.h:1165
static bool runOnFunction(Function &F, bool PostInlining)
FunctionPass * createAMDGPULowerKernelArgumentsPass()
LLVM Basic Block Representation.
Definition: BasicBlock.h:57
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:64
This file contains the declarations for the subclasses of Constant, which represent the different fla...
AMDGPU Lower Kernel Arguments
Represent the analysis usage information of a pass.
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:284
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Class to represent integer types.
Definition: DerivedTypes.h:40
The AMDGPU TargetMachine interface definition for hw codegen targets.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1436
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1222
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments", false, false) INITIALIZE_PASS_END(AMDGPULowerKernelArguments
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:212
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:746
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition: Type.h:257
Address space for region memory. (GDS)
Definition: AMDGPU.h:267
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:179
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:643
Class to represent vector types.
Definition: DerivedTypes.h:427
void setPreservesAll()
Set by analyses that do not transform their input at all.
uint64_t getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:601
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:469
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:175
Address space for local memory.
Definition: AMDGPU.h:270
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:609
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:214
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:332
uint32_t Size
Definition: Profile.cpp:46
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:575
LLVM Value Representation.
Definition: Value.h:72
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:65
Calling convention for AMDGPU code object kernels.
Definition: CallingConv.h:200
iterator_range< arg_iterator > args()
Definition: Function.h:713