LLVM 20.0.0git
AMDGPUSubtarget.cpp
Go to the documentation of this file.
1//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Implements the AMDGPU specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUSubtarget.h"
15#include "AMDGPUCallLowering.h"
17#include "AMDGPULegalizerInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "R600Subtarget.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include "llvm/IR/IntrinsicsR600.h"
30#include "llvm/IR/MDBuilder.h"
32#include <algorithm>
33
34using namespace llvm;
35
36#define DEBUG_TYPE "amdgpu-subtarget"
37
39
42}
43
44// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
45// allows the given function to achieve an occupancy of NWaves waves per
46// SIMD / EU, taking into account only the function's *maximum* workgroup size.
47unsigned
49 const Function &F) const {
50 const unsigned WaveSize = getWavefrontSize();
51 const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
52 const unsigned WavesPerWorkgroup =
53 std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
54
55 const unsigned WorkGroupsPerCU =
56 std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
57
58 return getLocalMemorySize() / WorkGroupsPerCU;
59}
60
61// FIXME: Should return min,max range.
62//
63// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
64// be achieved when only the given function is running on the machine; and
65// taking into account the overall number of wave slots, the (maximum) workgroup
66// size, and the per-workgroup LDS allocation size.
68 const Function &F) const {
69 const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
70 const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
71 if (!MaxWorkGroupsPerCu)
72 return 0;
73
74 const unsigned WaveSize = getWavefrontSize();
75
76 // FIXME: Do we need to account for alignment requirement of LDS rounding the
77 // size up?
78 // Compute restriction based on LDS usage
79 unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
80
81 // This can be queried with more LDS than is possible, so just assume the
82 // worst.
83 if (NumGroups == 0)
84 return 1;
85
86 NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
87
88 // Round to the number of waves per CU.
89 const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
90 unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
91
92 // Number of waves per EU (SIMD).
93 MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
94
95 // Clamp to the maximum possible number of waves.
96 MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
97
98 // FIXME: Needs to be a multiple of the group size?
99 //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
100
101 assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
102 "computed invalid occupancy");
103 return MaxWaves;
104}
105
106unsigned
108 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
109 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
110}
111
112std::pair<unsigned, unsigned>
114 switch (CC) {
121 return std::pair(1, getWavefrontSize());
122 default:
123 return std::pair(1u, getMaxFlatWorkGroupSize());
124 }
125}
126
127std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
128 const Function &F) const {
129 // Default minimum/maximum flat work group sizes.
130 std::pair<unsigned, unsigned> Default =
131 getDefaultFlatWorkGroupSize(F.getCallingConv());
132
133 // Requested minimum/maximum flat work group sizes.
134 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
135 F, "amdgpu-flat-work-group-size", Default);
136
137 // Make sure requested minimum is less than requested maximum.
138 if (Requested.first > Requested.second)
139 return Default;
140
141 // Make sure requested values do not violate subtarget's specifications.
142 if (Requested.first < getMinFlatWorkGroupSize())
143 return Default;
144 if (Requested.second > getMaxFlatWorkGroupSize())
145 return Default;
146
147 return Requested;
148}
149
150std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
151 std::pair<unsigned, unsigned> Requested,
152 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
153 // Default minimum/maximum number of waves per execution unit.
154 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
155
156 // If minimum/maximum flat work group sizes were explicitly requested using
157 // "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum
158 // number of waves per execution unit to values implied by requested
159 // minimum/maximum flat work group sizes.
160 unsigned MinImpliedByFlatWorkGroupSize =
161 getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
162 Default.first = MinImpliedByFlatWorkGroupSize;
163
164 // Make sure requested minimum is less than requested maximum.
165 if (Requested.second && Requested.first > Requested.second)
166 return Default;
167
168 // Make sure requested values do not violate subtarget's specifications.
169 if (Requested.first < getMinWavesPerEU() ||
170 Requested.second > getMaxWavesPerEU())
171 return Default;
172
173 // Make sure requested values are compatible with values implied by requested
174 // minimum/maximum flat work group sizes.
175 if (Requested.first < MinImpliedByFlatWorkGroupSize)
176 return Default;
177
178 return Requested;
179}
180
181std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
182 const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
183 // Default minimum/maximum number of waves per execution unit.
184 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
185
186 // Requested minimum/maximum number of waves per execution unit.
187 std::pair<unsigned, unsigned> Requested =
188 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
189 return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
190}
191
192static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
193 auto Node = Kernel.getMetadata("reqd_work_group_size");
194 if (Node && Node->getNumOperands() == 3)
195 return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
196 return std::numeric_limits<unsigned>::max();
197}
198
200 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
201}
202
204 unsigned Dimension) const {
205 unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
206 if (ReqdSize != std::numeric_limits<unsigned>::max())
207 return ReqdSize - 1;
208 return getFlatWorkGroupSizes(Kernel).second - 1;
209}
210
212 for (int I = 0; I < 3; ++I) {
213 if (getMaxWorkitemID(Func, I) > 0)
214 return false;
215 }
216
217 return true;
218}
219
221 Function *Kernel = I->getParent()->getParent();
222 unsigned MinSize = 0;
223 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
224 bool IdQuery = false;
225
226 // If reqd_work_group_size is present it narrows value down.
227 if (auto *CI = dyn_cast<CallInst>(I)) {
228 const Function *F = CI->getCalledFunction();
229 if (F) {
230 unsigned Dim = UINT_MAX;
231 switch (F->getIntrinsicID()) {
232 case Intrinsic::amdgcn_workitem_id_x:
233 case Intrinsic::r600_read_tidig_x:
234 IdQuery = true;
235 [[fallthrough]];
236 case Intrinsic::r600_read_local_size_x:
237 Dim = 0;
238 break;
239 case Intrinsic::amdgcn_workitem_id_y:
240 case Intrinsic::r600_read_tidig_y:
241 IdQuery = true;
242 [[fallthrough]];
243 case Intrinsic::r600_read_local_size_y:
244 Dim = 1;
245 break;
246 case Intrinsic::amdgcn_workitem_id_z:
247 case Intrinsic::r600_read_tidig_z:
248 IdQuery = true;
249 [[fallthrough]];
250 case Intrinsic::r600_read_local_size_z:
251 Dim = 2;
252 break;
253 default:
254 break;
255 }
256
257 if (Dim <= 3) {
258 unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
259 if (ReqdSize != std::numeric_limits<unsigned>::max())
260 MinSize = MaxSize = ReqdSize;
261 }
262 }
263 }
264
265 if (!MaxSize)
266 return false;
267
268 // Range metadata is [Lo, Hi). For ID query we need to pass max size
269 // as Hi. For size query we need to pass Hi + 1.
270 if (IdQuery)
271 MinSize = 0;
272 else
273 ++MaxSize;
274
275 APInt Lower{32, MinSize};
276 APInt Upper{32, MaxSize};
277 if (auto *CI = dyn_cast<CallBase>(I)) {
279 CI->addRangeRetAttr(Range);
280 } else {
281 MDBuilder MDB(I->getContext());
282 MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
283 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
284 }
285 return true;
286}
287
289 assert(AMDGPU::isKernel(F.getCallingConv()));
290
291 // We don't allocate the segment if we know the implicit arguments weren't
292 // used, even if the ABI implies we need them.
293 if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
294 return 0;
295
296 if (isMesaKernel(F))
297 return 16;
298
299 // Assume all implicit inputs are used by default
300 const Module *M = F.getParent();
301 unsigned NBytes =
303 return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
304 NBytes);
305}
306
308 Align &MaxAlign) const {
309 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
310 F.getCallingConv() == CallingConv::SPIR_KERNEL);
311
312 const DataLayout &DL = F.getDataLayout();
313 uint64_t ExplicitArgBytes = 0;
314 MaxAlign = Align(1);
315
316 for (const Argument &Arg : F.args()) {
317 const bool IsByRef = Arg.hasByRefAttr();
318 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
319 Align Alignment = DL.getValueOrABITypeAlignment(
320 IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
321 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
322 ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
323 MaxAlign = std::max(MaxAlign, Alignment);
324 }
325
326 return ExplicitArgBytes;
327}
328
330 Align &MaxAlign) const {
331 if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
332 F.getCallingConv() != CallingConv::SPIR_KERNEL)
333 return 0;
334
335 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
336
337 unsigned ExplicitOffset = getExplicitKernelArgOffset();
338
339 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
340 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
341 if (ImplicitBytes != 0) {
342 const Align Alignment = getAlignmentForImplicitArgPtr();
343 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
344 MaxAlign = std::max(MaxAlign, Alignment);
345 }
346
347 // Being able to dereference past the end is useful for emitting scalar loads.
348 return alignTo(TotalSize, 4);
349}
350
354}
355
358 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
359 return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
360}
361
363 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
364 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
365 return static_cast<const AMDGPUSubtarget &>(
366 TM.getSubtarget<R600Subtarget>(F));
367}
368
371 return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
372}
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the InstructionSelector class for AMDGPU.
This file declares the targeting of the MachineLegalizer class for AMDGPU.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim)
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file describes how to lower LLVM inline asm to machine code INLINEASM.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
if(PassOpts->AAPipeline)
AMDGPU R600 specific subclass of TargetSubtarget.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallString class.
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const
Inverse of getMaxLocalMemWithWaveCount.
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
Align getAlignmentForImplicitArgPtr() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
virtual unsigned getMinWavesPerEU() const =0
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on an workitemid.* intrinsic call or load.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getMaxWavesPerEU() const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > WavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
Class for arbitrary precision integers.
Definition: APInt.h:77
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
This class represents a range of values.
Definition: ConstantRange.h:47
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
MDNode * getMetadata(unsigned KindID) const
Get the current metadata attachments for the given kind, if any.
Definition: Value.h:565
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:97
Metadata node.
Definition: Metadata.h:1069
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
const Triple & getTargetTriple() const
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:373
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
SmallVector< unsigned > getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isShader(CallingConv::ID cc)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1856
@ Default
The result values are uniform if and only if all operands are uniform.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39