LLVM 23.0.0git
AMDGPUSubtarget.h
Go to the documentation of this file.
1//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Base class for AMDGPU specific classes of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
18#include "llvm/IR/CallingConv.h"
21
22namespace llvm {
23
24enum AMDGPUDwarfFlavour : unsigned;
25class Function;
26class Instruction;
27class MachineFunction;
28class TargetMachine;
29
31public:
47
48private:
49 Triple TargetTriple;
50
51protected:
// Instruction-availability flags. Each one is returned unchanged by the
// accessor of the same name declared further down (hasMulI24(), hasMulU24(),
// hasSMulHi(), hasFminFmaxLegacy()). The initializers here are only the
// defaults; being protected, they can be changed by derived subtargets.
52 bool HasMulI24 = true;
53 bool HasMulU24 = true;
54 bool HasSMulHi = false;
55 bool HasFminFmaxLegacy = true;
56
// Sizing parameters returned by getEUsPerCU(), getMaxWavesPerEU() and
// getLocalMemorySize() respectively.
57 unsigned EUsPerCU = 4;
58 unsigned MaxWavesPerEU = 10;
59 unsigned LocalMemorySize = 0;
62
63public:
64 AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}
65
66 static const AMDGPUSubtarget &get(const MachineFunction &MF);
67 static const AMDGPUSubtarget &get(const TargetMachine &TM,
68 const Function &F);
69
70 /// \returns Default range of flat work group sizes for a calling convention.
71 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
72
73 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
74 /// for function \p F, or minimum/maximum flat work group sizes explicitly
75 /// requested using "amdgpu-flat-work-group-size" attribute attached to
76 /// function \p F.
77 ///
78 /// \returns Subtarget's default values if explicitly requested values cannot
79 /// be converted to integer, or violate subtarget's specifications.
80 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
81
82 /// \returns The required size of workgroups that will be used to execute \p F
83 /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
84 /// metadata). Otherwise, returns std::nullopt.
85 std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
86 unsigned Dim) const;
87
88 /// \returns true if \p F will execute in a manner that leaves the X
89 /// dimensions of the workitem ID evenly tiling wavefronts - that is, if X /
90 /// wavefrontsize is uniform. This is true if either the Y and Z block
91 /// dimensions are known to always be 1 or if the X dimension will always be a
92 /// power of 2. If \p RequireUniformYZ is true, it also ensures that the Y and
93 /// Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
94 /// wavesize64 would ordinarily pass this test, it won't with
95 /// \p RequireUniformYZ).
96 ///
97 /// This information is currently only gathered from the !reqd_work_group_size
98 /// metadata on \p F, but this may be improved in the future.
100 bool REquiresUniformYZ = false) const;
101
102 /// \returns Subtarget's default pair of minimum/maximum number of waves per
103 /// execution unit for function \p F, or minimum/maximum number of waves per
104 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
105 /// attached to function \p F.
106 ///
107 /// \returns Subtarget's default values if explicitly requested values cannot
108 /// be converted to integer, violate subtarget's specifications, or are not
109 /// compatible with minimum/maximum number of waves limited by flat work group
110 /// size, register usage, and/or lds usage.
111 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
112
113 /// Overload which uses the specified values for the flat work group sizes,
114 /// rather than querying the function itself. \p FlatWorkGroupSizes should
115 /// correspond to the function's value for getFlatWorkGroupSizes.
116 std::pair<unsigned, unsigned>
118 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
119
120 /// Overload which uses the specified values for the flat workgroup sizes and
121 /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
122 /// should correspond to the function's value for getFlatWorkGroupSizes and \p
123 /// LDSBytes to the per-workgroup LDS allocation.
124 std::pair<unsigned, unsigned>
125 getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
126 unsigned LDSBytes, const Function &F) const;
127
128 /// Returns the target minimum/maximum number of waves per EU. This is based
129 /// on the minimum/maximum number of \p RequestedWavesPerEU and further
130 /// limited by the maximum achievable occupancy derived from the range of \p
131 /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
132 std::pair<unsigned, unsigned>
133 getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
134 std::pair<unsigned, unsigned> FlatWorkGroupSizes,
135 unsigned LDSBytes) const;
136
137 /// Return the amount of LDS that can be used that will not restrict the
138 /// occupancy lower than WaveCount.
139 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
140 const Function &) const;
141
142 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
143 /// be achieved when the only function running on a CU is \p F and each
144 /// workgroup running the function requires \p LDSBytes bytes of LDS space.
145 /// This notably depends on the range of allowed flat group sizes for the
146 /// function and hardware characteristics.
147 std::pair<unsigned, unsigned>
151
152 /// Overload which uses the specified values for the flat work group sizes,
153 /// rather than querying the function itself. \p FlatWorkGroupSizes should
154 /// correspond to the function's value for getFlatWorkGroupSizes.
155 std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
156 uint32_t LDSBytes,
157 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
158
159 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
160 /// be achieved when the only function running on a CU is \p MF. This notably
161 /// depends on the range of allowed flat group sizes for the function, the
162 /// amount of per-workgroup LDS space required by the function, and hardware
163 /// characteristics.
164 std::pair<unsigned, unsigned>
166
167 bool isAmdHsaOS() const {
168 return TargetTriple.getOS() == Triple::AMDHSA;
169 }
170
171 bool isAmdPalOS() const {
172 return TargetTriple.getOS() == Triple::AMDPAL;
173 }
174
175 bool isMesa3DOS() const {
176 return TargetTriple.getOS() == Triple::Mesa3D;
177 }
178
179 bool isMesaKernel(const Function &F) const;
180
181 bool isAmdHsaOrMesa(const Function &F) const {
182 return isAmdHsaOS() || isMesaKernel(F);
183 }
184
185 bool isGCN() const { return TargetTriple.isAMDGCN(); }
186
187 //==---------------------------------------------------------------------===//
188 // TableGen-generated feature getters.
// The macro below is defined immediately before the generated file is
// included. As defined here, each expansion declares a virtual getter named
// GETTER that returns false; the ATTRIBUTE and DEFAULT arguments are not used
// by this expansion.
// NOTE(review): presumably the included file invokes the macro once per
// subtarget feature and derived subtargets override the getters with real
// values -- confirm against the generated .inc and the subclasses.
189 //==---------------------------------------------------------------------===//
190#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
191 virtual bool GETTER() const { return false; }
192#include "AMDGPUGenSubtargetInfo.inc"
193 //==---------------------------------------------------------------------===//
194
195 /// Return true if real (non-fake) variants of True16 instructions using
196 /// 16-bit registers should be code-generated. Fake True16 instructions are
197 /// identical to non-fake ones except that they take 32-bit registers as
198 /// operands and always use their low halves.
199 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
200 // supported and the support for fake True16 instructions is removed.
201 bool useRealTrue16Insts() const {
202 return hasTrue16BitInsts() && enableRealTrue16Insts();
203 }
204
205 bool hasMulI24() const {
206 return HasMulI24;
207 }
208
209 bool hasMulU24() const {
210 return HasMulU24;
211 }
212
213 bool hasSMulHi() const {
214 return HasSMulHi;
215 }
216
217 bool hasFminFmaxLegacy() const {
218 return HasFminFmaxLegacy;
219 }
220
221 unsigned getWavefrontSize() const {
222 return 1 << WavefrontSizeLog2;
223 }
224
225 unsigned getWavefrontSizeLog2() const {
226 return WavefrontSizeLog2;
227 }
228
229 /// Return the maximum number of bytes of LDS available for all workgroups
230 /// running on the same WGP or CU.
231 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
232 /// limited to 64k.
233 unsigned getLocalMemorySize() const {
234 return LocalMemorySize;
235 }
236
237 /// Return the maximum number of bytes of LDS that can be allocated to a
238 /// single workgroup.
239 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
240 /// 128k in total.
243 }
244
245 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
246 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
247 /// CU mode into account.
248 unsigned getEUsPerCU() const { return EUsPerCU; }
249
251 return isAmdHsaOS() ? Align(8) : Align(4);
252 }
253
254 /// Returns the offset in bytes from the start of the input buffer
255 /// of the first explicit kernel argument.
256 unsigned getExplicitKernelArgOffset() const {
257 switch (TargetTriple.getOS()) {
258 case Triple::AMDHSA:
259 case Triple::AMDPAL:
260 case Triple::Mesa3D:
261 return 0;
263 default:
264 // For legacy reasons unknown/other is treated as a different version of
265 // mesa.
266 return 36;
267 }
268
269 llvm_unreachable("invalid triple OS");
270 }
271
272 /// \returns Maximum number of work groups per compute unit supported by the
273 /// subtarget and limited by given \p FlatWorkGroupSize.
274 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
275
276 /// \returns Minimum flat work group size supported by the subtarget.
277 virtual unsigned getMinFlatWorkGroupSize() const = 0;
278
279 /// \returns Maximum flat work group size supported by the subtarget.
280 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
281
282 /// \returns Number of waves per execution unit required to support the given
283 /// \p FlatWorkGroupSize.
284 virtual unsigned
285 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
286
287 /// \returns Minimum number of waves per execution unit supported by the
288 /// subtarget.
289 virtual unsigned getMinWavesPerEU() const = 0;
290
291 /// \returns Maximum number of waves per execution unit supported by the
292 /// subtarget without any kind of limitation.
293 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
294
295 /// Return the maximum workitem ID value in the function, for the given (0, 1,
296 /// 2) dimension.
297 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
298
299 /// Return the number of work groups for the function.
301
302 /// Return true if only a single workitem can be active in a wave.
303 bool isSingleLaneExecution(const Function &Kernel) const;
304
305 /// Creates value range metadata on a workitemid.* intrinsic call or load.
307
308 /// \returns Number of bytes of arguments that are passed to a shader or
309 /// kernel in addition to the explicit ones declared for the function.
310 unsigned getImplicitArgNumBytes(const Function &F) const;
311 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
312 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
313
314 /// \returns Corresponding DWARF register number mapping flavour for the
315 /// \p WavefrontSize.
317
318 virtual ~AMDGPUSubtarget() = default;
319};
320
321} // end namespace llvm
322
323#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file defines the SmallVector class.
bool hasFminFmaxLegacy() const
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
Align getAlignmentForImplicitArgPtr() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
virtual unsigned getMinWavesPerEU() const =0
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on an workitemid.* intrinsic call or load.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
Return the maximum number of bytes of LDS available for all workgroups running on the same WGP or CU.
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
unsigned getWavefrontSizeLog2() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
virtual ~AMDGPUSubtarget()=default
bool isAmdHsaOrMesa(const Function &F) const
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getMaxWavesPerEU() const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
unsigned AddressableLocalMemorySize
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > RequestedWavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes, unsigned LDSBytes) const
Returns the target minimum/maximum number of waves per EU.
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const
Overload which uses the specified values for the flat work group sizes, rather than querying the func...
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Primary interface to the complete machine description for the target machine.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1915
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39