LLVM 23.0.0git
AMDGPUSubtarget.h
Go to the documentation of this file.
1//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Base class for AMDGPU specific classes of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
18#include "llvm/IR/CallingConv.h"
21
22namespace llvm {
23
24enum AMDGPUDwarfFlavour : unsigned;
25class Function;
26class Instruction;
27class MachineFunction;
28class TargetMachine;
29
31public:
47
48private:
49 const Triple &TargetTriple;
50
51protected:
52 bool HasMulI24 = true;
53 bool HasMulU24 = true;
54 bool HasSMulHi = false;
55 bool HasFminFmaxLegacy = true;
56
57 unsigned EUsPerCU = 4;
58 unsigned MaxWavesPerEU = 10;
59 unsigned LocalMemorySize = 0;
62 unsigned FlatOffsetBitWidth = 0;
63
64public:
65 AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}
66
67 static const AMDGPUSubtarget &get(const MachineFunction &MF);
68 static const AMDGPUSubtarget &get(const TargetMachine &TM,
69 const Function &F);
70
71 /// \returns Default range of flat work group sizes for a calling convention.
72 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
73
74 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
75 /// for function \p F, or minimum/maximum flat work group sizes explicitly
76 /// requested using "amdgpu-flat-work-group-size" attribute attached to
77 /// function \p F.
78 ///
79 /// \returns Subtarget's default values if explicitly requested values cannot
80 /// be converted to integer, or violate subtarget's specifications.
81 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
82
83 /// \returns true if the maximum flat work-group size for \p F is at most the
84 /// wavefront size, so a work-group may fit in a single wavefront.
85 bool isSingleWavefrontWorkgroup(const Function &F) const;
86
87 /// \returns The required size of workgroups that will be used to execute \p F
88 /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
89 /// metadata). Otherwise, returns std::nullopt.
90 std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
91 unsigned Dim) const;
92
93 /// \returns true if \p F will execute in a manner that leaves the X
94 /// dimensions of the workitem ID evenly tiling wavefronts - that is, if X /
95 /// wavefrontsize is uniform. This is true if either the Y and Z block
96 /// dimensions are known to always be 1 or if the X dimension will always be a
97 /// power of 2. If \p REquiresUniformYZ is true, it also ensures that the Y and
98 /// Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
99 /// wavesize64 would ordinarily pass this test, it won't with
100 /// \p REquiresUniformYZ).
101 ///
102 /// This information is currently only gathered from the !reqd_work_group_size
103 /// metadata on \p F, but this may be improved in the future.
105 bool REquiresUniformYZ = false) const;
106
107 /// \returns Subtarget's default pair of minimum/maximum number of waves per
108 /// execution unit for function \p F, or minimum/maximum number of waves per
109 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
110 /// attached to function \p F.
111 ///
112 /// \returns Subtarget's default values if explicitly requested values cannot
113 /// be converted to integer, violate subtarget's specifications, or are not
114 /// compatible with minimum/maximum number of waves limited by flat work group
115 /// size, register usage, and/or lds usage.
116 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
117
118 /// Overload which uses the specified values for the flat workgroup sizes and
119 /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
120 /// should correspond to the function's value for getFlatWorkGroupSizes and \p
121 /// LDSBytes to the per-workgroup LDS allocation.
122 std::pair<unsigned, unsigned>
123 getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
124 unsigned LDSBytes, const Function &F) const;
125
126 /// Returns the target minimum/maximum number of waves per EU. This is based
127 /// on the minimum/maximum number of \p RequestedWavesPerEU and further
128 /// limited by the maximum achievable occupancy derived from the range of \p
129 /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
130 std::pair<unsigned, unsigned>
131 getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
132 std::pair<unsigned, unsigned> FlatWorkGroupSizes,
133 unsigned LDSBytes) const;
134
135 /// Return the amount of LDS that can be used that will not restrict the
136 /// occupancy lower than WaveCount.
137 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
138 const Function &) const;
139
140 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
141 /// be achieved when the only function running on a CU is \p F and each
142 /// workgroup running the function requires \p LDSBytes bytes of LDS space.
143 /// This notably depends on the range of allowed flat group sizes for the
144 /// function and hardware characteristics.
145 std::pair<unsigned, unsigned>
149
150 /// Overload which uses the specified values for the flat work group sizes,
151 /// rather than querying the function itself. \p FlatWorkGroupSizes should
152 /// correspond to the function's value for getFlatWorkGroupSizes.
153 std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
154 uint32_t LDSBytes,
155 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
156
157 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
158 /// be achieved when the only function running on a CU is \p MF. This notably
159 /// depends on the range of allowed flat group sizes for the function, the
160 /// amount of per-workgroup LDS space required by the function, and hardware
161 /// characteristics.
162 std::pair<unsigned, unsigned>
164
165 bool isAmdHsaOS() const {
166 return TargetTriple.getOS() == Triple::AMDHSA;
167 }
168
169 bool isAmdPalOS() const {
170 return TargetTriple.getOS() == Triple::AMDPAL;
171 }
172
173 bool isMesa3DOS() const {
174 return TargetTriple.getOS() == Triple::Mesa3D;
175 }
176
177 bool isMesaKernel(const Function &F) const;
178
179 bool isAmdHsaOrMesa(const Function &F) const {
180 return isAmdHsaOS() || isMesaKernel(F);
181 }
182
183 bool isGCN() const { return TargetTriple.isAMDGCN(); }
184
185 //==---------------------------------------------------------------------===//
186 // TableGen-generated feature getters.
187 //==---------------------------------------------------------------------===//
188#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
189 virtual bool GETTER() const { return false; }
190#include "AMDGPUGenSubtargetInfo.inc"
191 //==---------------------------------------------------------------------===//
192
193 /// Return true if real (non-fake) variants of True16 instructions using
194 /// 16-bit registers should be code-generated. Fake True16 instructions are
195 /// identical to non-fake ones except that they take 32-bit registers as
196 /// operands and always use their low halves.
197 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
198 // supported and the support for fake True16 instructions is removed.
199 bool useRealTrue16Insts() const {
200 return hasTrue16BitInsts() && enableRealTrue16Insts();
201 }
202
203 bool hasMulI24() const {
204 return HasMulI24;
205 }
206
207 bool hasMulU24() const {
208 return HasMulU24;
209 }
210
211 bool hasSMulHi() const {
212 return HasSMulHi;
213 }
214
215 bool hasFminFmaxLegacy() const {
216 return HasFminFmaxLegacy;
217 }
218
219 unsigned getWavefrontSize() const {
220 return 1 << WavefrontSizeLog2;
221 }
222
223 unsigned getWavefrontSizeLog2() const {
224 return WavefrontSizeLog2;
225 }
226
227 /// Return the maximum number of bytes of LDS available for all workgroups
228 /// running on the same WGP or CU.
229 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
230 /// limited to 64k.
231 unsigned getLocalMemorySize() const {
232 return LocalMemorySize;
233 }
234
235 /// Return the maximum number of bytes of LDS that can be allocated to a
236 /// single workgroup.
237 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
238 /// 128k in total.
241 }
242
243 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
244 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
245 /// CU mode into account.
246 unsigned getEUsPerCU() const { return EUsPerCU; }
247
249 return isAmdHsaOS() ? Align(8) : Align(4);
250 }
251
252 /// Returns the offset in bytes from the start of the input buffer
253 /// of the first explicit kernel argument.
254 unsigned getExplicitKernelArgOffset() const {
255 switch (TargetTriple.getOS()) {
256 case Triple::AMDHSA:
257 case Triple::AMDPAL:
258 case Triple::Mesa3D:
259 return 0;
261 default:
262 // For legacy reasons unknown/other is treated as a different version of
263 // mesa.
264 return 36;
265 }
266
267 llvm_unreachable("invalid triple OS");
268 }
269
270 /// \returns Maximum number of work groups per compute unit supported by the
271 /// subtarget and limited by given \p FlatWorkGroupSize.
272 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
273
274 /// \returns Minimum flat work group size supported by the subtarget.
275 virtual unsigned getMinFlatWorkGroupSize() const = 0;
276
277 /// \returns Maximum flat work group size supported by the subtarget.
278 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
279
280 /// \returns Number of waves per execution unit required to support the given
281 /// \p FlatWorkGroupSize.
282 virtual unsigned
283 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
284
285 /// \returns Minimum number of waves per execution unit supported by the
286 /// subtarget.
287 virtual unsigned getMinWavesPerEU() const = 0;
288
289 /// \returns Maximum number of waves per execution unit supported by the
290 /// subtarget without any kind of limitation.
291 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
292
293 /// Return the maximum workitem ID value in the function, for the given (0, 1,
294 /// 2) dimension.
295 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
296
297 /// Return the number of work groups for the function.
299
300 /// Return true if only a single workitem can be active in a wave.
301 bool isSingleLaneExecution(const Function &Kernel) const;
302
303 /// Creates value range metadata on a workitemid.* intrinsic call or load.
305
306 /// \returns Number of bytes of arguments that are passed to a shader or
307 /// kernel in addition to the explicit ones declared for the function.
308 unsigned getImplicitArgNumBytes(const Function &F) const;
309 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
310 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
311
312 /// \returns Corresponding DWARF register number mapping flavour for the
313 /// \p WavefrontSize.
315
316 virtual ~AMDGPUSubtarget() = default;
317};
318
319} // end namespace llvm
320
321#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file defines the SmallVector class.
bool hasFminFmaxLegacy() const
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
Align getAlignmentForImplicitArgPtr() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
virtual unsigned getMinWavesPerEU() const =0
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on an workitemid.* intrinsic call or load.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
Return the maximum number of bytes of LDS available for all workgroups running on the same WGP or CU.
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
unsigned getWavefrontSizeLog2() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
virtual ~AMDGPUSubtarget()=default
bool isAmdHsaOrMesa(const Function &F) const
AMDGPUSubtarget(const Triple &TT)
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getMaxWavesPerEU() const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
unsigned AddressableLocalMemorySize
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > RequestedWavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes, unsigned LDSBytes) const
Returns the target minimum/maximum number of waves per EU.
bool isSingleWavefrontWorkgroup(const Function &F) const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Primary interface to the complete machine description for the target machine.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39