28#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include "llvm/IR/IntrinsicsR600.h"
36#define DEBUG_TYPE "amdgpu-subtarget"
52 const unsigned WavesPerWorkgroup =
53 std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
55 const unsigned WorkGroupsPerCU =
56 std::max(1u, (NWaves *
getEUsPerCU()) / WavesPerWorkgroup);
71 if (!MaxWorkGroupsPerCu)
86 NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
89 const unsigned MaxGroupNumWaves =
divideCeil(MaxWorkGroupSize, WaveSize);
90 unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
102 "computed invalid occupancy");
112std::pair<unsigned, unsigned>
130 std::pair<unsigned, unsigned>
Default =
135 F,
"amdgpu-flat-work-group-size",
Default);
138 if (Requested.first > Requested.second)
151 std::pair<unsigned, unsigned> Requested,
152 std::pair<unsigned, unsigned> FlatWorkGroupSizes)
const {
160 unsigned MinImpliedByFlatWorkGroupSize =
162 Default.first = MinImpliedByFlatWorkGroupSize;
165 if (Requested.second && Requested.first > Requested.second)
175 if (Requested.first < MinImpliedByFlatWorkGroupSize)
182 const Function &
F, std::pair<unsigned, unsigned> FlatWorkGroupSizes)
const {
187 std::pair<unsigned, unsigned> Requested =
194 if (
Node &&
Node->getNumOperands() == 3)
195 return mdconst::extract<ConstantInt>(
Node->getOperand(Dim))->getZExtValue();
196 return std::numeric_limits<unsigned>::max();
204 unsigned Dimension)
const {
206 if (ReqdSize != std::numeric_limits<unsigned>::max())
212 for (
int I = 0;
I < 3; ++
I) {
221 Function *Kernel =
I->getParent()->getParent();
222 unsigned MinSize = 0;
224 bool IdQuery =
false;
227 if (
auto *CI = dyn_cast<CallInst>(
I)) {
228 const Function *
F = CI->getCalledFunction();
230 unsigned Dim = UINT_MAX;
231 switch (
F->getIntrinsicID()) {
232 case Intrinsic::amdgcn_workitem_id_x:
233 case Intrinsic::r600_read_tidig_x:
236 case Intrinsic::r600_read_local_size_x:
239 case Intrinsic::amdgcn_workitem_id_y:
240 case Intrinsic::r600_read_tidig_y:
243 case Intrinsic::r600_read_local_size_y:
246 case Intrinsic::amdgcn_workitem_id_z:
247 case Intrinsic::r600_read_tidig_z:
250 case Intrinsic::r600_read_local_size_z:
259 if (ReqdSize != std::numeric_limits<unsigned>::max())
260 MinSize = MaxSize = ReqdSize;
277 if (
auto *CI = dyn_cast<CallBase>(
I)) {
279 CI->addRangeRetAttr(
Range);
283 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
293 if (
F.hasFnAttribute(
"amdgpu-no-implicitarg-ptr"))
300 const Module *M =
F.getParent();
303 return F.getFnAttributeAsParsedInteger(
"amdgpu-implicitarg-num-bytes",
308 Align &MaxAlign)
const {
317 const bool IsByRef = Arg.hasByRefAttr();
318 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
319 Align Alignment =
DL.getValueOrABITypeAlignment(
320 IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
321 uint64_t AllocSize =
DL.getTypeAllocSize(ArgTy);
322 ExplicitArgBytes =
alignTo(ExplicitArgBytes, Alignment) + AllocSize;
323 MaxAlign = std::max(MaxAlign, Alignment);
326 return ExplicitArgBytes;
330 Align &MaxAlign)
const {
339 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
341 if (ImplicitBytes != 0) {
343 TotalSize =
alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
344 MaxAlign = std::max(MaxAlign, Alignment);
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the InstructionSelector class for AMDGPU.
This file declares the targeting of the MachineLegalizer class for AMDGPU.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim)
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file describes how to lower LLVM inline asm to machine code INLINEASM.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
AMDGPU R600 specific subclass of TargetSubtarget.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallString class.
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const
Inverse of getMaxLocalMemWithWaveCount.
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
bool EnableRealTrue16Insts
Align getAlignmentForImplicitArgPtr() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
virtual unsigned getMinWavesPerEU() const =0
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on a workitemid.* intrinsic call or load.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
AMDGPUSubtarget(Triple TT)
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getMaxWavesPerEU() const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > WavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
Class for arbitrary precision integers.
This class represents an incoming formal argument to a Function.
This class represents a range of values.
A parsed version of the target data layout string, and methods for querying it.
MDNode * getMetadata(unsigned KindID) const
Get the current metadata attachments for the given kind, if any.
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
A Module instance is used to store all the information related to an LLVM module.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
Triple - Helper class for working with autoconf configuration names.
ArchType getArch() const
Get the parsed architecture type of this triple.
The instances of the Type class are immutable: once they are created, they are never changed.
SmallVector< unsigned > getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isShader(CallingConv::ID cc)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
This is an optimization pass for GlobalISel generic memory operations.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
@ Default
The result values are uniform if and only if all operands are uniform.
Implement std::hash so that hash_code can be used in STL containers.
This struct is a compact representation of a valid (non-zero power of two) alignment.