LLVM 19.0.0git
AMDGPUTargetTransformInfo.h
Go to the documentation of this file.
1//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file describes a TargetTransformInfo::Concept conforming object specific to the
11/// AMDGPU target machine. It uses the target's detailed information to
12/// provide more precise answers to certain TTI queries, while letting the
13/// target independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
18#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
19
#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include <optional>
23
24namespace llvm {
25
26class AMDGPUTargetMachine;
27class GCNSubtarget;
28class InstCombiner;
29class Loop;
30class ScalarEvolution;
31class SITargetLowering;
32class Type;
33class Value;
34
35class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
38
39 friend BaseT;
40
41 Triple TargetTriple;
42
43 const TargetSubtargetInfo *ST;
44 const TargetLoweringBase *TLI;
45
46 const TargetSubtargetInfo *getST() const { return ST; }
47 const TargetLoweringBase *getTLI() const { return TLI; }
48
49public:
50 explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
51
55
58
60};
61
62class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
65
66 friend BaseT;
67
68 const GCNSubtarget *ST;
69 const SITargetLowering *TLI;
70 AMDGPUTTIImpl CommonTTI;
71 bool IsGraphics;
72 bool HasFP32Denormals;
73 bool HasFP64FP16Denormals;
74 static constexpr bool InlinerVectorBonusPercent = 0;
75
76 static const FeatureBitset InlineFeatureIgnoreList;
77
78 const GCNSubtarget *getST() const { return ST; }
79 const SITargetLowering *getTLI() const { return TLI; }
80
81 static inline int getFullRateInstrCost() {
83 }
84
85 static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
86 return CostKind == TTI::TCK_CodeSize ? 2
88 }
89
90 // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
91 // should be 2 or 4.
92 static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
93 return CostKind == TTI::TCK_CodeSize ? 2
95 }
96
97 // On some parts, normal fp64 operations are half rate, and others
98 // quarter. This also applies to some integer operations.
99 int get64BitInstrCost(TTI::TargetCostKind CostKind) const;
100
101 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;
102
103public:
104 explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
105
106 bool hasBranchDivergence(const Function *F = nullptr) const;
107
111
114
116 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
118 }
119
120 unsigned getNumberOfRegisters(unsigned RCID) const;
122 unsigned getMinVectorRegisterBitWidth() const;
123 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
124 unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
125 unsigned ChainSizeInBytes,
126 VectorType *VecTy) const;
127 unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
128 unsigned ChainSizeInBytes,
129 VectorType *VecTy) const;
130 unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
131
132 bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
133 unsigned AddrSpace) const;
134 bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
135 unsigned AddrSpace) const;
136 bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
137 unsigned AddrSpace) const;
138
141 LLVMContext & Context, Value * Length, unsigned SrcAddrSpace,
142 unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
143 std::optional<uint32_t> AtomicElementSize) const;
144
146 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
147 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
148 unsigned SrcAlign, unsigned DestAlign,
149 std::optional<uint32_t> AtomicCpySize) const;
151
153
155 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
157 TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
158 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
159 const Instruction *CxtI = nullptr);
160
161 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
162 const Instruction *I = nullptr);
163
164 bool isInlineAsmSourceOfDivergence(const CallInst *CI,
165 ArrayRef<unsigned> Indices = {}) const;
166
168 InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
170 unsigned Index, Value *Op0, Value *Op1);
171
172 bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
173 bool isSourceOfDivergence(const Value *V) const;
174 bool isAlwaysUniform(const Value *V) const;
175
176 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
177 if (ToAS == AMDGPUAS::FLAT_ADDRESS) {
178 switch (FromAS) {
184 return true;
185 default:
186 break;
187 }
188 return false;
189 }
190 if ((FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
192 (FromAS == AMDGPUAS::CONSTANT_ADDRESS &&
194 return true;
195 return false;
196 }
197
198 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
199 return AMDGPU::addrspacesMayAlias(AS0, AS1);
200 }
201
202 unsigned getFlatAddressSpace() const {
203 // Don't bother running InferAddressSpaces pass on graphics shaders which
204 // don't use flat addressing.
205 if (IsGraphics)
206 return -1;
208 }
209
211 Intrinsic::ID IID) const;
212
216 }
217
219 Value *NewV) const;
220
221 bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
222 const Value *Op1, InstCombiner &IC) const;
223 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
224 IntrinsicInst &II) const;
225 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
226 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
227 APInt &UndefElts2, APInt &UndefElts3,
228 std::function<void(Instruction *, unsigned, APInt, APInt &)>
229 SimplifyAndSetOp) const;
230
232
234 ArrayRef<int> Mask,
236 VectorType *SubTp,
237 ArrayRef<const Value *> Args = std::nullopt);
238
239 bool areInlineCompatible(const Function *Caller,
240 const Function *Callee) const;
241
242 unsigned getInliningThresholdMultiplier() const { return 11; }
243 unsigned adjustInliningThreshold(const CallBase *CB) const;
244 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;
245
246 int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; }
247
249 unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
251
255 FastMathFlags FMF,
257
258 /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
259 unsigned getCacheLineSize() const override { return 128; }
260
261 /// How much before a load we should place the prefetch instruction.
262 /// This is currently measured in number of IR instructions.
263 unsigned getPrefetchDistance() const override;
264
265 /// \return if target want to issue a prefetch in address space \p AS.
266 bool shouldPrefetchAddressSpace(unsigned AS) const override;
267};
268
269} // end namespace llvm
270
271#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
This file provides a helper that implements much of the TTI interface in terms of the target-independent code generator and TargetLowering interfaces.
RelocType Type
Definition: COFFYAML.cpp:391
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Machine InstCombiner
const char LLVMTargetMachineRef TM
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
Class for arbitrary precision integers.
Definition: APInt.h:76
an instruction to allocate memory on the stack
Definition: Instructions.h:59
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:80
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1461
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isAlwaysUniform(const Value *V) const
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
unsigned getNumberOfRegisters(unsigned RCID) const
bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getCacheLineSize() const override
Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getMaxInterleaveFactor(ElementCount VF)
unsigned getInliningThresholdMultiplier() const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
unsigned getFlatAddressSpace() const
int getInlinerVectorBonusPercent() const
InstructionCost getVectorSplitCost()
unsigned getMinVectorRegisterBitWidth() const
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicCpySize) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
unsigned adjustInliningThreshold(const CallBase *CB) const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
bool isSourceOfDivergence(const Value *V) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, const Value *Op1, InstCombiner &IC) const
bool hasBranchDivergence(const Function *F=nullptr) const
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicElementSize) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
The core instruction combiner logic.
Definition: InstCombiner.h:47
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
The optimization diagnostic interface.
The main scalar evolution driver.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
TargetSubtargetInfo - Generic base class for all target subtargets.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
TargetCostKind
The kind of cost model.
@ TCK_CodeSize
Instruction code size.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM Value Representation.
Definition: Value.h:74
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static bool addrspacesMayAlias(unsigned AS1, unsigned AS2)
Definition: AMDGPU.h:428
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Length
Definition: DWP.cpp:456
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Information about a load/store intrinsic defined by the target.
Parameters that control the generic loop unrolling transformation.