#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
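// Note on the idiom above: the GET_SUBTARGETINFO_* macros select which
// sections of the TableGen-generated AMDGPUGenSubtargetInfo.inc are
// expanded into this translation unit, and the temporary #define maps
// the generated AMDGPUSubtarget symbols onto GCNSubtarget while the
// .inc file is included.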
45 "amdgpu-enable-power-sched",
46 cl::desc(
"Enable scheduling to minimize mAI power bursts"),
50 "amdgpu-vgpr-index-mode",
51 cl::desc(
"Use GPR indexing mode instead of movrel for vector indexing"),
55 cl::desc(
"Enable the use of AA during codegen."),
59 cl::desc(
"Number of addresses from which to enable MIMG NSA."),
  FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,";

  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);

    ToggleFeature(AMDGPU::FeatureFlatForGlobal);

      !getFeatureBits().test(AMDGPU::FeatureCuMode))

      InstrItins(getInstrItineraryForCPU(GPU)),
      InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHR_I64_e64:
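  // The cases above belong to getConstantBusLimit(unsigned Opcode): a
  // hedged reading of the (elided) surrounding switch is that 64-bit
  // shifts stay limited to a single constant-bus operand even on targets
  // where other VALU instructions may use more than one.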
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:

  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:

  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
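  // These are the opcodes zeroesHigh16BitsOfDest() reports as writing
  // zeros to the high 16 bits of their 32-bit destination register, so
  // consumers of the low half need no explicit zero-extension.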
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
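  // Worked example (illustrative numbers): with WorkGroupSize = 256 and
  // WaveSize = 64, WavesPerWorkgroup = (256 + 63) / 64 = 4. If NWaves = 8
  // and getEUsPerCU() = 4, then WorkGroupsPerCU = (8 * 4) / 4 = 8.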
  if (!MaxWorkGroupsPerCu)

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

         "computed invalid occupancy");
std::pair<unsigned, unsigned>

  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  if (Requested.first > Requested.second)
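  // The attribute parsed above is a "min,max" string pair; for example
  // (illustrative IR):
  //   attributes #0 = { "amdgpu-flat-work-group-size"="64,256" }
  // yields Requested = {64, 256}. A pair with min > max is rejected in
  // favor of Default.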
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {

  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  if (Requested.second && Requested.first > Requested.second)

  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {

  std::pair<unsigned, unsigned> Requested =
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
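// For reference, the metadata consumed above looks like this in IR
// (illustrative OpenCL-style kernel):
//   define amdgpu_kernel void @k(...) !reqd_work_group_size !0 { ... }
//   !0 = !{i32 64, i32 1, i32 1}
// With Dim == 0 the extract returns 64; a missing or malformed node
// means "unknown" and the function returns UINT_MAX.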
                                       unsigned Dimension) const {
  if (ReqdSize != std::numeric_limits<unsigned>::max())

  for (int I = 0; I < 3; ++I) {

  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;

  bool IdQuery = false;
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();

    unsigned Dim = UINT_MAX;
    switch (F->getIntrinsicID()) {
    case Intrinsic::amdgcn_workitem_id_x:
    case Intrinsic::r600_read_tidig_x:

    case Intrinsic::r600_read_local_size_x:

    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::r600_read_tidig_y:

    case Intrinsic::r600_read_local_size_y:

    case Intrinsic::amdgcn_workitem_id_z:
    case Intrinsic::r600_read_tidig_z:

    case Intrinsic::r600_read_local_size_z:

    if (ReqdSize != std::numeric_limits<unsigned>::max())
      MinSize = MaxSize = ReqdSize;
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
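  // A minimal sketch of how the range node attached above can be built,
  // assuming 32-bit IDs and a 1024-lane upper bound (names and bounds
  // illustrative, not from this file):
  //   MDBuilder MDB(I->getContext());
  //   MDNode *MaxWorkGroupSizeRange =
  //       MDB.createRange(APInt(32, 0), APInt(32, 1024)); // half-open [0, 1024)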
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))

  const Module *M = F.getParent();

  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",

                                                 Align &MaxAlign) const {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);

  return ExplicitArgBytes;
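// Worked example of the running layout above (illustrative arguments):
// for a kernel taking (i32, double),
//   i32:    alignTo(0, 4) + 4 = 4,  MaxAlign = 4
//   double: alignTo(4, 8) + 8 = 16, MaxAlign = 8
// so getExplicitKernArgSize() returns 16 with MaxAlign == 8.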
                                                Align &MaxAlign) const {

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;

  if (ImplicitBytes != 0) {

    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);

                                          unsigned NumRegionInstrs) const {

                                                 unsigned NumVGPRs) const {
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {

  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  if (F.hasFnAttribute("amdgpu-num-sgpr")) {

        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    if (Requested && (Requested <= ReservedNumSGPRs))

    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))

    if (WavesPerEU.second &&

      MaxNumSGPRs = Requested;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  const unsigned MaxSystemSGPRs = 1 +

  const unsigned SyntheticSGPRs = 1;

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
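  // Rough tally for the sum above, assuming the usual field widths (the
  // private segment buffer occupies 4 SGPRs, each pointer-sized field 2):
  //   MaxUserSGPRs = 4 + 2 + 2 + 2 + 2 + 2 + 2 = 16
  // plus the system and synthetic SGPRs counted afterwards.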
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {

  if (F.hasFnAttribute("amdgpu-num-vgpr")) {

        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (WavesPerEU.second &&

      MaxNumVGPRs = Requested;

                                         int UseOpIdx, SDep &Dep) const {

      !Def->isInstr() || !Use->isInstr())
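  // As read from the fragments below: the first loop scans the def bundle
  // for the bundled instruction that actually modifies Reg, and the second
  // walks the use bundle, consuming the remaining latency as it goes.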
  for (++I; I != E && I->isBundledWithPred(); ++I) {
    if (I->modifiesRegister(Reg, TRI))

  for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
    if (I->readsRegister(Reg, TRI))

      DefI, DefOpIdx, UseI, UseOpIdx));
  bool isSALU(const SUnit *SU) const {
    return MI && TII->isSALU(*MI) && !MI->isTerminator();

  bool isVALU(const SUnit *SU) const {
    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)

        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&

        if (Succ != SU && isSALU(Succ))
          Worklist.push_back(Succ);
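      // Hedged sketch of the edge this chain-walk exists to create (the
      // exact call site is elided in the fragments above): an artificial
      // DAG edge pins the SALU instruction into the MFMA's shadow,
      //   if (DAG->canAddEdge(SUv, SU))
      //     DAG->addEdge(SUv, SDep(SU, SDep::Artificial));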
    if (!ST.hasMAIInsts())

    if (!TSchedModel || DAG->SUnits.empty())

    auto LastSALU = DAG->SUnits.begin();

      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))

        if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));

std::unique_ptr<ScheduleDAGMutation>

  return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)

      "amdgpu-nsa-threshold", -1);

    return std::max(Value, 2);
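// The clamp above rounds any explicitly requested amdgpu-nsa-threshold
// up to 2; a plausible reading is that NSA encodes a list of addresses,
// so a threshold below two addresses would be meaningless.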
  const bool IsKernel =

  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");

  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))

  if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))

  if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))

      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;

  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;