LLVM 19.0.0git
AMDGPUSubtarget.cpp
Go to the documentation of this file.
1//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Implements the AMDGPU specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUSubtarget.h"
15#include "AMDGPUCallLowering.h"
17#include "AMDGPULegalizerInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
21#include "R600Subtarget.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include "llvm/IR/IntrinsicsR600.h"
30#include "llvm/IR/MDBuilder.h"
32#include <algorithm>
33
34using namespace llvm;
35
36#define DEBUG_TYPE "amdgpu-subtarget"
37
38#define GET_SUBTARGETINFO_TARGET_DESC
39#define GET_SUBTARGETINFO_CTOR
40#define AMDGPUSubtarget GCNSubtarget
41#include "AMDGPUGenSubtargetInfo.inc"
42#undef AMDGPUSubtarget
43
45 "amdgpu-enable-power-sched",
46 cl::desc("Enable scheduling to minimize mAI power bursts"),
47 cl::init(false));
48
50 "amdgpu-vgpr-index-mode",
51 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
52 cl::init(false));
53
54static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
55 cl::desc("Enable the use of AA during codegen."),
56 cl::init(true));
57
58static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
59 cl::desc("Number of addresses from which to enable MIMG NSA."),
61
// GCNSubtarget::initializeSubtargetDependencies — builds the full feature
// string (baseline defaults + HSA-required features + the user-supplied FS),
// parses it, then patches up inconsistent or unset state (FlatForGlobal,
// LDS size, bank count, wavefront size) before returning *this so it can be
// used inside member initializers of the GCNSubtarget constructor.
// NOTE(review): this capture is a documentation scrape; the signature line and
// several interior lines (scraped lines 64-65, 102-104, 108, 131, 141, 145,
// 153, 155-158) are missing — restore from the canonical source before
// compiling.
63
66 StringRef GPU, StringRef FS) {
67 // Determine default and user-specified characteristics
68 //
69 // We want to be able to turn these off, but making this a subtarget feature
70 // for SI has the unhelpful behavior that it unsets everything else if you
71 // disable it.
72 //
73 // Similarly we want enable-prt-strict-null to be on by default and not to
74 // unset everything else if it is disabled
75
76 SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
77
78 // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
79 if (isAmdHsaOS())
80 FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
81
82 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
83
84 // Disable mutually exclusive bits.
// Only one wavefront-size feature may be active; turn off the ones the user
// did not explicitly mention.
85 if (FS.contains_insensitive("+wavefrontsize")) {
86 if (!FS.contains_insensitive("wavefrontsize16"))
87 FullFS += "-wavefrontsize16,";
88 if (!FS.contains_insensitive("wavefrontsize32"))
89 FullFS += "-wavefrontsize32,";
90 if (!FS.contains_insensitive("wavefrontsize64"))
91 FullFS += "-wavefrontsize64,";
92 }
93
// User features come last so they override the defaults assembled above.
94 FullFS += FS;
95
96 ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
97
98 // Implement the "generic" processors, which acts as the default when no
99 // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
100 // the first amdgcn target that supports flat addressing. Other OSes defaults
101 // to the first amdgcn target.
105 }
106
107 // We don't support FP64 for EG/NI atm.
109
110 // Targets must either support 64-bit offsets for MUBUF instructions, and/or
111 // support flat operations, otherwise they cannot access a 64-bit global
112 // address space
113 assert(hasAddr64() || hasFlat());
114 // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
115 // that do not support ADDR64 variants of MUBUF instructions. Such targets
116 // cannot use a 64 bit offset with a MUBUF instruction to access the global
117 // address space
118 if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
119 ToggleFeature(AMDGPU::FeatureFlatForGlobal);
120 FlatForGlobal = true;
121 }
122 // Unless +-flat-for-global is specified, use MUBUF instructions for global
123 // address space access if flat operations are not available.
124 if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
125 ToggleFeature(AMDGPU::FeatureFlatForGlobal);
126 FlatForGlobal = false;
127 }
128
129 // Set defaults if needed.
130 if (MaxPrivateElementSize == 0)
132
133 if (LDSBankCount == 0)
134 LDSBankCount = 32;
135
136 if (TT.getArch() == Triple::amdgcn) {
137 if (LocalMemorySize == 0)
138 LocalMemorySize = 32768;
139
140 // Do something sensible for unspecified target.
142 HasMovrel = true;
143 }
144
146
// In WGP (non-CU) mode on GFX10+, a workgroup can address the LDS of both
// halves of the WGP, doubling the usable local memory.
147 if (AMDGPU::isGFX10Plus(*this) &&
148 !getFeatureBits().test(AMDGPU::FeatureCuMode))
149 LocalMemorySize *= 2;
150
151 // Don't crash on invalid devices.
152 if (WavefrontSizeLog2 == 0)
154
157
159
160 LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
161 << TargetID.getXnackSetting() << '\n');
162 LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
163 << TargetID.getSramEccSetting() << '\n');
164
165 return *this;
166}
167
// Base-class constructor: only records the target triple; all feature and
// register-file initialization happens in the derived subtargets.
168AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}
169
172}
173
175 const GCNTargetMachine &TM)
176 : // clang-format off
177 AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
178 AMDGPUSubtarget(TT),
179 TargetTriple(TT),
180 TargetID(*this),
181 InstrItins(getInstrItineraryForCPU(GPU)),
182 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
183 TLInfo(TM, *this),
184 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
185 // clang-format on
188 CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
189 InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
190 Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
191 RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
192 InstSelector.reset(new AMDGPUInstructionSelector(
193 *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
194}
195
196unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
197 if (getGeneration() < GFX10)
198 return 1;
199
200 switch (Opcode) {
201 case AMDGPU::V_LSHLREV_B64_e64:
202 case AMDGPU::V_LSHLREV_B64_gfx10:
203 case AMDGPU::V_LSHLREV_B64_e64_gfx11:
204 case AMDGPU::V_LSHLREV_B64_e32_gfx12:
205 case AMDGPU::V_LSHLREV_B64_e64_gfx12:
206 case AMDGPU::V_LSHL_B64_e64:
207 case AMDGPU::V_LSHRREV_B64_e64:
208 case AMDGPU::V_LSHRREV_B64_gfx10:
209 case AMDGPU::V_LSHRREV_B64_e64_gfx11:
210 case AMDGPU::V_LSHRREV_B64_e64_gfx12:
211 case AMDGPU::V_LSHR_B64_e64:
212 case AMDGPU::V_ASHRREV_I64_e64:
213 case AMDGPU::V_ASHRREV_I64_gfx10:
214 case AMDGPU::V_ASHRREV_I64_e64_gfx11:
215 case AMDGPU::V_ASHRREV_I64_e64_gfx12:
216 case AMDGPU::V_ASHR_I64_e64:
217 return 1;
218 }
219
220 return 2;
221}
222
223/// This list was mostly derived from experimentation.
// Returns whether \p Opcode writes zeros into the unused high 16 bits of its
// 16-bit destination (as opposed to preserving them).
// NOTE(review): this scrape is missing scraped lines 302 and 314 — the return
// statements for the two case groups below — so as written every case falls
// through to 'default: return false'. Restore from the canonical source.
224bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
225 switch (Opcode) {
226 case AMDGPU::V_CVT_F16_F32_e32:
227 case AMDGPU::V_CVT_F16_F32_e64:
228 case AMDGPU::V_CVT_F16_U16_e32:
229 case AMDGPU::V_CVT_F16_U16_e64:
230 case AMDGPU::V_CVT_F16_I16_e32:
231 case AMDGPU::V_CVT_F16_I16_e64:
232 case AMDGPU::V_RCP_F16_e64:
233 case AMDGPU::V_RCP_F16_e32:
234 case AMDGPU::V_RSQ_F16_e64:
235 case AMDGPU::V_RSQ_F16_e32:
236 case AMDGPU::V_SQRT_F16_e64:
237 case AMDGPU::V_SQRT_F16_e32:
238 case AMDGPU::V_LOG_F16_e64:
239 case AMDGPU::V_LOG_F16_e32:
240 case AMDGPU::V_EXP_F16_e64:
241 case AMDGPU::V_EXP_F16_e32:
242 case AMDGPU::V_SIN_F16_e64:
243 case AMDGPU::V_SIN_F16_e32:
244 case AMDGPU::V_COS_F16_e64:
245 case AMDGPU::V_COS_F16_e32:
246 case AMDGPU::V_FLOOR_F16_e64:
247 case AMDGPU::V_FLOOR_F16_e32:
248 case AMDGPU::V_CEIL_F16_e64:
249 case AMDGPU::V_CEIL_F16_e32:
250 case AMDGPU::V_TRUNC_F16_e64:
251 case AMDGPU::V_TRUNC_F16_e32:
252 case AMDGPU::V_RNDNE_F16_e64:
253 case AMDGPU::V_RNDNE_F16_e32:
254 case AMDGPU::V_FRACT_F16_e64:
255 case AMDGPU::V_FRACT_F16_e32:
256 case AMDGPU::V_FREXP_MANT_F16_e64:
257 case AMDGPU::V_FREXP_MANT_F16_e32:
258 case AMDGPU::V_FREXP_EXP_I16_F16_e64:
259 case AMDGPU::V_FREXP_EXP_I16_F16_e32:
260 case AMDGPU::V_LDEXP_F16_e64:
261 case AMDGPU::V_LDEXP_F16_e32:
262 case AMDGPU::V_LSHLREV_B16_e64:
263 case AMDGPU::V_LSHLREV_B16_e32:
264 case AMDGPU::V_LSHRREV_B16_e64:
265 case AMDGPU::V_LSHRREV_B16_e32:
266 case AMDGPU::V_ASHRREV_I16_e64:
267 case AMDGPU::V_ASHRREV_I16_e32:
268 case AMDGPU::V_ADD_U16_e64:
269 case AMDGPU::V_ADD_U16_e32:
270 case AMDGPU::V_SUB_U16_e64:
271 case AMDGPU::V_SUB_U16_e32:
272 case AMDGPU::V_SUBREV_U16_e64:
273 case AMDGPU::V_SUBREV_U16_e32:
274 case AMDGPU::V_MUL_LO_U16_e64:
275 case AMDGPU::V_MUL_LO_U16_e32:
276 case AMDGPU::V_ADD_F16_e64:
277 case AMDGPU::V_ADD_F16_e32:
278 case AMDGPU::V_SUB_F16_e64:
279 case AMDGPU::V_SUB_F16_e32:
280 case AMDGPU::V_SUBREV_F16_e64:
281 case AMDGPU::V_SUBREV_F16_e32:
282 case AMDGPU::V_MUL_F16_e64:
283 case AMDGPU::V_MUL_F16_e32:
284 case AMDGPU::V_MAX_F16_e64:
285 case AMDGPU::V_MAX_F16_e32:
286 case AMDGPU::V_MIN_F16_e64:
287 case AMDGPU::V_MIN_F16_e32:
288 case AMDGPU::V_MAX_U16_e64:
289 case AMDGPU::V_MAX_U16_e32:
290 case AMDGPU::V_MIN_U16_e64:
291 case AMDGPU::V_MIN_U16_e32:
292 case AMDGPU::V_MAX_I16_e64:
293 case AMDGPU::V_MAX_I16_e32:
294 case AMDGPU::V_MIN_I16_e64:
295 case AMDGPU::V_MIN_I16_e32:
296 case AMDGPU::V_MAD_F16_e64:
297 case AMDGPU::V_MAD_U16_e64:
298 case AMDGPU::V_MAD_I16_e64:
299 case AMDGPU::V_FMA_F16_e64:
300 case AMDGPU::V_DIV_FIXUP_F16_e64:
301 // On gfx10, all 16-bit instructions preserve the high bits.
// NOTE(review): scraped line 302 (the return for the case group above) is
// missing from this capture.
303 case AMDGPU::V_MADAK_F16:
304 case AMDGPU::V_MADMK_F16:
305 case AMDGPU::V_MAC_F16_e64:
306 case AMDGPU::V_MAC_F16_e32:
307 case AMDGPU::V_FMAMK_F16:
308 case AMDGPU::V_FMAAK_F16:
309 case AMDGPU::V_FMAC_F16_e64:
310 case AMDGPU::V_FMAC_F16_e32:
311 // In gfx9, the preferred handling of the unused high 16-bits changed. Most
312 // instructions maintain the legacy behavior of 0ing. Some instructions
313 // changed to preserving the high bits.
// NOTE(review): scraped line 314 (the return for the case group above) is
// missing from this capture.
315 case AMDGPU::V_MAD_MIXLO_F16:
316 case AMDGPU::V_MAD_MIXHI_F16:
317 default:
318 return false;
319 }
320}
321
322// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
323// allows the given function to achieve an occupancy of NWaves waves per
324// SIMD / EU, taking into account only the function's *maximum* workgroup size.
325unsigned
// NOTE(review): scraped line 326 — carrying the function name and its first
// parameter (NWaves) — is missing from this capture; confirm against the
// canonical source.
327 const Function &F) const {
// Waves needed to cover the function's largest flat workgroup (ceiling
// division, at least 1).
328 const unsigned WaveSize = getWavefrontSize();
329 const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
330 const unsigned WavesPerWorkgroup =
331 std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
332
// Workgroups that can coexist on a CU if every EU is to run NWaves waves.
333 const unsigned WorkGroupsPerCU =
334 std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
335
// Divide the CU's local memory evenly across those workgroups.
336 return getLocalMemorySize() / WorkGroupsPerCU;
337}
338
339// FIXME: Should return min,max range.
340//
341// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
342// be achieved when only the given function is running on the machine; and
343// taking into account the overall number of wave slots, the (maximum) workgroup
344// size, and the per-workgroup LDS allocation size.
// NOTE(review): scraped line 345 — the signature start, i.e.
// 'unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,' per
// the Doxygen index — is missing from this capture.
346 const Function &F) const {
347 const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
348 const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
349 if (!MaxWorkGroupsPerCu)
350 return 0;
351
352 const unsigned WaveSize = getWavefrontSize();
353
354 // FIXME: Do we need to account for alignment requirement of LDS rounding the
355 // size up?
356 // Compute restriction based on LDS usage
// Guard against Bytes == 0 to avoid a divide-by-zero.
357 unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
358
359 // This can be queried with more LDS than is possible, so just assume the
360 // worst.
361 if (NumGroups == 0)
362 return 1;
363
364 NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
365
366 // Round to the number of waves per CU.
367 const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
368 unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
369
370 // Number of waves per EU (SIMD).
371 MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
372
373 // Clamp to the maximum possible number of waves.
374 MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
375
376 // FIXME: Needs to be a multiple of the group size?
377 //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
378
379 assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
380 "computed invalid occupancy");
381 return MaxWaves;
382}
383
384unsigned
386 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
387 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
388}
389
390std::pair<unsigned, unsigned>
392 switch (CC) {
399 return std::pair(1, getWavefrontSize());
400 default:
401 return std::pair(1u, getMaxFlatWorkGroupSize());
402 }
403}
404
405std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
406 const Function &F) const {
407 // Default minimum/maximum flat work group sizes.
408 std::pair<unsigned, unsigned> Default =
409 getDefaultFlatWorkGroupSize(F.getCallingConv());
410
411 // Requested minimum/maximum flat work group sizes.
412 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
413 F, "amdgpu-flat-work-group-size", Default);
414
415 // Make sure requested minimum is less than requested maximum.
416 if (Requested.first > Requested.second)
417 return Default;
418
419 // Make sure requested values do not violate subtarget's specifications.
420 if (Requested.first < getMinFlatWorkGroupSize())
421 return Default;
422 if (Requested.second > getMaxFlatWorkGroupSize())
423 return Default;
424
425 return Requested;
426}
427
428std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
429 std::pair<unsigned, unsigned> Requested,
430 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
431 // Default minimum/maximum number of waves per execution unit.
432 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
433
434 // If minimum/maximum flat work group sizes were explicitly requested using
435 // "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum
436 // number of waves per execution unit to values implied by requested
437 // minimum/maximum flat work group sizes.
438 unsigned MinImpliedByFlatWorkGroupSize =
439 getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
440 Default.first = MinImpliedByFlatWorkGroupSize;
441
442 // Make sure requested minimum is less than requested maximum.
443 if (Requested.second && Requested.first > Requested.second)
444 return Default;
445
446 // Make sure requested values do not violate subtarget's specifications.
447 if (Requested.first < getMinWavesPerEU() ||
448 Requested.second > getMaxWavesPerEU())
449 return Default;
450
451 // Make sure requested values are compatible with values implied by requested
452 // minimum/maximum flat work group sizes.
453 if (Requested.first < MinImpliedByFlatWorkGroupSize)
454 return Default;
455
456 return Requested;
457}
458
459std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
460 const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
461 // Default minimum/maximum number of waves per execution unit.
462 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
463
464 // Requested minimum/maximum number of waves per execution unit.
465 std::pair<unsigned, unsigned> Requested =
466 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
467 return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
468}
469
470static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
471 auto Node = Kernel.getMetadata("reqd_work_group_size");
472 if (Node && Node->getNumOperands() == 3)
473 return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
474 return std::numeric_limits<unsigned>::max();
475}
476
478 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
479}
480
482 unsigned Dimension) const {
483 unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
484 if (ReqdSize != std::numeric_limits<unsigned>::max())
485 return ReqdSize - 1;
486 return getFlatWorkGroupSizes(Kernel).second - 1;
487}
488
490 for (int I = 0; I < 3; ++I) {
491 if (getMaxWorkitemID(Func, I) > 0)
492 return false;
493 }
494
495 return true;
496}
497
// Attaches !range metadata to a workitem-id / local-size intrinsic call,
// bounding its result by the kernel's (required or maximum) workgroup size.
// Returns true if metadata was attached.
// NOTE(review): scraped line 498 — the signature, presumably
// 'bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {' — is
// missing from this capture.
499 Function *Kernel = I->getParent()->getParent();
500 unsigned MinSize = 0;
501 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
502 bool IdQuery = false;
503
504 // If reqd_work_group_size is present it narrows value down.
505 if (auto *CI = dyn_cast<CallInst>(I)) {
506 const Function *F = CI->getCalledFunction();
507 if (F) {
// Map the intrinsic to the dimension it queries; ID queries (workitem id)
// are distinguished from size queries (local size) for the bound below.
508 unsigned Dim = UINT_MAX;
509 switch (F->getIntrinsicID()) {
510 case Intrinsic::amdgcn_workitem_id_x:
511 case Intrinsic::r600_read_tidig_x:
512 IdQuery = true;
513 [[fallthrough]];
514 case Intrinsic::r600_read_local_size_x:
515 Dim = 0;
516 break;
517 case Intrinsic::amdgcn_workitem_id_y:
518 case Intrinsic::r600_read_tidig_y:
519 IdQuery = true;
520 [[fallthrough]];
521 case Intrinsic::r600_read_local_size_y:
522 Dim = 1;
523 break;
524 case Intrinsic::amdgcn_workitem_id_z:
525 case Intrinsic::r600_read_tidig_z:
526 IdQuery = true;
527 [[fallthrough]];
528 case Intrinsic::r600_read_local_size_z:
529 Dim = 2;
530 break;
531 default:
532 break;
533 }
534
// A required workgroup size pins the range to a single value.
535 if (Dim <= 3) {
536 unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
537 if (ReqdSize != std::numeric_limits<unsigned>::max())
538 MinSize = MaxSize = ReqdSize;
539 }
540 }
541 }
542
543 if (!MaxSize)
544 return false;
545
546 // Range metadata is [Lo, Hi). For ID query we need to pass max size
547 // as Hi. For size query we need to pass Hi + 1.
548 if (IdQuery)
549 MinSize = 0;
550 else
551 ++MaxSize;
552
553 MDBuilder MDB(I->getContext());
554 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
555 APInt(32, MaxSize));
556 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
557 return true;
558}
559
561 assert(AMDGPU::isKernel(F.getCallingConv()));
562
563 // We don't allocate the segment if we know the implicit arguments weren't
564 // used, even if the ABI implies we need them.
565 if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
566 return 0;
567
568 if (isMesaKernel(F))
569 return 16;
570
571 // Assume all implicit inputs are used by default
572 const Module *M = F.getParent();
573 unsigned NBytes =
575 return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
576 NBytes);
577}
578
// Computes the total byte size of the kernel's explicit arguments, laying them
// out sequentially with each argument's ABI (or explicit) alignment; also
// reports the largest alignment seen via the MaxAlign out-parameter.
// NOTE(review): scraped line 579 — the signature start, presumably
// 'uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,' — is
// missing from this capture.
580 Align &MaxAlign) const {
581 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
582 F.getCallingConv() == CallingConv::SPIR_KERNEL);
583
584 const DataLayout &DL = F.getParent()->getDataLayout();
585 uint64_t ExplicitArgBytes = 0;
586 MaxAlign = Align(1);
587
588 for (const Argument &Arg : F.args()) {
// byref arguments are laid out using the pointee type, not the pointer.
589 const bool IsByRef = Arg.hasByRefAttr();
590 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
591 Align Alignment = DL.getValueOrABITypeAlignment(
592 IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
593 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
// Pad up to this argument's alignment, then append its storage.
594 ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
595 MaxAlign = std::max(MaxAlign, Alignment);
596 }
597
598 return ExplicitArgBytes;
599}
600
// Computes the size of the whole kernarg segment: explicit-argument offset +
// explicit argument bytes, plus the aligned implicit-argument area when one is
// needed; the result is rounded up to 4 bytes. Non-kernels get 0.
// NOTE(review): scraped line 601 — the signature start, presumably
// 'unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,' — is
// missing from this capture.
602 Align &MaxAlign) const {
603 if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
604 F.getCallingConv() != CallingConv::SPIR_KERNEL)
605 return 0;
606
607 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
608
609 unsigned ExplicitOffset = getExplicitKernelArgOffset();
610
611 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
612 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
613 if (ImplicitBytes != 0) {
// The implicit area starts after the explicit arguments, aligned for the
// implicit-argument pointer.
614 const Align Alignment = getAlignmentForImplicitArgPtr();
615 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
616 MaxAlign = std::max(MaxAlign, Alignment);
617 }
618
619 // Being able to dereference past the end is useful for emitting scalar loads.
620 return alignTo(TotalSize, 4);
621}
622
626}
627
// Tunes the generic machine scheduler's policy for this subtarget: always
// track register pressure, schedule both top-down and bottom-up, and track
// lane masks except under the SI scheduler.
// NOTE(review): scraped line 628 — the signature start, presumably
// 'void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,' — is
// missing from this capture.
629 unsigned NumRegionInstrs) const {
630 // Track register pressure so the scheduler can try to decrease
631 // pressure once register usage is above the threshold defined by
632 // SIRegisterInfo::getRegPressureSetLimit()
633 Policy.ShouldTrackPressure = true;
634
635 // Enabling both top down and bottom up scheduling seems to give us less
636 // register spills than just using one of these approaches on its own.
637 Policy.OnlyTopDown = false;
638 Policy.OnlyBottomUp = false;
639
640 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
641 if (!enableSIScheduler())
642 Policy.ShouldTrackLaneMasks = true;
643}
644
646 if (isWave32()) {
647 // Fix implicit $vcc operands after MIParser has verified that they match
648 // the instruction definitions.
649 for (auto &MBB : MF) {
650 for (auto &MI : MBB)
651 InstrInfo.fixImplicitOperands(MI);
652 }
653 }
654}
655
657 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
658}
659
662}
663
// Whether codegen may consult alias analysis; gated by the
// -amdgpu-use-aa-in-codegen command-line flag (default true).
664bool GCNSubtarget::useAA() const { return UseAA; }
665
666unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
668 return getMaxWavesPerEU();
669
671 if (SGPRs <= 80)
672 return 10;
673 if (SGPRs <= 88)
674 return 9;
675 if (SGPRs <= 100)
676 return 8;
677 return 7;
678 }
679 if (SGPRs <= 48)
680 return 10;
681 if (SGPRs <= 56)
682 return 9;
683 if (SGPRs <= 64)
684 return 8;
685 if (SGPRs <= 72)
686 return 7;
687 if (SGPRs <= 80)
688 return 6;
689 return 5;
690}
691
692unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
694}
695
// Number of SGPRs reserved for special registers (VCC, and — depending on the
// subtarget — FLAT_SCRATCH and XNACK).
// NOTE(review): scraped lines 698, 702 and 704 — the conditions guarding the
// three returns below — are missing from this capture; restore from the
// canonical source before compiling.
696unsigned
697GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
699 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
700
701 if (HasFlatScratch || HasArchitectedFlatScratch) {
703 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
705 return 4; // FLAT_SCRATCH, VCC (in that order).
706 }
707
708 if (isXNACKEnabled())
709 return 4; // XNACK, VCC (in that order).
710 return 2; // VCC.
711}
712
716}
717
719 // In principle we do not need to reserve SGPR pair used for flat_scratch if
720 // we know flat instructions do not access the stack anywhere in the
721 // program. For now assume it's needed if we have flat instructions.
722 const bool KernelUsesFlatScratch = hasFlatAddressSpace();
723 return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
724}
725
726unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
727 unsigned NumSGPRs,
728 unsigned NumVGPRs) const {
729 unsigned Occupancy =
730 std::min(getMaxWavesPerEU(),
732 if (NumSGPRs)
733 Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
734 if (NumVGPRs)
735 Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
736 return Occupancy;
737}
738
740 const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
741 unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
742 // Compute maximum number of SGPRs function can use using default/requested
743 // minimum number of waves per execution unit.
744 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
745 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
746
747 // Check if maximum number of SGPRs was explicitly requested using
748 // "amdgpu-num-sgpr" attribute.
749 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
750 unsigned Requested =
751 F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);
752
753 // Make sure requested value does not violate subtarget's specifications.
754 if (Requested && (Requested <= ReservedNumSGPRs))
755 Requested = 0;
756
757 // If more SGPRs are required to support the input user/system SGPRs,
758 // increase to accommodate them.
759 //
760 // FIXME: This really ends up using the requested number of SGPRs + number
761 // of reserved special registers in total. Theoretically you could re-use
762 // the last input registers for these special registers, but this would
763 // require a lot of complexity to deal with the weird aliasing.
764 unsigned InputNumSGPRs = PreloadedSGPRs;
765 if (Requested && Requested < InputNumSGPRs)
766 Requested = InputNumSGPRs;
767
768 // Make sure requested value is compatible with values implied by
769 // default/requested minimum/maximum number of waves per execution unit.
770 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
771 Requested = 0;
772 if (WavesPerEU.second &&
773 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
774 Requested = 0;
775
776 if (Requested)
777 MaxNumSGPRs = Requested;
778 }
779
780 if (hasSGPRInitBug())
782
783 return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
784}
785
787 const Function &F = MF.getFunction();
791}
792
793static unsigned getMaxNumPreloadedSGPRs() {
794 using USI = GCNUserSGPRUsageInfo;
795 // Max number of user SGPRs
796 const unsigned MaxUserSGPRs =
797 USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
798 USI::getNumUserSGPRForField(USI::DispatchPtrID) +
799 USI::getNumUserSGPRForField(USI::QueuePtrID) +
800 USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
801 USI::getNumUserSGPRForField(USI::DispatchIdID) +
802 USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
803 USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);
804
805 // Max number of system SGPRs
806 const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
807 1 + // WorkGroupIDY
808 1 + // WorkGroupIDZ
809 1 + // WorkGroupInfo
810 1; // private segment wave byte offset
811
812 // Max number of synthetic SGPRs
813 const unsigned SyntheticSGPRs = 1; // LDSKernelId
814
815 return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
816}
817
821}
822
824 const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
825 // Compute maximum number of VGPRs function can use using default/requested
826 // minimum number of waves per execution unit.
827 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
828
829 // Check if maximum number of VGPRs was explicitly requested using
830 // "amdgpu-num-vgpr" attribute.
831 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
832 unsigned Requested =
833 F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);
834
835 if (hasGFX90AInsts())
836 Requested *= 2;
837
838 // Make sure requested value is compatible with values implied by
839 // default/requested minimum/maximum number of waves per execution unit.
840 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
841 Requested = 0;
842 if (WavesPerEU.second &&
843 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
844 Requested = 0;
845
846 if (Requested)
847 MaxNumVGPRs = Requested;
848 }
849
850 return MaxNumVGPRs;
851}
852
855}
856
858 const Function &F = MF.getFunction();
860 return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
861}
862
864 SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
865 const TargetSchedModel *SchedModel) const {
866 if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
867 !Def->isInstr() || !Use->isInstr())
868 return;
869
870 MachineInstr *DefI = Def->getInstr();
871 MachineInstr *UseI = Use->getInstr();
872
873 if (DefI->isBundle()) {
875 auto Reg = Dep.getReg();
878 unsigned Lat = 0;
879 for (++I; I != E && I->isBundledWithPred(); ++I) {
880 if (I->modifiesRegister(Reg, TRI))
881 Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
882 else if (Lat)
883 --Lat;
884 }
885 Dep.setLatency(Lat);
886 } else if (UseI->isBundle()) {
888 auto Reg = Dep.getReg();
891 unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
892 for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
893 if (I->readsRegister(Reg, TRI))
894 break;
895 --Lat;
896 }
897 Dep.setLatency(Lat);
898 } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
899 // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
900 // implicit operands which come from the MCInstrDesc, which can fool
901 // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
902 // pseudo operands.
904 DefI, DefOpIdx, UseI, UseOpIdx));
905 }
906}
907
908namespace {
909struct FillMFMAShadowMutation : ScheduleDAGMutation {
910 const SIInstrInfo *TII;
911
912 ScheduleDAGMI *DAG;
913
914 FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
915
916 bool isSALU(const SUnit *SU) const {
917 const MachineInstr *MI = SU->getInstr();
918 return MI && TII->isSALU(*MI) && !MI->isTerminator();
919 }
920
921 bool isVALU(const SUnit *SU) const {
922 const MachineInstr *MI = SU->getInstr();
923 return MI && TII->isVALU(*MI);
924 }
925
926 // Link as many SALU instructions in chain as possible. Return the size
927 // of the chain. Links up to MaxChain instructions.
928 unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
929 SmallPtrSetImpl<SUnit *> &Visited) const {
930 SmallVector<SUnit *, 8> Worklist({To});
931 unsigned Linked = 0;
932
933 while (!Worklist.empty() && MaxChain-- > 0) {
934 SUnit *SU = Worklist.pop_back_val();
935 if (!Visited.insert(SU).second)
936 continue;
937
938 LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
939 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
940
941 if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
942 if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
943 ++Linked;
944
945 for (SDep &SI : From->Succs) {
946 SUnit *SUv = SI.getSUnit();
947 if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
948 DAG->canAddEdge(SUv, SU))
949 DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
950 }
951
952 for (SDep &SI : SU->Succs) {
953 SUnit *Succ = SI.getSUnit();
954 if (Succ != SU && isSALU(Succ))
955 Worklist.push_back(Succ);
956 }
957 }
958
959 return Linked;
960 }
961
962 void apply(ScheduleDAGInstrs *DAGInstrs) override {
963 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
964 if (!ST.hasMAIInsts())
965 return;
966 DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
967 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
968 if (!TSchedModel || DAG->SUnits.empty())
969 return;
970
971 // Scan for MFMA long latency instructions and try to add a dependency
972 // of available SALU instructions to give them a chance to fill MFMA
973 // shadow. That is desirable to fill MFMA shadow with SALU instructions
974 // rather than VALU to prevent power consumption bursts and throttle.
975 auto LastSALU = DAG->SUnits.begin();
976 auto E = DAG->SUnits.end();
978 for (SUnit &SU : DAG->SUnits) {
979 MachineInstr &MAI = *SU.getInstr();
980 if (!TII->isMAI(MAI) ||
981 MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
982 MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
983 continue;
984
985 unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
986
987 LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
988 dbgs() << "Need " << Lat
989 << " instructions to cover latency.\n");
990
991 // Find up to Lat independent scalar instructions as early as
992 // possible such that they can be scheduled after this MFMA.
993 for ( ; Lat && LastSALU != E; ++LastSALU) {
994 if (Visited.count(&*LastSALU))
995 continue;
996
997 if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
998 !DAG->canAddEdge(&*LastSALU, &SU))
999 continue;
1000
1001 Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
1002 }
1003 }
1004 }
1005};
1006} // namespace
1007
1009 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1010 Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1011}
1012
1013std::unique_ptr<ScheduleDAGMutation>
1015 return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
1016 : nullptr;
1017}
1018
1021 return 0; // Not MIMG encoding.
1022
1023 if (NSAThreshold.getNumOccurrences() > 0)
1024 return std::max(NSAThreshold.getValue(), 2u);
1025
1027 "amdgpu-nsa-threshold", -1);
1028 if (Value > 0)
1029 return std::max(Value, 2);
1030
1031 return 3;
1032}
1033
1036 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1037 else
1038 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
1039}
1040
1042 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1043 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1044 else
1045 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
1046}
1047
1049 const GCNSubtarget &ST)
1050 : ST(ST) {
1051 const CallingConv::ID CC = F.getCallingConv();
1052 const bool IsKernel =
1054 // FIXME: Should have analysis or something rather than attribute to detect
1055 // calls.
1056 const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
1057 // FIXME: This attribute is a hack, we just need an analysis on the function
1058 // to look for allocas.
1059 const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
1060
1061 if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
1062 KernargSegmentPtr = true;
1063
1064 bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
1065 if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
1066 PrivateSegmentBuffer = true;
1067 else if (ST.isMesaGfxShader(F))
1068 ImplicitBufferPtr = true;
1069
1070 if (!AMDGPU::isGraphics(CC)) {
1071 if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
1072 DispatchPtr = true;
1073
1074 // FIXME: Can this always be disabled with < COv5?
1075 if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
1076 QueuePtr = true;
1077
1078 if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
1079 DispatchID = true;
1080 }
1081
1082 // TODO: This could be refined a lot. The attribute is a poor way of
1083 // detecting calls or stack objects that may require it before argument
1084 // lowering.
1085 if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
1086 (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
1087 (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
1088 !ST.flatScratchIsArchitected()) {
1089 FlatScratchInit = true;
1090 }
1091
1093 NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
1094
1097
1098 if (hasDispatchPtr())
1099 NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
1100
1101 if (hasQueuePtr())
1102 NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);
1103
1105 NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
1106
1107 if (hasDispatchID())
1108 NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);
1109
1110 if (hasFlatScratchInit())
1111 NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
1112}
1113
1115 assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
1116 NumKernargPreloadSGPRs += NumSGPRs;
1117 NumUsedUserSGPRs += NumSGPRs;
1118}
1119
1121 return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
1122}
1123
1126 return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
1127}
@ HasCalls
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< bool > UseAA("aarch64-use-aa", cl::init(true), cl::desc("Enable the use of AA during codegen."))
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the InstructionSelector class for AMDGPU.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
static cl::opt< bool > EnableVGPRIndexMode("amdgpu-vgpr-index-mode", cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false))
static cl::opt< unsigned > NSAThreshold("amdgpu-nsa-threshold", cl::desc("Number of addresses from which to enable MIMG NSA."), cl::init(3), cl::Hidden)
static cl::opt< bool > EnablePowerSched("amdgpu-enable-power-sched", cl::desc("Enable scheduling to minimize mAI power bursts"), cl::init(false))
static unsigned getMaxNumPreloadedSGPRs()
static cl::opt< bool > UseAA("amdgpu-use-aa-in-codegen", cl::desc("Enable the use of AA during codegen."), cl::init(true))
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim)
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
BlockVerifier::State From
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DEBUG(X)
Definition: Debug.h:101
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
This file describes how to lower LLVM inline asm to machine code INLINEASM.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
modulo schedule test
if(VerifyEach)
const char LLVMTargetMachineRef TM
AMDGPU R600 specific subclass of TargetSubtarget.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallString class.
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const
Inverse of getMaxLocalMemWithWaveCount.
AMDGPUSubtarget(const Triple &TT)
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
Align getAlignmentForImplicitArgPtr() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
virtual unsigned getMinWavesPerEU() const =0
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on an workitemid.* intrinsic call or load.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getMaxWavesPerEU() const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
unsigned AddressableLocalMemorySize
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > WavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
void setTargetIDFromFeaturesString(StringRef FS)
TargetIDSetting getXnackSetting() const
TargetIDSetting getSramEccSetting() const
Class for arbitrary precision integers.
Definition: APInt.h:76
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:721
bool hasFlat() const
Definition: GCNSubtarget.h:371
bool useVGPRIndexMode() const
void mirFileLoaded(MachineFunction &MF) const override
unsigned MaxPrivateElementSize
Definition: GCNSubtarget.h:66
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
unsigned getBaseMaxNumVGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU) const
unsigned getConstantBusLimit(unsigned Opcode) const
const InstrItineraryData * getInstrItineraryData() const override
Definition: GCNSubtarget.h:297
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const override
bool hasSGPRInitBug() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:264
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
unsigned getBaseMaxNumSGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU, unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const
GCNSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:260
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasFlatAddressSpace() const
Definition: GCNSubtarget.h:605
unsigned getReservedNumSGPRs(const MachineFunction &MF) const
bool hasMovrel() const
Definition: GCNSubtarget.h:939
bool useAA() const override
bool isWave32() const
bool hasVGPRIndexMode() const
Definition: GCNSubtarget.h:943
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
bool HasArchitectedFlatScratch
Definition: GCNSubtarget.h:196
std::unique_ptr< ScheduleDAGMutation > createFillMFMAShadowMutation(const TargetInstrInfo *TII) const
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
unsigned getMaxWavesPerEU() const
Generation getGeneration() const
Definition: GCNSubtarget.h:303
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM)
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
bool isXNACKEnabled() const
Definition: GCNSubtarget.h:591
unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const
bool enableSIScheduler() const
bool hasAddr64() const
Definition: GCNSubtarget.h:367
bool hasFP64() const
Definition: GCNSubtarget.h:347
void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override
void getPostRAMutations(std::vector< std::unique_ptr< ScheduleDAGMutation > > &Mutations) const override
~GCNSubtarget() override
AMDGPU::IsaInfo::AMDGPUTargetID TargetID
Definition: GCNSubtarget.h:62
static unsigned getNumUserSGPRForField(UserSGPRID ID)
bool hasKernargSegmentPtr() const
void allocKernargPreloadSGPRs(unsigned NumSGPRs)
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST)
MDNode * getMetadata(unsigned KindID) const
Get the current metadata attachments for the given kind, if any.
Definition: Value.h:565
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:84
Metadata node.
Definition: Metadata.h:1067
instr_iterator instr_end()
Instructions::const_iterator const_instr_iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:546
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:329
bool isBundle() const
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Scheduling dependency.
Definition: ScheduleDAG.h:49
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:486
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:142
unsigned getReg() const
Returns the register associated with this edge.
Definition: ScheduleDAG.h:218
const TargetSchedModel & getSchedModel() const
Definition: SIInstrInfo.h:1374
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
std::pair< unsigned, unsigned > getWavesPerEU() const
GCNUserSGPRUsageInfo & getUserSGPRInfo()
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:257
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373
A ScheduleDAG for scheduling lists of MachineInstr.
const TargetSchedModel * getSchedModel() const
Gets the machine model for instruction scheduling.
bool addEdge(SUnit *SuccSU, const SDep &PredDep)
Add a DAG edge to the given SU with the given predecessor dependence data.
void dumpNode(const SUnit &SU) const override
bool canAddEdge(SUnit *SuccSU, SUnit *PredSU)
True if an edge can be added from PredSU to SuccSU without creating a cycle.
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
Mutate the DAG as a postpass after normal DAG building.
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:561
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:559
SUnit ExitSU
Special node for the region exit.
Definition: ScheduleDAG.h:563
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Information about stack frame layout on the target.
TargetInstrInfo - Interface to description of machine instruction set.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
const Triple & getTargetTriple() const
Provide an instruction scheduling machine model to CodeGen passes.
unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const
Compute operand latency based on the available machine model.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:361
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
self_iterator getIterator()
Definition: ilist_node.h:109
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
unsigned getEUsPerCU(const MCSubtargetInfo *STI)
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI, unsigned NumVGPRs)
SmallVector< unsigned > getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size)
unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
void apply(Opt *O, const Mod &M, const Mods &... Ms)
Definition: CommandLine.h:1316
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
@ Default
The result values are uniform if and only if all operands are uniform.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
bool ShouldTrackLaneMasks
Track LaneMasks to allow reordering of independent subregister writes of the same vreg.