//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

45 "amdgpu-enable-power-sched",
46 cl::desc("Enable scheduling to minimize mAI power bursts"),
47 cl::init(false));
48
50 "amdgpu-vgpr-index-mode",
51 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
52 cl::init(false));
53
54static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
55 cl::desc("Enable the use of AA during codegen."),
56 cl::init(true));
57
58static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
59 cl::desc("Number of addresses from which to enable MIMG NSA."),
61
63

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;
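  // Note: the user-provided FS string is appended last so that explicitly
  // requested features take precedence over the defaults assembled above. For
  // an HSA target, FullFS at this point already contains +promote-alloca,
  // +load-store-opt, +enable-ds128, +flat-for-global, +unaligned-access-mode,
  // +trap-handler and +enable-prt-strict-null.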

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processor, which acts as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  AddressableLocalMemorySize = LocalMemorySize;

  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

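  // On GFX10+ most VALU instructions may read two scalar (constant bus)
  // operands, but the 64-bit shift opcodes listed below remain limited to one.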
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
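// For example, assuming a wave64 target with 4 EUs per CU, 64 KiB of LDS, and
// a maximum flat workgroup size of 256: WavesPerWorkgroup = 4, so NWaves = 8
// allows 8 workgroups per CU and therefore 8 KiB of LDS per workgroup.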
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine, taking
// into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves per CU.
  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Number of waves per EU (SIMD).
  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());
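  // For example, 16 KiB of LDS on a 64 KiB device gives NumGroups = 4; with a
  // 256-lane maximum workgroup on a wave64, 4-EU subtarget (and assuming the
  // workgroups-per-CU limit does not bite) each group is 4 waves, so LDS
  // limits this function to 16 waves per CU, i.e. 4 waves per EU.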

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

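// A kernel can request a flat workgroup size range with the
// "amdgpu-flat-work-group-size" attribute, written as two comma-separated
// integers (e.g. "128,256"); requests outside the subtarget's supported range
// fall back to the per-calling-convention defaults above.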
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

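// The "amdgpu-waves-per-eu" attribute takes either a single minimum or a
// "min,max" pair; when the maximum is omitted it defaults to the subtarget's
// maximum waves per EU.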
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
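  // For example, with reqd_work_group_size = {64, 1, 1}, a workitem.id.x call
  // gets !range [0, 64) while a local.size.x query gets !range [64, 65).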
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  const Module *M = F.getParent();
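  // Code object v5 widened the implicit argument block to 256 bytes; earlier
  // HSA ABIs reserve 56 bytes.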
  unsigned NBytes =
      AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }
  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

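  // Pre-GFX10, the SGPR budget bounds occupancy; e.g. on gfx8 a kernel using
  // 96 SGPRs is limited to 8 waves per SIMD by the table below.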
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

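// Occupancy is the minimum of the limits implied by LDS usage, SGPR usage, and
// VGPR usage, capped at the subtarget's maximum waves per EU.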
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
      std::min(getMaxWavesPerEU(),
               getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

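  // If either end of the edge is a BUNDLE, the latency recorded for the bundle
  // header is not meaningful; walk the bundled instructions to find the actual
  // producer (or first consumer) of the register and adjust the edge latency
  // accordingly.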
  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
        if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
          ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
            DAG->canAddEdge(SUv, SU))
          DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts())
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU instructions rather than VALU
    // is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
            !DAG->canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
                          : nullptr;
}

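// The MIMG NSA (non-sequential address) encoding only pays off once an image
// instruction uses several address registers, so both the command-line option
// and the per-function "amdgpu-nsa-threshold" attribute are clamped to a
// minimum of 2 below.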
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return 3;
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}

GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }
  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}