//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));
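
// A sketch of how these developer flags are typically exercised (hypothetical
// invocation): each is a backend cl::opt switch and can be passed to llc, e.g.
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-enable-flat-scratch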

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }
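
  // For example (hypothetical input): with FS = "+wavefrontsize32", the checks
  // above append "-wavefrontsize16,-wavefrontsize64,", so at most one
  // wavefront size survives into FullFS.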

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}
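
// A minimal sketch of how GPU and FS reach this hook (hypothetical
// invocation): `llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900
// -mattr=-flat-for-global` arrives here as GPU = "gfx900" and
// FS = "-flat-for-global".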

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}
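
// Hedged reading of the table above: on gfx10 the limit of 2 means a VOP3
// instruction such as `v_add_f32 v0, s0, s1` (hypothetical encoding) may read
// two scalar operands through the constant bus, while the 64-bit shifts listed
// above stay limited to a single SGPR or literal source.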

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}
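
// A sketch of the intended use (hypothetical caller): a peephole that feeds a
// 16-bit result into a packed operation can skip the explicit
// `V_AND_B32 ..., 0xffff` mask whenever zeroesHigh16BitsOfDest(Opc) is true.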

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
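
// Worked example (hypothetical figures): with 65536 bytes of LDS,
// MaxWaves = 10, 8 work groups per CU and NWaves = 4, the bound is
// 65536 * 10 / 8 / 4 = 20480 bytes of LDS per work group.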

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}
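
// Worked example (hypothetical figures): with 65536 bytes of LDS, a kernel
// using 16384 bytes fits NumGroups = 4; with a 256-lane work group on wave64,
// MaxGroupNumWaves = 4, so 4 * 4 = 16 waves, clamped to getMaxWavesPerEU().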

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
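
// The attribute consumed above is spelled on the kernel in IR; a minimal
// sketch (hypothetical values):
//   define amdgpu_kernel void @k() #0 { ret void }
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256" }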

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
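
// Likewise driven by an IR attribute; a minimal sketch (hypothetical values):
//   attributes #0 = { "amdgpu-waves-per-eu"="2,4" }
// requests an occupancy between 2 and 4 waves per execution unit.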

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}
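
// The metadata read above typically originates from OpenCL's
// reqd_work_group_size kernel attribute; a minimal sketch (hypothetical
// sizes):
//   define amdgpu_kernel void @k() !reqd_work_group_size !0 { ret void }
//   !0 = !{i32 64, i32 1, i32 1}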

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
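
// After this runs on an ID query with a 256-lane upper bound, the call site
// carries half-open range metadata; a sketch of the result (hypothetical
// sizes):
//   %id = call i32 @llvm.amdgcn.workitem.id.x(), !range !0
//   !0 = !{i32 0, i32 256}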

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}
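
// Worked example (hypothetical signature): for kernel arguments (i32, double)
// the i32 occupies bytes 0-3, the double is aligned up to offset 8, so
// ExplicitArgBytes = 16 and MaxAlign = 8.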

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
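
// Continuing the example above (hypothetical figures): with 16 explicit bytes,
// 56 implicit bytes and an 8-byte implicit-arg alignment, the segment is
// alignTo(16, 8) + 56 = 72 bytes, already a multiple of 4.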

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}
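
// Hedged reading of the table above: on VOLCANIC_ISLANDS-class (pre-gfx10)
// targets, a kernel using 84 SGPRs, for example, falls in the "<= 88" bucket
// and is limited to 9 waves per SIMD.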

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
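
// Worked example (hypothetical figures): with a granule of 4 and 256 total
// VGPRs, a kernel using 70 VGPRs rounds up to 72, so occupancy is
// min(max(256 / 72, 1), MaxWaves) = 3 waves.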

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}
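
// Worked example (hypothetical figures): if LDS permits 8 waves, 90 SGPRs
// permit 8 on a VI-class target, and 96 VGPRs permit 2 with a 4-register
// granule out of 256 total, the reported occupancy is the minimum, 2.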

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}
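
// I.e., at most 16 user SGPRs plus 5 system SGPRs, for a worst case of 21
// preloaded SGPRs.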

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}