LLVM 14.0.0git
AMDGPUSubtarget.cpp
1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPUInstructionSelector.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "R600Subtarget.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "amdgpu-subtarget"
36 
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #undef AMDGPUSubtarget
42 
43 static cl::opt<bool> DisablePowerSched(
44  "amdgpu-disable-power-sched",
45  cl::desc("Disable scheduling to minimize mAI power bursts"),
46  cl::init(false));
47 
48 static cl::opt<bool> EnableVGPRIndexMode(
49  "amdgpu-vgpr-index-mode",
50  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
51  cl::init(false));
52 
53 static cl::opt<bool> EnableFlatScratch(
54  "amdgpu-enable-flat-scratch",
55  cl::desc("Use flat scratch instructions"),
56  cl::init(false));
57 
58 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
59  cl::desc("Enable the use of AA during codegen."),
60  cl::init(true));
61 
62 GCNSubtarget::~GCNSubtarget() = default;
63 
64 GCNSubtarget &
65 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
66  StringRef GPU, StringRef FS) {
67  // Determine default and user-specified characteristics
68  //
69  // We want to be able to turn these off, but making this a subtarget feature
70  // for SI has the unhelpful behavior that it unsets everything else if you
71  // disable it.
72  //
73  // Similarly we want enable-prt-strict-null to be on by default and not to
74  // unset everything else if it is disabled
75 
76  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
77 
78  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
79  if (isAmdHsaOS())
80  FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
81 
82  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
83 
84  // Disable mutually exclusive bits.
85  if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
86  if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
87  FullFS += "-wavefrontsize16,";
88  if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
89  FullFS += "-wavefrontsize32,";
90  if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
91  FullFS += "-wavefrontsize64,";
92  }
93 
94  FullFS += FS;
95 
96  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
97 
98  // Implement the "generic" processors, which act as the default when no
99  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults to
100  // the first amdgcn target that supports flat addressing. Other OSes default
101  // to the first amdgcn target.
102  if (Gen == AMDGPUSubtarget::INVALID) {
103  Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
104  : AMDGPUSubtarget::SOUTHERN_ISLANDS;
105  }
106 
107  // We don't support FP64 for EG/NI atm.
108  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
109 
110  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
111  // support flat operations, otherwise they cannot access a 64-bit global
112  // address space
113  assert(hasAddr64() || hasFlat());
114  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
115  // that do not support ADDR64 variants of MUBUF instructions. Such targets
116  // cannot use a 64 bit offset with a MUBUF instruction to access the global
117  // address space
118  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
119  ToggleFeature(AMDGPU::FeatureFlatForGlobal);
120  FlatForGlobal = true;
121  }
122  // Unless +-flat-for-global is specified, use MUBUF instructions for global
123  // address space access if flat operations are not available.
124  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
125  ToggleFeature(AMDGPU::FeatureFlatForGlobal);
126  FlatForGlobal = false;
127  }
128 
129  // Set defaults if needed.
130  if (MaxPrivateElementSize == 0)
131  MaxPrivateElementSize = 4;
132 
133  if (LDSBankCount == 0)
134  LDSBankCount = 32;
135 
136  if (TT.getArch() == Triple::amdgcn) {
137  if (LocalMemorySize == 0)
138  LocalMemorySize = 32768;
139 
140  // Do something sensible for unspecified target.
141  if (!HasMovrel && !HasVGPRIndexMode)
142  HasMovrel = true;
143  }
144 
145  // Don't crash on invalid devices.
146  if (WavefrontSizeLog2 == 0)
147  WavefrontSizeLog2 = 5;
148 
149  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
150  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
151 
152  TargetID.setTargetIDFromFeaturesString(FS);
153 
154  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
155  << TargetID.getXnackSetting() << '\n');
156  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
157  << TargetID.getSramEccSetting() << '\n');
158 
159  return *this;
160 }
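// Illustrative example (hypothetical feature string, not taken from a real
// build): for an amdhsa target compiled with FS = "+wavefrontsize32", the
// FullFS composed above would parse roughly as
//   "+promote-alloca,+load-store-opt,+enable-ds128,+flat-for-global,
//    +unaligned-access-mode,+trap-handler,+enable-prt-strict-null,
//    -wavefrontsize16,-wavefrontsize64,+wavefrontsize32"
// i.e. the defaults come first, the mutually exclusive wavefront sizes are
// disabled, and the user string is appended last so it can override anything.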
161 
162 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
163  TargetTriple(TT),
164  GCN3Encoding(false),
165  Has16BitInsts(false),
166  HasMadMixInsts(false),
167  HasMadMacF32Insts(false),
168  HasDsSrc2Insts(false),
169  HasSDWA(false),
170  HasVOP3PInsts(false),
171  HasMulI24(true),
172  HasMulU24(true),
173  HasSMulHi(false),
174  HasInv2PiInlineImm(false),
175  HasFminFmaxLegacy(true),
176  EnablePromoteAlloca(false),
177  HasTrigReducedRange(false),
178  MaxWavesPerEU(10),
179  LocalMemorySize(0),
180  WavefrontSizeLog2(0)
181  { }
182 
183 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
184  const GCNTargetMachine &TM)
185  : // clang-format off
186  AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
187  AMDGPUSubtarget(TT),
188  TargetTriple(TT),
189  TargetID(*this),
190  Gen(INVALID),
191  InstrItins(getInstrItineraryForCPU(GPU)),
192  LDSBankCount(0),
193  MaxPrivateElementSize(0),
194 
195  FastFMAF32(false),
196  FastDenormalF32(false),
197  HalfRate64Ops(false),
198  FullRate64Ops(false),
199 
200  FlatForGlobal(false),
201  AutoWaitcntBeforeBarrier(false),
202  UnalignedScratchAccess(false),
203  UnalignedAccessMode(false),
204 
205  HasApertureRegs(false),
206  SupportsXNACK(false),
207  EnableXNACK(false),
208  EnableTgSplit(false),
209  EnableCuMode(false),
210  TrapHandler(false),
211 
212  EnableLoadStoreOpt(false),
213  EnableUnsafeDSOffsetFolding(false),
214  EnableSIScheduler(false),
215  EnableDS128(false),
216  EnablePRTStrictNull(false),
217  DumpCode(false),
218 
219  FP64(false),
220  CIInsts(false),
221  GFX8Insts(false),
222  GFX9Insts(false),
223  GFX90AInsts(false),
224  GFX10Insts(false),
225  GFX10_3Insts(false),
226  GFX7GFX8GFX9Insts(false),
227  SGPRInitBug(false),
228  NegativeScratchOffsetBug(false),
229  NegativeUnalignedScratchOffsetBug(false),
230  HasSMemRealTime(false),
231  HasIntClamp(false),
232  HasFmaMixInsts(false),
233  HasMovrel(false),
234  HasVGPRIndexMode(false),
235  HasScalarStores(false),
236  HasScalarAtomics(false),
237  HasSDWAOmod(false),
238  HasSDWAScalar(false),
239  HasSDWASdst(false),
240  HasSDWAMac(false),
241  HasSDWAOutModsVOPC(false),
242  HasDPP(false),
243  HasDPP8(false),
244  Has64BitDPP(false),
245  HasPackedFP32Ops(false),
246  HasExtendedImageInsts(false),
247  HasR128A16(false),
248  HasGFX10A16(false),
249  HasG16(false),
250  HasNSAEncoding(false),
251  NSAMaxSize(0),
252  GFX10_AEncoding(false),
253  GFX10_BEncoding(false),
254  HasDLInsts(false),
255  HasDot1Insts(false),
256  HasDot2Insts(false),
257  HasDot3Insts(false),
258  HasDot4Insts(false),
259  HasDot5Insts(false),
260  HasDot6Insts(false),
261  HasDot7Insts(false),
262  HasMAIInsts(false),
263  HasPkFmacF16Inst(false),
264  HasAtomicFaddInsts(false),
265  SupportsSRAMECC(false),
266  EnableSRAMECC(false),
267  HasNoSdstCMPX(false),
268  HasVscnt(false),
269  HasGetWaveIdInst(false),
270  HasSMemTimeInst(false),
271  HasShaderCyclesRegister(false),
272  HasRegisterBanking(false),
273  HasVOP3Literal(false),
274  HasNoDataDepHazard(false),
275  FlatAddressSpace(false),
276  FlatInstOffsets(false),
277  FlatGlobalInsts(false),
278  FlatScratchInsts(false),
279  ScalarFlatScratchInsts(false),
280  HasArchitectedFlatScratch(false),
281  AddNoCarryInsts(false),
282  HasUnpackedD16VMem(false),
283  LDSMisalignedBug(false),
284  HasMFMAInlineLiteralBug(false),
285  UnalignedBufferAccess(false),
286  UnalignedDSAccess(false),
287  HasPackedTID(false),
288 
289  ScalarizeGlobal(false),
290 
291  HasVcmpxPermlaneHazard(false),
292  HasVMEMtoScalarWriteHazard(false),
293  HasSMEMtoVectorWriteHazard(false),
294  HasInstFwdPrefetchBug(false),
295  HasVcmpxExecWARHazard(false),
296  HasLdsBranchVmemWARHazard(false),
297  HasNSAtoVMEMBug(false),
298  HasNSAClauseBug(false),
299  HasOffset3fBug(false),
300  HasFlatSegmentOffsetBug(false),
301  HasImageStoreD16Bug(false),
302  HasImageGather4D16Bug(false),
303 
304  FeatureDisable(false),
305  InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
306  TLInfo(TM, *this),
307  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
308  // clang-format on
309  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
310  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
311  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
312  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
313  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
314  InstSelector.reset(new AMDGPUInstructionSelector(
315  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
316 }
317 
318 bool GCNSubtarget::enableFlatScratch() const {
319  return flatScratchIsArchitected() ||
320  (EnableFlatScratch && hasFlatScratchInsts());
321 }
322 
323 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
324  if (getGeneration() < GFX10)
325  return 1;
326 
327  switch (Opcode) {
328  case AMDGPU::V_LSHLREV_B64_e64:
329  case AMDGPU::V_LSHLREV_B64_gfx10:
330  case AMDGPU::V_LSHL_B64_e64:
331  case AMDGPU::V_LSHRREV_B64_e64:
332  case AMDGPU::V_LSHRREV_B64_gfx10:
333  case AMDGPU::V_LSHR_B64_e64:
334  case AMDGPU::V_ASHRREV_I64_e64:
335  case AMDGPU::V_ASHRREV_I64_gfx10:
336  case AMDGPU::V_ASHR_I64_e64:
337  return 1;
338  }
339 
340  return 2;
341 }
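// Illustrative reading of the limit above (hypothetical operands): on GFX10
// an ordinary VALU instruction may read up to two SGPR/literal operands per
// instruction, while the 64-bit shifts listed in the switch keep the
// pre-GFX10 limit of a single constant-bus use.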
342 
343 /// This list was mostly derived from experimentation.
344 bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
345  switch (Opcode) {
346  case AMDGPU::V_CVT_F16_F32_e32:
347  case AMDGPU::V_CVT_F16_F32_e64:
348  case AMDGPU::V_CVT_F16_U16_e32:
349  case AMDGPU::V_CVT_F16_U16_e64:
350  case AMDGPU::V_CVT_F16_I16_e32:
351  case AMDGPU::V_CVT_F16_I16_e64:
352  case AMDGPU::V_RCP_F16_e64:
353  case AMDGPU::V_RCP_F16_e32:
354  case AMDGPU::V_RSQ_F16_e64:
355  case AMDGPU::V_RSQ_F16_e32:
356  case AMDGPU::V_SQRT_F16_e64:
357  case AMDGPU::V_SQRT_F16_e32:
358  case AMDGPU::V_LOG_F16_e64:
359  case AMDGPU::V_LOG_F16_e32:
360  case AMDGPU::V_EXP_F16_e64:
361  case AMDGPU::V_EXP_F16_e32:
362  case AMDGPU::V_SIN_F16_e64:
363  case AMDGPU::V_SIN_F16_e32:
364  case AMDGPU::V_COS_F16_e64:
365  case AMDGPU::V_COS_F16_e32:
366  case AMDGPU::V_FLOOR_F16_e64:
367  case AMDGPU::V_FLOOR_F16_e32:
368  case AMDGPU::V_CEIL_F16_e64:
369  case AMDGPU::V_CEIL_F16_e32:
370  case AMDGPU::V_TRUNC_F16_e64:
371  case AMDGPU::V_TRUNC_F16_e32:
372  case AMDGPU::V_RNDNE_F16_e64:
373  case AMDGPU::V_RNDNE_F16_e32:
374  case AMDGPU::V_FRACT_F16_e64:
375  case AMDGPU::V_FRACT_F16_e32:
376  case AMDGPU::V_FREXP_MANT_F16_e64:
377  case AMDGPU::V_FREXP_MANT_F16_e32:
378  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
379  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
380  case AMDGPU::V_LDEXP_F16_e64:
381  case AMDGPU::V_LDEXP_F16_e32:
382  case AMDGPU::V_LSHLREV_B16_e64:
383  case AMDGPU::V_LSHLREV_B16_e32:
384  case AMDGPU::V_LSHRREV_B16_e64:
385  case AMDGPU::V_LSHRREV_B16_e32:
386  case AMDGPU::V_ASHRREV_I16_e64:
387  case AMDGPU::V_ASHRREV_I16_e32:
388  case AMDGPU::V_ADD_U16_e64:
389  case AMDGPU::V_ADD_U16_e32:
390  case AMDGPU::V_SUB_U16_e64:
391  case AMDGPU::V_SUB_U16_e32:
392  case AMDGPU::V_SUBREV_U16_e64:
393  case AMDGPU::V_SUBREV_U16_e32:
394  case AMDGPU::V_MUL_LO_U16_e64:
395  case AMDGPU::V_MUL_LO_U16_e32:
396  case AMDGPU::V_ADD_F16_e64:
397  case AMDGPU::V_ADD_F16_e32:
398  case AMDGPU::V_SUB_F16_e64:
399  case AMDGPU::V_SUB_F16_e32:
400  case AMDGPU::V_SUBREV_F16_e64:
401  case AMDGPU::V_SUBREV_F16_e32:
402  case AMDGPU::V_MUL_F16_e64:
403  case AMDGPU::V_MUL_F16_e32:
404  case AMDGPU::V_MAX_F16_e64:
405  case AMDGPU::V_MAX_F16_e32:
406  case AMDGPU::V_MIN_F16_e64:
407  case AMDGPU::V_MIN_F16_e32:
408  case AMDGPU::V_MAX_U16_e64:
409  case AMDGPU::V_MAX_U16_e32:
410  case AMDGPU::V_MIN_U16_e64:
411  case AMDGPU::V_MIN_U16_e32:
412  case AMDGPU::V_MAX_I16_e64:
413  case AMDGPU::V_MAX_I16_e32:
414  case AMDGPU::V_MIN_I16_e64:
415  case AMDGPU::V_MIN_I16_e32:
416  // On gfx10, all 16-bit instructions preserve the high bits.
417  return getGeneration() <= AMDGPUSubtarget::GFX9;
418  case AMDGPU::V_MAD_F16_e64:
419  case AMDGPU::V_MADAK_F16:
420  case AMDGPU::V_MADMK_F16:
421  case AMDGPU::V_MAC_F16_e64:
422  case AMDGPU::V_MAC_F16_e32:
423  case AMDGPU::V_FMAMK_F16:
424  case AMDGPU::V_FMAAK_F16:
425  case AMDGPU::V_MAD_U16_e64:
426  case AMDGPU::V_MAD_I16_e64:
427  case AMDGPU::V_FMA_F16_e64:
428  case AMDGPU::V_FMAC_F16_e64:
429  case AMDGPU::V_FMAC_F16_e32:
430  case AMDGPU::V_DIV_FIXUP_F16_e64:
431  // In gfx9, the preferred handling of the unused high 16-bits changed. Most
432  // instructions maintain the legacy behavior of 0ing. Some instructions
433  // changed to preserving the high bits.
434  return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
435  case AMDGPU::V_MAD_MIXLO_F16:
436  case AMDGPU::V_MAD_MIXHI_F16:
437  default:
438  return false;
439  }
440 }
441 
442 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
443  const Function &F) const {
444  if (NWaves == 1)
445  return getLocalMemorySize();
446  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
447  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
448  if (!WorkGroupsPerCu)
449  return 0;
450  unsigned MaxWaves = getMaxWavesPerEU();
451  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
452 }
453 
454 // FIXME: Should return min,max range.
455 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
456  const Function &F) const {
457  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
458  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
459  if (!MaxWorkGroupsPerCu)
460  return 0;
461 
462  const unsigned WaveSize = getWavefrontSize();
463 
464  // FIXME: Do we need to account for alignment requirement of LDS rounding the
465  // size up?
466  // Compute restriction based on LDS usage
467  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
468 
469  // This can be queried with more LDS than is possible, so just assume the
470  // worst.
471  if (NumGroups == 0)
472  return 1;
473 
474  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
475 
476  // Round to the number of waves.
477  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
478  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
479 
480  // Clamp to the maximum possible number of waves.
481  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
482 
483  // FIXME: Needs to be a multiple of the group size?
484  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
485 
486  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
487  "computed invalid occupancy");
488  return MaxWaves;
489 }
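// Worked example (illustrative numbers only): with 64 KiB of LDS, a query of
// Bytes = 16 KiB, a maximum flat work-group size of 256 and wave64,
// NumGroups = 64K / 16K = 4 and MaxGroupNumWaves = ceil(256 / 64) = 4, so
// MaxWaves = 16 before being clamped to getMaxWavesPerEU() (e.g. 10 waves on
// many gfx9 parts).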
490 
491 unsigned
492 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
493  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
494  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
495 }
496 
497 std::pair<unsigned, unsigned>
498 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
499  switch (CC) {
500  case CallingConv::AMDGPU_VS:
501  case CallingConv::AMDGPU_LS:
502  case CallingConv::AMDGPU_HS:
503  case CallingConv::AMDGPU_ES:
504  case CallingConv::AMDGPU_GS:
505  case CallingConv::AMDGPU_PS:
506  return std::make_pair(1, getWavefrontSize());
507  default:
508  return std::make_pair(1u, getMaxFlatWorkGroupSize());
509  }
510 }
511 
512 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
513  const Function &F) const {
514  // Default minimum/maximum flat work group sizes.
515  std::pair<unsigned, unsigned> Default =
516  getDefaultFlatWorkGroupSize(F.getCallingConv());
517 
518  // Requested minimum/maximum flat work group sizes.
519  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
520  F, "amdgpu-flat-work-group-size", Default);
521 
522  // Make sure requested minimum is less than requested maximum.
523  if (Requested.first > Requested.second)
524  return Default;
525 
526  // Make sure requested values do not violate subtarget's specifications.
527  if (Requested.first < getMinFlatWorkGroupSize())
528  return Default;
529  if (Requested.second > getMaxFlatWorkGroupSize())
530  return Default;
531 
532  return Requested;
533 }
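// For example (hypothetical kernel): annotating a kernel with
// "amdgpu-flat-work-group-size"="128,256" requests a minimum of 128 and a
// maximum of 256 work items per work group; the pair is only honored if it
// stays within getMinFlatWorkGroupSize()/getMaxFlatWorkGroupSize(),
// otherwise the calling-convention default above is used.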
534 
535 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
536  const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
537  // Default minimum/maximum number of waves per execution unit.
538  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
539 
540  // If minimum/maximum flat work group sizes were explicitly requested using
541  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
542  // number of waves per execution unit to values implied by requested
543  // minimum/maximum flat work group sizes.
544  unsigned MinImpliedByFlatWorkGroupSize =
545  getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
546  Default.first = MinImpliedByFlatWorkGroupSize;
547  bool RequestedFlatWorkGroupSize =
548  F.hasFnAttribute("amdgpu-flat-work-group-size");
549 
550  // Requested minimum/maximum number of waves per execution unit.
551  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
552  F, "amdgpu-waves-per-eu", Default, true);
553 
554  // Make sure requested minimum is less than requested maximum.
555  if (Requested.second && Requested.first > Requested.second)
556  return Default;
557 
558  // Make sure requested values do not violate subtarget's specifications.
559  if (Requested.first < getMinWavesPerEU() ||
560  Requested.second > getMaxWavesPerEU())
561  return Default;
562 
563  // Make sure requested values are compatible with values implied by requested
564  // minimum/maximum flat work group sizes.
565  if (RequestedFlatWorkGroupSize &&
566  Requested.first < MinImpliedByFlatWorkGroupSize)
567  return Default;
568 
569  return Requested;
570 }
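// For example (hypothetical kernel): "amdgpu-waves-per-eu"="2,4" asks for
// between 2 and 4 waves per execution unit; the request is dropped in favor
// of the defaults if it violates the subtarget limits or contradicts an
// explicitly requested flat work-group size.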
571 
572 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
573  auto Node = Kernel.getMetadata("reqd_work_group_size");
574  if (Node && Node->getNumOperands() == 3)
575  return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
576  return std::numeric_limits<unsigned>::max();
577 }
578 
579 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
580  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
581 }
582 
583 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
584  unsigned Dimension) const {
585  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
586  if (ReqdSize != std::numeric_limits<unsigned>::max())
587  return ReqdSize - 1;
588  return getFlatWorkGroupSizes(Kernel).second - 1;
589 }
590 
591 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
592  Function *Kernel = I->getParent()->getParent();
593  unsigned MinSize = 0;
594  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
595  bool IdQuery = false;
596 
597  // If reqd_work_group_size is present it narrows value down.
598  if (auto *CI = dyn_cast<CallInst>(I)) {
599  const Function *F = CI->getCalledFunction();
600  if (F) {
601  unsigned Dim = UINT_MAX;
602  switch (F->getIntrinsicID()) {
603  case Intrinsic::amdgcn_workitem_id_x:
604  case Intrinsic::r600_read_tidig_x:
605  IdQuery = true;
606  LLVM_FALLTHROUGH;
607  case Intrinsic::r600_read_local_size_x:
608  Dim = 0;
609  break;
610  case Intrinsic::amdgcn_workitem_id_y:
611  case Intrinsic::r600_read_tidig_y:
612  IdQuery = true;
613  LLVM_FALLTHROUGH;
614  case Intrinsic::r600_read_local_size_y:
615  Dim = 1;
616  break;
617  case Intrinsic::amdgcn_workitem_id_z:
618  case Intrinsic::r600_read_tidig_z:
619  IdQuery = true;
620  LLVM_FALLTHROUGH;
621  case Intrinsic::r600_read_local_size_z:
622  Dim = 2;
623  break;
624  default:
625  break;
626  }
627 
628  if (Dim <= 3) {
629  unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
630  if (ReqdSize != std::numeric_limits<unsigned>::max())
631  MinSize = MaxSize = ReqdSize;
632  }
633  }
634  }
635 
636  if (!MaxSize)
637  return false;
638 
639  // Range metadata is [Lo, Hi). For ID query we need to pass max size
640  // as Hi. For size query we need to pass Hi + 1.
641  if (IdQuery)
642  MinSize = 0;
643  else
644  ++MaxSize;
645 
646  MDBuilder MDB(I->getContext());
647  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
648  APInt(32, MaxSize));
649  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
650  return true;
651 }
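// Illustrative effect (hypothetical kernel with !reqd_work_group_size
// {64, 1, 1}): a call to llvm.amdgcn.workitem.id.x gets !range metadata of
// [0, 64), while a local-size query for the same dimension gets [64, 65),
// i.e. exactly 64.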
652 
653 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
654  if (isMesaKernel(F))
655  return 16;
656  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
657 }
658 
659 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
660  Align &MaxAlign) const {
661  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
662  F.getCallingConv() == CallingConv::SPIR_KERNEL);
663 
664  const DataLayout &DL = F.getParent()->getDataLayout();
665  uint64_t ExplicitArgBytes = 0;
666  MaxAlign = Align(1);
667 
668  for (const Argument &Arg : F.args()) {
669  const bool IsByRef = Arg.hasByRefAttr();
670  Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
671  MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
672  if (!Alignment)
673  Alignment = DL.getABITypeAlign(ArgTy);
674 
675  uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
676  ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
677  MaxAlign = max(MaxAlign, Alignment);
678  }
679 
680  return ExplicitArgBytes;
681 }
682 
683 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
684  Align &MaxAlign) const {
685  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
686 
687  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
688 
689  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
690  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
691  if (ImplicitBytes != 0) {
692  const Align Alignment = getAlignmentForImplicitArgPtr();
693  TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
694  }
695 
696  // Being able to dereference past the end is useful for emitting scalar loads.
697  return alignTo(TotalSize, 4);
698 }
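// Worked example (illustrative numbers, assuming an 8-byte implicit-arg
// alignment): a kernel with 24 bytes of explicit arguments and
// "amdgpu-implicitarg-num-bytes"="56" yields alignTo(24, 8) + 56 = 80 bytes,
// already a multiple of 4, so the reported segment size is 80.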
699 
700 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
701  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
702  : AMDGPUDwarfFlavour::Wave64;
703 }
704 
705 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
706  unsigned NumRegionInstrs) const {
707  // Track register pressure so the scheduler can try to decrease
708  // pressure once register usage is above the threshold defined by
709  // SIRegisterInfo::getRegPressureSetLimit()
710  Policy.ShouldTrackPressure = true;
711 
712  // Enabling both top down and bottom up scheduling seems to give us less
713  // register spills than just using one of these approaches on its own.
714  Policy.OnlyTopDown = false;
715  Policy.OnlyBottomUp = false;
716 
717  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
718  if (!enableSIScheduler())
719  Policy.ShouldTrackLaneMasks = true;
720 }
721 
722 bool GCNSubtarget::hasMadF16() const {
723  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
724 }
725 
726 bool GCNSubtarget::useVGPRIndexMode() const {
727  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
728 }
729 
730 bool GCNSubtarget::useAA() const { return UseAA; }
731 
732 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
733  if (getGeneration() >= AMDGPUSubtarget::GFX10)
734  return getMaxWavesPerEU();
735 
736  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
737  if (SGPRs <= 80)
738  return 10;
739  if (SGPRs <= 88)
740  return 9;
741  if (SGPRs <= 100)
742  return 8;
743  return 7;
744  }
745  if (SGPRs <= 48)
746  return 10;
747  if (SGPRs <= 56)
748  return 9;
749  if (SGPRs <= 64)
750  return 8;
751  if (SGPRs <= 72)
752  return 7;
753  if (SGPRs <= 80)
754  return 6;
755  return 5;
756 }
757 
758 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
759  unsigned MaxWaves = getMaxWavesPerEU();
760  unsigned Granule = getVGPRAllocGranule();
761  if (VGPRs < Granule)
762  return MaxWaves;
763  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
764  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
765 }
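// Worked example (illustrative numbers only): with an allocation granule of
// 4, 256 total VGPRs and 42 VGPRs used, RoundedRegs = 44 and the result is
// min(max(256 / 44, 1), MaxWaves) = 5 waves.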
766 
767 unsigned
768 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
769  if (getGeneration() >= AMDGPUSubtarget::GFX10)
770  return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
771 
772  if (HasFlatScratchInit) {
773  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
774  return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
775  if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
776  return 4; // FLAT_SCRATCH, VCC (in that order).
777  }
778 
779  if (isXNACKEnabled())
780  return 4; // XNACK, VCC (in that order).
781  return 2; // VCC.
782 }
783 
784 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
785  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
786  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
787 }
788 
789 unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
790  // The logic to detect if the function has
791  // flat scratch init is slightly different from how the
792  // SIMachineFunctionInfo constructor derives it.
793  // We don't use the amdgpu-calls and amdgpu-stack-objects
794  // attributes or isAmdHsaOrMesa here, as it doesn't really matter.
795  // TODO: Outline this derivation logic and have just
796  // one common function in the backend to avoid duplication.
797  bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
798  bool FunctionHasFlatScratchInit = false;
799  if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
800  enableFlatScratch()) {
801  FunctionHasFlatScratchInit = true;
802  }
803  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
804 }
805 
806 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
807  unsigned NumSGPRs,
808  unsigned NumVGPRs) const {
809  unsigned Occupancy =
810  std::min(getMaxWavesPerEU(),
811  getOccupancyWithLocalMemSize(LDSSize, F));
812  if (NumSGPRs)
813  Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
814  if (NumVGPRs)
815  Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
816  return Occupancy;
817 }
818 
819 unsigned GCNSubtarget::getBaseMaxNumSGPRs(
820  const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
821  unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
822  // Compute maximum number of SGPRs function can use using default/requested
823  // minimum number of waves per execution unit.
824  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
825  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
826 
827  // Check if maximum number of SGPRs was explicitly requested using
828  // "amdgpu-num-sgpr" attribute.
829  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
830  unsigned Requested = AMDGPU::getIntegerAttribute(
831  F, "amdgpu-num-sgpr", MaxNumSGPRs);
832 
833  // Make sure requested value does not violate subtarget's specifications.
834  if (Requested && (Requested <= ReservedNumSGPRs))
835  Requested = 0;
836 
837  // If more SGPRs are required to support the input user/system SGPRs,
838  // increase to accommodate them.
839  //
840  // FIXME: This really ends up using the requested number of SGPRs + number
841  // of reserved special registers in total. Theoretically you could re-use
842  // the last input registers for these special registers, but this would
843  // require a lot of complexity to deal with the weird aliasing.
844  unsigned InputNumSGPRs = PreloadedSGPRs;
845  if (Requested && Requested < InputNumSGPRs)
846  Requested = InputNumSGPRs;
847 
848  // Make sure requested value is compatible with values implied by
849  // default/requested minimum/maximum number of waves per execution unit.
850  if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
851  Requested = 0;
852  if (WavesPerEU.second &&
853  Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
854  Requested = 0;
855 
856  if (Requested)
857  MaxNumSGPRs = Requested;
858  }
859 
860  if (hasSGPRInitBug())
861  MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
862 
863  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
864 }
865 
866 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
867  const Function &F = MF.getFunction();
868  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
869  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
870  getReservedNumSGPRs(MF));
871 }
872 
873 static unsigned getMaxNumPreloadedSGPRs() {
874  // Max number of user SGPRs
875  unsigned MaxUserSGPRs = 4 + // private segment buffer
876  2 + // Dispatch ptr
877  2 + // queue ptr
878  2 + // kernel segment ptr
879  2 + // dispatch ID
880  2 + // flat scratch init
881  2; // Implicit buffer ptr
882  // Max number of system SGPRs
883  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
884  1 + // WorkGroupIDY
885  1 + // WorkGroupIDZ
886  1 + // WorkGroupInfo
887  1; // private segment wave byte offset
888  return MaxUserSGPRs + MaxSystemSGPRs;
889 }
890 
891 unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
892  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
893  getReservedNumSGPRs(F));
894 }
895 
896 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
897  const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
898  // Compute maximum number of VGPRs function can use using default/requested
899  // minimum number of waves per execution unit.
900  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
901 
902  // Check if maximum number of VGPRs was explicitly requested using
903  // "amdgpu-num-vgpr" attribute.
904  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
905  unsigned Requested = AMDGPU::getIntegerAttribute(
906  F, "amdgpu-num-vgpr", MaxNumVGPRs);
907 
908  if (hasGFX90AInsts())
909  Requested *= 2;
910 
911  // Make sure requested value is compatible with values implied by
912  // default/requested minimum/maximum number of waves per execution unit.
913  if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
914  Requested = 0;
915  if (WavesPerEU.second &&
916  Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
917  Requested = 0;
918 
919  if (Requested)
920  MaxNumVGPRs = Requested;
921  }
922 
923  return MaxNumVGPRs;
924 }
925 
926 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
927  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
928 }
929 
930 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
931  const Function &F = MF.getFunction();
932  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
933  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
934 }
935 
936 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
937  int UseOpIdx, SDep &Dep) const {
938  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
939  !Def->isInstr() || !Use->isInstr())
940  return;
941 
942  MachineInstr *DefI = Def->getInstr();
943  MachineInstr *UseI = Use->getInstr();
944 
945  if (DefI->isBundle()) {
946  const SIRegisterInfo *TRI = getRegisterInfo();
947  auto Reg = Dep.getReg();
948  MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
949  MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
950  unsigned Lat = 0;
951  for (++I; I != E && I->isBundledWithPred(); ++I) {
952  if (I->modifiesRegister(Reg, TRI))
953  Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
954  else if (Lat)
955  --Lat;
956  }
957  Dep.setLatency(Lat);
958  } else if (UseI->isBundle()) {
959  const SIRegisterInfo *TRI = getRegisterInfo();
960  auto Reg = Dep.getReg();
961  MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
962  MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
963  unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
964  for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
965  if (I->readsRegister(Reg, TRI))
966  break;
967  --Lat;
968  }
969  Dep.setLatency(Lat);
970  }
971 }
972 
973 namespace {
974 struct FillMFMAShadowMutation : ScheduleDAGMutation {
975  const SIInstrInfo *TII;
976 
977  ScheduleDAGMI *DAG;
978 
979  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
980 
981  bool isSALU(const SUnit *SU) const {
982  const MachineInstr *MI = SU->getInstr();
983  return MI && TII->isSALU(*MI) && !MI->isTerminator();
984  }
985 
986  bool isVALU(const SUnit *SU) const {
987  const MachineInstr *MI = SU->getInstr();
988  return MI && TII->isVALU(*MI);
989  }
990 
991  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
992  if (Pred->NodeNum < Succ->NodeNum)
993  return true;
994 
995  SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
996 
997  for (unsigned I = 0; I < Succs.size(); ++I) {
998  for (const SDep &SI : Succs[I]->Succs) {
999  const SUnit *SU = SI.getSUnit();
1000  if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
1001  Succs.push_back(SU);
1002  }
1003  }
1004 
1005  SmallPtrSet<const SUnit*, 32> Visited;
1006  while (!Preds.empty()) {
1007  const SUnit *SU = Preds.pop_back_val();
1008  if (llvm::is_contained(Succs, SU))
1009  return false;
1010  Visited.insert(SU);
1011  for (const SDep &SI : SU->Preds)
1012  if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
1013  Preds.push_back(SI.getSUnit());
1014  }
1015 
1016  return true;
1017  }
1018 
1019  // Link as many SALU instructions in chain as possible. Return the size
1020  // of the chain. Links up to MaxChain instructions.
1021  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
1022  SmallPtrSetImpl<SUnit *> &Visited) const {
1023  SmallVector<SUnit *, 8> Worklist({To});
1024  unsigned Linked = 0;
1025 
1026  while (!Worklist.empty() && MaxChain-- > 0) {
1027  SUnit *SU = Worklist.pop_back_val();
1028  if (!Visited.insert(SU).second)
1029  continue;
1030 
1031  LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
1032  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
1033 
1034  if (SU->addPred(SDep(From, SDep::Artificial), false))
1035  ++Linked;
1036 
1037  for (SDep &SI : From->Succs) {
1038  SUnit *SUv = SI.getSUnit();
1039  if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
1040  SUv->addPred(SDep(SU, SDep::Artificial), false);
1041  }
1042 
1043  for (SDep &SI : SU->Succs) {
1044  SUnit *Succ = SI.getSUnit();
1045  if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
1046  Worklist.push_back(Succ);
1047  }
1048  }
1049 
1050  return Linked;
1051  }
1052 
1053  void apply(ScheduleDAGInstrs *DAGInstrs) override {
1054  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1055  if (!ST.hasMAIInsts() || DisablePowerSched)
1056  return;
1057  DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
1058  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1059  if (!TSchedModel || DAG->SUnits.empty())
1060  return;
1061 
1062  // Scan for MFMA long latency instructions and try to add a dependency
1063  // of available SALU instructions to give them a chance to fill MFMA
1064  // shadow. That is desirable to fill MFMA shadow with SALU instructions
1065  // rather than VALU to prevent power consumption bursts and throttle.
1066  auto LastSALU = DAG->SUnits.begin();
1067  auto E = DAG->SUnits.end();
1068  SmallPtrSet<SUnit*, 32> Visited;
1069  for (SUnit &SU : DAG->SUnits) {
1070  MachineInstr &MAI = *SU.getInstr();
1071  if (!TII->isMAI(MAI) ||
1072  MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
1073  MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
1074  continue;
1075 
1076  unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
1077 
1078  LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
1079  dbgs() << "Need " << Lat
1080  << " instructions to cover latency.\n");
1081 
1082  // Find up to Lat independent scalar instructions as early as
1083  // possible such that they can be scheduled after this MFMA.
1084  for ( ; Lat && LastSALU != E; ++LastSALU) {
1085  if (Visited.count(&*LastSALU))
1086  continue;
1087 
1088  if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
1089  continue;
1090 
1091  Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
1092  }
1093  }
1094  }
1095 };
1096 } // namespace
1097 
1098 void GCNSubtarget::getPostRAMutations(
1099  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1100  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1101 }
1102 
1103 std::unique_ptr<ScheduleDAGMutation>
1104 GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
1105  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
1106 }
1107 
1108 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
1109  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
1110  return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1111  else
1112  return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
1113 }
1114 
1115 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
1116  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1117  return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1118  else
1119  return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
1120 }