//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
    "amdgpu-disable-power-sched",
    cl::desc("Disable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> EnableFlatScratch(
    "amdgpu-enable-flat-scratch",
    cl::desc("Use flat scratch instructions"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

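// These flags are ordinary cl::opt codegen options, so they can be toggled
// from the llc command line. Illustrative invocation (the input file name is
// hypothetical):
//
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 \
//       -amdgpu-enable-flat-scratch -amdgpu-use-aa-in-codegen=false kernel.ll
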
GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}
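
// Illustrative example of the resulting feature string: for an amdhsa target
// with FS = "+wavefrontsize64", the FullFS handed to ParseSubtargetFeatures
// above would be roughly:
//
//   +promote-alloca,+load-store-opt,+enable-ds128,+flat-for-global,
//   +unaligned-access-mode,+trap-handler,+enable-prt-strict-null,
//   -wavefrontsize16,-wavefrontsize32,+wavefrontsize64
//
// (the hard-coded defaults first, the mutually exclusive wavefront sizes
// cleared, and the user string last so it can override any of them).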

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}
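
// Illustration of the limit (hedged reading of the code above): on GFX10 a
// VALU instruction such as v_add_f32 v0, s0, s1 may draw two operands from
// the constant bus (SGPRs or literals), while the 64-bit shifts listed in
// the switch, and every instruction on pre-GFX10 targets, get only one.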

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                          const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
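
// Worked example with hypothetical values: if getLocalMemorySize() = 65536,
// getMaxWavesPerEU() = 10, WorkGroupsPerCu = 16 and NWaves = 5, the result
// is 65536 * 10 / 16 / 5 = 8192 bytes of LDS usable per workgroup.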

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}
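
// Worked example with hypothetical values: Bytes = 16384 against a
// 65536-byte LDS gives NumGroups = 4; a 256-item maximum workgroup with
// 64-wide waves gives MaxGroupNumWaves = 4, so MaxWaves = 16, which is then
// clamped to getMaxWavesPerEU().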

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
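
// The attribute is a "min,max" integer pair on the function. Illustrative IR:
//
//   define amdgpu_kernel void @k() #0 { ... }
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256" }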

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
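
// Also an integer pair attribute; the maximum may be omitted because
// OnlyFirstRequired is passed as true above. Illustrative IR:
//
//   attributes #0 = { "amdgpu-waves-per-eu"="2,4" }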

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}
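
// This is the OpenCL-style metadata node attached to a kernel. Illustrative
// IR:
//
//   define amdgpu_kernel void @k() !reqd_work_group_size !0 { ... }
//   !0 = !{i32 256, i32 1, i32 1}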

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
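
// For a kernel known to launch 256 work items per group, a workitem ID query
// ends up annotated with half-open range metadata, e.g.:
//
//   %id = call i32 @llvm.amdgcn.workitem.id.x(), !range !0
//   !0 = !{i32 0, i32 256}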

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}
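
// Worked example: for a kernel taking (i32, <4 x float>), the i32 occupies
// bytes 0-3, the vector's 16-byte ABI alignment pads the offset to 16 and it
// occupies bytes 16-31, so ExplicitArgBytes = 32 and MaxAlign = 16.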

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}
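
// Reading the table above: a kernel using 90 SGPRs on a VOLCANIC_ISLANDS
// part falls into the "<= 100" bucket and gets 8 waves per SIMD; the same
// count on an earlier generation exceeds 80 and gets only 5.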

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
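
// Worked example with hypothetical values: with a 256-VGPR file and a
// 4-register allocation granule, a kernel using 70 VGPRs rounds up to 72,
// giving std::min(std::max(256 / 72, 1u), MaxWaves) = 3 waves per SIMD.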

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
      std::min(getMaxWavesPerEU(),
               getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
        F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
        F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
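
// Both register budgets can be pinned from IR. Illustrative attributes:
//
//   attributes #0 = { "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" }
//
// A request incompatible with the waves-per-eu bounds checked above is
// silently dropped (Requested is reset to 0).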

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the number
  // of instructions linked. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}