LLVM 10.0.0svn
AMDGPUSubtarget.cpp
//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

44  "amdgpu-disable-power-sched",
45  cl::desc("Disable scheduling to minimize mAI power bursts"),
46  cl::init(false));
47 
GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals,
  // but this should be checked. Should we issue a warning somewhere if
  // someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals,
  // but this should be checked. Should we issue a warning somewhere if
  // someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!FP64 || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

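// Illustrative note (feature names taken from the function above): for an
// amdhsa target the base feature string assembled above looks like
//   "+promote-alloca,+load-store-opt,+sram-ecc,+xnack,+flat-for-global,
//    +unaligned-buffer-access,+trap-handler,..."
// with the user-provided FS appended last, so explicit user settings take
// precedence over these defaults.
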
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

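// Note (reasoning inferred, not stated in this file): GFX10 generally allows
// two scalar operands on the constant bus per VALU instruction, which is why
// the default above is 2; the 64-bit shifts are special-cased because they
// can only accept a single scalar input.
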
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

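// Illustrative arithmetic (values assumed, not from the source): with
// LocalMemorySize = 32768, MaxWaves = 10, and 2 work groups per CU, asking
// for NWaves = 5 gives 32768 * 10 / 2 / 5 = 32768 bytes of LDS per work
// group in the function above.
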
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

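// Rough example (values assumed): with 32768 bytes of LDS, MaxWaves = 10,
// and 2 work groups per CU, Limit = 32768 * 10 / 2 = 163840; a kernel using
// 40960 bytes of LDS then gets 163840 / 40960 = 4 waves, clamped to the
// [1, MaxWaves] range.
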
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

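// With the common 64-lane wavefront, the defaults above are [128, 256] for
// compute kernels (64 * 2 and max(64 * 4, 256)), [1, 64] for the listed
// shader calling conventions, and [1, 1024] otherwise.
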
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

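// Example of the IR-level attribute consumed above (hypothetical kernel):
//   define amdgpu_kernel void @k() #0 { ... }
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256" }
// A malformed request such as "256,128" (min > max) falls back to the
// calling-convention default.
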
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

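// Example (hypothetical): "amdgpu-waves-per-eu"="2,4" requests between 2 and
// 4 waves per execution unit; the request is honored only if it respects the
// subtarget bounds and any minimum implied by a requested flat work group
// size.
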
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

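// For example, with a maximum flat work group size of 256 (assumed), a call
//   %id = call i32 @llvm.amdgcn.workitem.id.x()
// is annotated above with !range !{i32 0, i32 256}: range metadata is a
// half-open [Lo, Hi) interval, and valid ids are 0..255.
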
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

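// Worked example (a hypothetical signature): for a kernel taking
// (i32, double, i8) the loop above computes 0 + 4 (i32), aligned to 8 -> 8,
// + 8 (double) -> 16, + 1 (i8) -> 17, so ExplicitArgBytes = 17 and
// MaxAlign = 8.
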
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

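// Continuing the example above (ExplicitOffset assumed 0, no implicit
// arguments): 17 explicit bytes round up to alignTo(17, 4) = 20, letting
// 4-byte scalar loads safely dereference slightly past the last argument.
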
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

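// For example, on a VOLCANIC_ISLANDS part a kernel using 90 SGPRs lands in
// the "<= 100" bucket above and is limited to 8 waves per SIMD.
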
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(getTotalNumVGPRs() / RoundedRegs, MaxWaves);
}

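// Worked example (granule and register count assumed): with a 4-register
// granule and 256 total VGPRs, 70 VGPRs round up to 72, giving
// min(256 / 72, MaxWaves) = 3 waves.
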
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

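// Minimal usage sketch (hypothetical caller): the result is the tightest of
// the LDS, SGPR and VGPR limits, e.g.
//   unsigned Occ = ST.computeOccupancy(MF, MFI->getLDSSize(), SGPRs, VGPRs);
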
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

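// Example (hypothetical): a function annotated "amdgpu-num-vgpr"="64" caps
// the budget at 64 VGPRs, unless that conflicts with the waves-per-EU bounds
// above, in which case the request is dropped.
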
namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In the DAG pre-process, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

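// Sketch of the effect (illustrative only): for two adjacent DS operations
//   ds_read_b32 %a, ...
//   ds_read_b32 %b, ...
// the mutation above adds artificial edges so the scheduler keeps the pair
// together instead of interleaving unrelated instructions between them.
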
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU ones to prevent power consumption bursts and
    // throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}