LLVM  9.0.0svn
AMDGPUSubtarget.cpp
//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);
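
  // Note (illustrative, not from the original source): defaults are prepended
  // and the user feature string FS is appended, and when a feature appears
  // more than once the later occurrence wins. So a "-sram-ecc" arriving in FS
  // overrides the "+sram-ecc" default above, which is also what the comment
  // on +enable-prt-strict-null relies on.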

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +/-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets with d16 loads that don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}
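
// Illustrative note (not in the original source): the "constant bus" is the
// datapath VALU instructions use to read SGPRs and literal constants. On
// pre-GFX10 hardware each VALU instruction may use it for at most one
// operand; GFX10 raises the limit to two for most instructions, except the
// 64-bit shifts handled above, which keep the old limit of one.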

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
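
// Worked example (hypothetical numbers): with 64 KiB of LDS
// (getLocalMemorySize() == 65536), MaxWaves == 10, and a limit of, say, 5
// work groups per CU, a kernel that wants to sustain NWaves == 8 may use at
// most 65536 * 10 / 5 / 8 == 16384 bytes of LDS per work group.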

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
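
// Worked example (hypothetical numbers, the inverse of the function above):
// with 65536 bytes of LDS, MaxWaves == 10, and 5 work groups per CU,
// Limit == 65536 * 10 / 5 == 131072, so a work group using Bytes == 32768 of
// LDS is clamped to min(131072 / 32768, 10) == 4 waves.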

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}
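
// For example, with a 64-wide wavefront a compute kernel defaults to flat
// work group sizes of (128, max(256, 256)) == (128, 256), graphics shader
// stages to (1, 64), and everything else to (1, 16 * 64) == (1, 1024).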

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
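
// Illustrative IR: the pair is read from a comma-separated function
// attribute, e.g. the one clang emits for
// __attribute__((amdgpu_flat_work_group_size(128, 256))):
//
//   define amdgpu_kernel void @k() #0 { ... }
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256" }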

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
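
// Illustrative IR: the pair comes from a function attribute such as the one
// clang emits for __attribute__((amdgpu_waves_per_eu(2, 4))):
//
//   attributes #0 = { "amdgpu-waves-per-eu"="2,4" }
//
// The second value is optional here, since OnlyFirstRequired is true in the
// getIntegerPairAttribute call above.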

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
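
// Illustrative effect: for a kernel carrying
//   !reqd_work_group_size !{i32 64, i32 1, i32 1}
// a call to llvm.amdgcn.workitem.id.x is annotated with !range !{i32 0,
// i32 64} (an ID query, half-open), while a local-size query for the same
// dimension would get !range !{i32 64, i32 65}.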

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}
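
// Worked example: for a kernel (float addrspace(1)* %out, i32 %n) the global
// pointer contributes 8 bytes at alignment 8 and the i32 contributes 4 bytes
// at alignment 4, so ExplicitArgBytes == 12 and MaxAlign == 8.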

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
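
// Continuing the example above on a target where the explicit argument offset
// is 0 and there are no implicit arguments, the segment size is
// alignTo(0 + 12, 4) == 12 bytes; runtimes that append implicit arguments
// (e.g. grid sizes) grow this accordingly.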

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}
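
// For example, a kernel using 90 SGPRs runs at occupancy 8 on VI (90 <= 100)
// but only 5 on SI/CI (90 > 80); on GFX10 SGPRs no longer limit occupancy.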

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}
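
// For example, a kernel using 33 VGPRs lands in the "<= 36" bucket and can
// run at most 7 waves per SIMD; trimming it to 32 VGPRs would allow 8.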

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
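
// Illustrative use: clang's __attribute__((amdgpu_num_sgpr(48))) becomes the
// "amdgpu-num-sgpr"="48" function attribute consumed above; a request that
// conflicts with the waves-per-EU bounds or the reserved registers is dropped
// (Requested is reset to 0) rather than honored.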

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}