1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "llvm/Analysis/LoopInfo.h"
20 #include "llvm/Analysis/ValueTracking.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/IR/IRBuilder.h"
23 #include "llvm/IR/PatternMatch.h"
24 #include "llvm/Support/KnownBits.h"
25 
26 using namespace llvm;
27 
28 #define DEBUG_TYPE "AMDGPUtti"
29 
30 static cl::opt<unsigned> UnrollThresholdPrivate(
31  "amdgpu-unroll-threshold-private",
32  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
33  cl::init(2700), cl::Hidden);
34 
35 static cl::opt<unsigned> UnrollThresholdLocal(
36  "amdgpu-unroll-threshold-local",
37  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
38  cl::init(1000), cl::Hidden);
39 
40 static cl::opt<unsigned> UnrollThresholdIf(
41  "amdgpu-unroll-threshold-if",
42  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
43  cl::init(200), cl::Hidden);
44 
45 static cl::opt<bool> UnrollRuntimeLocal(
46  "amdgpu-unroll-runtime-local",
47  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
48  cl::init(true), cl::Hidden);
49 
50 static cl::opt<bool> UseLegacyDA(
51  "amdgpu-use-legacy-divergence-analysis",
52  cl::desc("Enable legacy divergence analysis for AMDGPU"),
53  cl::init(false), cl::Hidden);
54 
55 static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
56  "amdgpu-unroll-max-block-to-analyze",
57  cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
58  cl::init(32), cl::Hidden);
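// Note (illustrative, not part of the original source): these cl::opt flags
// are ordinary LLVM command-line options, so they can be overridden when
// running the optimizer directly, e.g.
//   opt -mtriple=amdgcn-- -O3 -amdgpu-unroll-threshold-private=3000 in.ll -o out.ll
// or passed through the compiler driver with -mllvm. The invocation above is
// only a sketch; the flag spellings come from the cl::opt declarations in
// this file.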
59 
60 static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
61  cl::Hidden, cl::init(4000),
62  cl::desc("Cost of alloca argument"));
63 
64 // If the amount of scratch memory to eliminate exceeds our ability to allocate
65 // it into registers we gain nothing by aggressively inlining functions for that
66 // heuristic.
67 static cl::opt<unsigned>
68  ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
69  cl::init(256),
70  cl::desc("Maximum alloca size to use for inline cost"));
71 
72 // Inliner constraint to achieve reasonable compilation time.
73 static cl::opt<size_t> InlineMaxBB(
74  "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
75  cl::desc("Maximum number of BBs allowed in a function after inlining"
76  " (compile time constraint)"));
77 
78 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
79  unsigned Depth = 0) {
80  const Instruction *I = dyn_cast<Instruction>(Cond);
81  if (!I)
82  return false;
83 
84  for (const Value *V : I->operand_values()) {
85  if (!L->contains(I))
86  continue;
87  if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
88  if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
89  return SubLoop->contains(PHI); }))
90  return true;
91  } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
92  return true;
93  }
94  return false;
95 }
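// Illustrative sketch (not from the original source): dependsOnLocalPhi
// returns true for a conditional branch such as
//   %v = phi i32 [ 0, %preheader ], [ %v.next, %latch ]
//   %c = icmp eq i32 %v, 7
//   br i1 %c, label %then, label %else
// because the condition %c is computed (transitively) from a PHI that belongs
// to this loop and to none of its subloops.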
96 
97 AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
98  : BaseT(TM, F.getParent()->getDataLayout()),
99  TargetTriple(TM->getTargetTriple()),
100  ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
101  TLI(ST->getTargetLowering()) {}
102 
103 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
104  TTI::UnrollingPreferences &UP) {
105  const Function &F = *L->getHeader()->getParent();
106  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
107  UP.MaxCount = std::numeric_limits<unsigned>::max();
108  UP.Partial = true;
109 
110  // A conditional branch in a loop back edge needs 3 additional exec
111  // manipulations on average.
112  UP.BEInsns += 3;
113 
114  // TODO: Do we want runtime unrolling?
115 
116  // Maximum alloca size that can fit in registers. Reserve 16 registers.
117  const unsigned MaxAlloca = (256 - 16) * 4;
118  unsigned ThresholdPrivate = UnrollThresholdPrivate;
119  unsigned ThresholdLocal = UnrollThresholdLocal;
120 
121  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
122  // provided threshold value as the default for Threshold
123  if (MDNode *LoopUnrollThreshold =
124  findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
125  if (LoopUnrollThreshold->getNumOperands() == 2) {
126  ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
127  LoopUnrollThreshold->getOperand(1));
128  if (MetaThresholdValue) {
129  // We will also use the supplied value for PartialThreshold for now.
130  // We may introduce additional metadata if it becomes necessary in the
131  // future.
132  UP.Threshold = MetaThresholdValue->getSExtValue();
133  UP.PartialThreshold = UP.Threshold;
134  ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
135  ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
136  }
137  }
138  }
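  // For reference (illustrative IR, metadata ids are arbitrary), a loop
  // carrying this hint looks like:
  //   br i1 %exit.cond, label %exit, label %header, !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}
  // findOptionMDForLoop() returns !1, and operand 1 supplies the threshold.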
139 
140  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
141  for (const BasicBlock *BB : L->getBlocks()) {
142  const DataLayout &DL = BB->getModule()->getDataLayout();
143  unsigned LocalGEPsSeen = 0;
144 
145  if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
146  return SubLoop->contains(BB); }))
147  continue; // Block belongs to an inner loop.
148 
149  for (const Instruction &I : *BB) {
150  // Unroll a loop which contains an "if" statement whose condition is
151  // defined by a PHI belonging to the loop. This may help to eliminate the
152  // if region and potentially even the PHI itself, saving on both divergence
153  // and the registers used for the PHI.
154  // Add a small bonus for each such "if" statement.
155  if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
156  if (UP.Threshold < MaxBoost && Br->isConditional()) {
157  BasicBlock *Succ0 = Br->getSuccessor(0);
158  BasicBlock *Succ1 = Br->getSuccessor(1);
159  if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
160  (L->contains(Succ1) && L->isLoopExiting(Succ1)))
161  continue;
162  if (dependsOnLocalPhi(L, Br->getCondition())) {
163  UP.Threshold += UnrollThresholdIf;
164  LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
165  << " for loop:\n"
166  << *L << " due to " << *Br << '\n');
167  if (UP.Threshold >= MaxBoost)
168  return;
169  }
170  }
171  continue;
172  }
173 
174  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
175  if (!GEP)
176  continue;
177 
178  unsigned AS = GEP->getAddressSpace();
179  unsigned Threshold = 0;
180  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
181  Threshold = ThresholdPrivate;
182  else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
183  Threshold = ThresholdLocal;
184  else
185  continue;
186 
187  if (UP.Threshold >= Threshold)
188  continue;
189 
190  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
191  const Value *Ptr = GEP->getPointerOperand();
192  const AllocaInst *Alloca =
193  dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
194  if (!Alloca || !Alloca->isStaticAlloca())
195  continue;
196  Type *Ty = Alloca->getAllocatedType();
197  unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
198  if (AllocaSize > MaxAlloca)
199  continue;
200  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
201  AS == AMDGPUAS::REGION_ADDRESS) {
202  LocalGEPsSeen++;
203  // Inhibit unrolling for local memory if we have seen addressing that is
204  // not based on a variable; most likely we will be unable to combine it.
205  // Do not unroll too-deep inner loops for local memory, to give an outer
206  // loop a chance to be unrolled for a more important reason.
207  if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
208  (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
209  !isa<Argument>(GEP->getPointerOperand())))
210  continue;
211  LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
212  << *L << " due to LDS use.\n");
213  UP.Runtime = UnrollRuntimeLocal;
214  }
215 
216  // Check if GEP depends on a value defined by this loop itself.
217  bool HasLoopDef = false;
218  for (const Value *Op : GEP->operands()) {
219  const Instruction *Inst = dyn_cast<Instruction>(Op);
220  if (!Inst || L->isLoopInvariant(Op))
221  continue;
222 
223  if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
224  return SubLoop->contains(Inst); }))
225  continue;
226  HasLoopDef = true;
227  break;
228  }
229  if (!HasLoopDef)
230  continue;
231 
232  // We want to do whatever we can to limit the number of alloca
233  // instructions that make it through to the code generator. allocas
234  // require us to use indirect addressing, which is slow and prone to
235  // compiler bugs. If this loop does an address calculation on an
236  // alloca ptr, then we want to use a higher than normal loop unroll
237  // threshold. This will give SROA a better chance to eliminate these
238  // allocas.
239  //
240  // We also want to have more unrolling for local memory to let ds
241  // instructions with different offsets combine.
242  //
243  // Don't use the maximum allowed value here as it will make some
244  // programs way too big.
245  UP.Threshold = Threshold;
246  LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
247  << " for loop:\n"
248  << *L << " due to " << *GEP << '\n');
249  if (UP.Threshold >= MaxBoost)
250  return;
251  }
252 
253  // If we got a GEP in a small BB from an inner loop then increase the max
254  // trip count to analyze, for a better cost estimate in the unroller.
255  if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
256  UP.MaxIterationsCountToAnalyze = 32;
257  }
258 }
259 
260 void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
261  TTI::PeelingPreferences &PP) {
262  BaseT::getPeelingPreferences(L, SE, PP);
263 }
264 
265 const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
266  // Codegen control options which don't matter.
267  AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
268  AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
269  AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
270  AMDGPU::FeatureUnalignedAccessMode,
271 
272  AMDGPU::FeatureAutoWaitcntBeforeBarrier,
273 
274  // Property of the kernel/environment which can't actually differ.
275  AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
276  AMDGPU::FeatureTrapHandler,
277 
278  // The default assumption needs to be ecc is enabled, but no directly
279  // exposed operations depend on it, so it can be safely inlined.
280  AMDGPU::FeatureSRAMECC,
281 
282  // Perf-tuning features
283  AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
284 
285 GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
286  : BaseT(TM, F.getParent()->getDataLayout()),
287  ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
288  TLI(ST->getTargetLowering()), CommonTTI(TM, F),
289  IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
290  MaxVGPRs(ST->getMaxNumVGPRs(
291  std::max(ST->getWavesPerEU(F).first,
292  ST->getWavesPerEUForWorkGroup(
293  ST->getFlatWorkGroupSizes(F).second)))) {
294  AMDGPU::SIModeRegisterDefaults Mode(F);
295  HasFP32Denormals = Mode.allFP32Denormals();
296  HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
297 }
298 
299 unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
300  // The concept of vector registers doesn't really exist. Some packed vector
301  // operations operate on the normal 32-bit registers.
302  return MaxVGPRs;
303 }
304 
305 unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
306  // This is really the number of registers to fill when vectorizing /
307  // interleaving loops, so we lie to avoid trying to use all registers.
308  return getHardwareNumberOfRegisters(Vec) >> 3;
309 }
310 
311 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
312  const SIRegisterInfo *TRI = ST->getRegisterInfo();
313  const TargetRegisterClass *RC = TRI->getRegClass(RCID);
314  unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
315  return getHardwareNumberOfRegisters(false) / NumVGPRs;
316 }
317 
318 TypeSize
319 GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
320  switch (K) {
321  case TargetTransformInfo::RGK_Scalar:
322  return TypeSize::getFixed(32);
323  case TargetTransformInfo::RGK_FixedWidthVector:
324  return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
325  case TargetTransformInfo::RGK_ScalableVector:
326  return TypeSize::getScalable(0);
327  }
328  llvm_unreachable("Unsupported register kind");
329 }
330 
331 unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
332  return 32;
333 }
334 
335 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
336  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
337  return 32 * 4 / ElemWidth;
338  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
339  : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
340  : 1;
341 }
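// Worked example (not in the original source): for a load or store of i8
// elements this reports 32 * 4 / 8 = 16, while 16-bit arithmetic on subtargets
// with 16-bit instructions (and 32-bit arithmetic with packed FP32 ops) caps
// the VF at 2.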
342 
343 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
344  unsigned ChainSizeInBytes,
345  VectorType *VecTy) const {
346  unsigned VecRegBitWidth = VF * LoadSize;
347  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
348  // TODO: Support element-size less than 32bit?
349  return 128 / LoadSize;
350 
351  return VF;
352 }
353 
354 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
355  unsigned ChainSizeInBytes,
356  VectorType *VecTy) const {
357  unsigned VecRegBitWidth = VF * StoreSize;
358  if (VecRegBitWidth > 128)
359  return 128 / StoreSize;
360 
361  return VF;
362 }
363 
364 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
365  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
366  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
367  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
368  AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
369  return 512;
370  }
371 
372  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
373  return 8 * ST->getMaxPrivateElementSize();
374 
375  // Common to flat, global, local and region. Assume for unknown addrspace.
376  return 128;
377 }
378 
379 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
380  Align Alignment,
381  unsigned AddrSpace) const {
382  // We allow vectorization of flat stores, even though we may need to decompose
383  // them later if they may access private memory. We don't have enough context
384  // here, and legalization can handle it.
385  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
386  return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
387  ChainSizeInBytes <= ST->getMaxPrivateElementSize();
388  }
389  return true;
390 }
391 
392 bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
393  Align Alignment,
394  unsigned AddrSpace) const {
395  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
396 }
397 
398 bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
399  Align Alignment,
400  unsigned AddrSpace) const {
401  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
402 }
403 
404 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
405 // iteration. Should we report a larger size and let it legalize?
406 //
407 // FIXME: Should we use narrower types for local/region, or account for when
408 // unaligned access is legal?
409 //
410 // FIXME: This could use fine tuning and microbenchmarks.
411 Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
412  unsigned SrcAddrSpace,
413  unsigned DestAddrSpace,
414  unsigned SrcAlign,
415  unsigned DestAlign) const {
416  unsigned MinAlign = std::min(SrcAlign, DestAlign);
417 
418  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
419  // hardware into byte accesses. If you assume all alignments are equally
420  // probable, it's more efficient on average to use short accesses for this
421  // case.
422  if (MinAlign == 2)
423  return Type::getInt16Ty(Context);
424 
425  // Not all subtargets have 128-bit DS instructions, and we currently don't
426  // form them by default.
427  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
428  SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
429  DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
430  DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
431  return FixedVectorType::get(Type::getInt32Ty(Context), 2);
432  }
433 
434  // Global memory works best with 16-byte accesses. Private memory will also
435  // hit this, although they'll be decomposed.
436  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
437 }
438 
439 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
440  SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
441  unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
442  unsigned SrcAlign, unsigned DestAlign) const {
443  assert(RemainingBytes < 16);
444 
445  unsigned MinAlign = std::min(SrcAlign, DestAlign);
446 
447  if (MinAlign != 2) {
448  Type *I64Ty = Type::getInt64Ty(Context);
449  while (RemainingBytes >= 8) {
450  OpsOut.push_back(I64Ty);
451  RemainingBytes -= 8;
452  }
453 
454  Type *I32Ty = Type::getInt32Ty(Context);
455  while (RemainingBytes >= 4) {
456  OpsOut.push_back(I32Ty);
457  RemainingBytes -= 4;
458  }
459  }
460 
461  Type *I16Ty = Type::getInt16Ty(Context);
462  while (RemainingBytes >= 2) {
463  OpsOut.push_back(I16Ty);
464  RemainingBytes -= 2;
465  }
466 
467  Type *I8Ty = Type::getInt8Ty(Context);
468  while (RemainingBytes) {
469  OpsOut.push_back(I8Ty);
470  --RemainingBytes;
471  }
472 }
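// Worked example (illustrative): with RemainingBytes == 15 and a minimum
// alignment other than 2, the residual is emitted as i64 + i32 + i16 + i8
// (8 + 4 + 2 + 1 bytes); with a 2-byte minimum alignment it degrades to seven
// i16 accesses plus one i8.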
473 
474 unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
475  // Disable unrolling if the loop is not vectorized.
476  // TODO: Enable this again.
477  if (VF == 1)
478  return 1;
479 
480  return 8;
481 }
482 
483 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
484  MemIntrinsicInfo &Info) const {
485  switch (Inst->getIntrinsicID()) {
486  case Intrinsic::amdgcn_atomic_inc:
487  case Intrinsic::amdgcn_atomic_dec:
488  case Intrinsic::amdgcn_ds_ordered_add:
489  case Intrinsic::amdgcn_ds_ordered_swap:
490  case Intrinsic::amdgcn_ds_fadd:
491  case Intrinsic::amdgcn_ds_fmin:
492  case Intrinsic::amdgcn_ds_fmax: {
493  auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
494  auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
495  if (!Ordering || !Volatile)
496  return false; // Invalid.
497 
498  unsigned OrderingVal = Ordering->getZExtValue();
499  if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
500  return false;
501 
502  Info.PtrVal = Inst->getArgOperand(0);
503  Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
504  Info.ReadMem = true;
505  Info.WriteMem = true;
506  Info.IsVolatile = !Volatile->isNullValue();
507  return true;
508  }
509  default:
510  return false;
511  }
512 }
513 
514 InstructionCost GCNTTIImpl::getArithmeticInstrCost(
515  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
516  TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
517  TTI::OperandValueProperties Opd1PropInfo,
518  TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
519  const Instruction *CxtI) {
520  EVT OrigTy = TLI->getValueType(DL, Ty);
521  if (!OrigTy.isSimple()) {
522  // FIXME: We're having to query the throughput cost so that the basic
523  // implementation tries to generate legalize and scalarization costs. Maybe
524  // we could hoist the scalarization code here?
525  if (CostKind != TTI::TCK_CodeSize)
526  return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
527  Opd1Info, Opd2Info, Opd1PropInfo,
528  Opd2PropInfo, Args, CxtI);
529  // Scalarization
530 
531  // Check if any of the operands are vector operands.
532  int ISD = TLI->InstructionOpcodeToISD(Opcode);
533  assert(ISD && "Invalid opcode");
534 
535  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
536 
537  bool IsFloat = Ty->isFPOrFPVectorTy();
538  // Assume that floating point arithmetic operations cost twice as much as
539  // integer operations.
540  unsigned OpCost = (IsFloat ? 2 : 1);
541 
542  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
543  // The operation is legal. Assume it costs 1.
544  // TODO: Once we have extract/insert subvector cost we need to use them.
545  return LT.first * OpCost;
546  }
547 
548  if (!TLI->isOperationExpand(ISD, LT.second)) {
549  // If the operation is custom lowered, then assume that the code is twice
550  // as expensive.
551  return LT.first * 2 * OpCost;
552  }
553 
554  // Else, assume that we need to scalarize this op.
555  // TODO: If one of the types get legalized by splitting, handle this
556  // similarly to what getCastInstrCost() does.
557  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
558  unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
559  InstructionCost Cost = getArithmeticInstrCost(
560  Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
561  Opd1PropInfo, Opd2PropInfo, Args, CxtI);
562  // Return the cost of multiple scalar invocation plus the cost of
563  // inserting and extracting the values.
564  SmallVector<Type *> Tys(Args.size(), Ty);
565  return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
566  }
567 
568  // We don't know anything about this scalar instruction.
569  return OpCost;
570  }
571 
572  // Legalize the type.
573  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
574  int ISD = TLI->InstructionOpcodeToISD(Opcode);
575 
576  // Because we don't have any legal vector operations, but the legal types, we
577  // need to account for split vectors.
578  unsigned NElts = LT.second.isVector() ?
579  LT.second.getVectorNumElements() : 1;
580 
581  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
582 
583  switch (ISD) {
584  case ISD::SHL:
585  case ISD::SRL:
586  case ISD::SRA:
587  if (SLT == MVT::i64)
588  return get64BitInstrCost(CostKind) * LT.first * NElts;
589 
590  if (ST->has16BitInsts() && SLT == MVT::i16)
591  NElts = (NElts + 1) / 2;
592 
593  // i32
594  return getFullRateInstrCost() * LT.first * NElts;
595  case ISD::ADD:
596  case ISD::SUB:
597  case ISD::AND:
598  case ISD::OR:
599  case ISD::XOR:
600  if (SLT == MVT::i64) {
601  // and, or and xor are typically split into 2 VALU instructions.
602  return 2 * getFullRateInstrCost() * LT.first * NElts;
603  }
604 
605  if (ST->has16BitInsts() && SLT == MVT::i16)
606  NElts = (NElts + 1) / 2;
607 
608  return LT.first * NElts * getFullRateInstrCost();
609  case ISD::MUL: {
610  const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
611  if (SLT == MVT::i64) {
612  const int FullRateCost = getFullRateInstrCost();
613  return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
614  }
615 
616  if (ST->has16BitInsts() && SLT == MVT::i16)
617  NElts = (NElts + 1) / 2;
618 
619  // i32
620  return QuarterRateCost * NElts * LT.first;
621  }
622  case ISD::FMUL:
623  // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
624  // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
625  // fused operation.
626  if (CxtI && CxtI->hasOneUse())
627  if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
628  const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
629  if (OPC == ISD::FADD || OPC == ISD::FSUB) {
630  if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
631  return TargetTransformInfo::TCC_Free;
632  if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
633  return TargetTransformInfo::TCC_Free;
634 
635  // Estimate all types may be fused with contract/unsafe flags
636  const TargetOptions &Options = TLI->getTargetMachine().Options;
637  if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
638  Options.UnsafeFPMath ||
639  (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
640  return TargetTransformInfo::TCC_Free;
641  }
642  }
643  LLVM_FALLTHROUGH;
644  case ISD::FADD:
645  case ISD::FSUB:
646  if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
647  NElts = (NElts + 1) / 2;
648  if (SLT == MVT::f64)
649  return LT.first * NElts * get64BitInstrCost(CostKind);
650 
651  if (ST->has16BitInsts() && SLT == MVT::f16)
652  NElts = (NElts + 1) / 2;
653 
654  if (SLT == MVT::f32 || SLT == MVT::f16)
655  return LT.first * NElts * getFullRateInstrCost();
656  break;
657  case ISD::FDIV:
658  case ISD::FREM:
659  // FIXME: frem should be handled separately. The fdiv in it is most of it,
660  // but the current lowering is also not entirely correct.
661  if (SLT == MVT::f64) {
662  int Cost = 7 * get64BitInstrCost(CostKind) +
663  getQuarterRateInstrCost(CostKind) +
664  3 * getHalfRateInstrCost(CostKind);
665  // Add cost of workaround.
666  if (!ST->hasUsableDivScaleConditionOutput())
667  Cost += 3 * getFullRateInstrCost();
668 
669  return LT.first * Cost * NElts;
670  }
671 
672  if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
673  // TODO: This is more complicated, unsafe flags etc.
674  if ((SLT == MVT::f32 && !HasFP32Denormals) ||
675  (SLT == MVT::f16 && ST->has16BitInsts())) {
676  return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
677  }
678  }
679 
680  if (SLT == MVT::f16 && ST->has16BitInsts()) {
681  // 2 x v_cvt_f32_f16
682  // f32 rcp
683  // f32 fmul
684  // v_cvt_f16_f32
685  // f16 div_fixup
686  int Cost =
687  4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
688  return LT.first * Cost * NElts;
689  }
690 
691  if (SLT == MVT::f32 || SLT == MVT::f16) {
692  // 4 more v_cvt_* insts without f16 insts support
693  int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
694  1 * getQuarterRateInstrCost(CostKind);
695 
696  if (!HasFP32Denormals) {
697  // FP mode switches.
698  Cost += 2 * getFullRateInstrCost();
699  }
700 
701  return LT.first * NElts * Cost;
702  }
703  break;
704  case ISD::FNEG:
705  // Use the backend's estimation. If fneg is not free, each element will
706  // cost one additional instruction.
707  return TLI->isFNegFree(SLT) ? 0 : NElts;
708  default:
709  break;
710  }
711 
712  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
713  Opd1PropInfo, Opd2PropInfo, Args, CxtI);
714 }
715 
716 // Return true if there's a potential benefit from using v2f16/v2i16
717 // instructions for an intrinsic, even if it requires nontrivial legalization.
718 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
719  switch (ID) {
720  case Intrinsic::fma: // TODO: fmuladd
721  // There's a small benefit to using vector ops in the legalized code.
722  case Intrinsic::round:
723  case Intrinsic::uadd_sat:
724  case Intrinsic::usub_sat:
725  case Intrinsic::sadd_sat:
726  case Intrinsic::ssub_sat:
727  return true;
728  default:
729  return false;
730  }
731 }
732 
733 InstructionCost
734 GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
735  TTI::TargetCostKind CostKind) {
736  if (ICA.getID() == Intrinsic::fabs)
737  return 0;
738 
739  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
740  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
741 
742  Type *RetTy = ICA.getReturnType();
743  EVT OrigTy = TLI->getValueType(DL, RetTy);
744  if (!OrigTy.isSimple()) {
745  if (CostKind != TTI::TCK_CodeSize)
746  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
747 
748  // TODO: Combine these two logic paths.
749  if (ICA.isTypeBasedOnly())
750  return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
751 
752  unsigned RetVF =
753  (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
754  : 1);
755  const IntrinsicInst *I = ICA.getInst();
756  const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
757  FastMathFlags FMF = ICA.getFlags();
758  // Assume that we need to scalarize this intrinsic.
759 
760  // Compute the scalarization overhead based on Args for a vector
761  // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
762  // CostModel will pass a vector RetTy and VF is 1.
763  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
764  if (RetVF > 1) {
765  ScalarizationCost = 0;
766  if (!RetTy->isVoidTy())
767  ScalarizationCost +=
768  getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
769  ScalarizationCost +=
770  getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
771  }
772 
773  IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
774  ScalarizationCost);
775  return getIntrinsicInstrCost(Attrs, CostKind);
776  }
777 
778  // Legalize the type.
779  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
780 
781  unsigned NElts = LT.second.isVector() ?
782  LT.second.getVectorNumElements() : 1;
783 
784  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
785 
786  if (SLT == MVT::f64)
787  return LT.first * NElts * get64BitInstrCost(CostKind);
788 
789  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
790  (ST->hasPackedFP32Ops() && SLT == MVT::f32))
791  NElts = (NElts + 1) / 2;
792 
793  // TODO: Get more refined intrinsic costs?
794  unsigned InstRate = getQuarterRateInstrCost(CostKind);
795 
796  switch (ICA.getID()) {
797  case Intrinsic::fma:
798  InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
799  : getQuarterRateInstrCost(CostKind);
800  break;
801  case Intrinsic::uadd_sat:
802  case Intrinsic::usub_sat:
803  case Intrinsic::sadd_sat:
804  case Intrinsic::ssub_sat:
805  static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
806  if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
807  NElts = 1;
808  break;
809  }
810 
811  return LT.first * NElts * InstRate;
812 }
813 
814 InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
815  TTI::TargetCostKind CostKind,
816  const Instruction *I) {
817  assert((I == nullptr || I->getOpcode() == Opcode) &&
818  "Opcode should reflect passed instruction.");
819  const bool SCost =
820  (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
821  const int CBrCost = SCost ? 5 : 7;
822  switch (Opcode) {
823  case Instruction::Br: {
824  // Branch instruction takes about 4 slots on gfx900.
825  auto BI = dyn_cast_or_null<BranchInst>(I);
826  if (BI && BI->isUnconditional())
827  return SCost ? 1 : 4;
828  // Assume a conditional branch takes 3 additional exec manipulation
829  // instructions on average.
830  return CBrCost;
831  }
832  case Instruction::Switch: {
833  auto SI = dyn_cast_or_null<SwitchInst>(I);
834  // Each case (including default) takes 1 cmp + 1 cbr instruction on
835  // average.
836  return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
837  }
838  case Instruction::Ret:
839  return SCost ? 1 : 10;
840  case Instruction::PHI:
841  // TODO: 1. A prediction phi won't be eliminated?
842  // 2. Estimate data copy instructions in this case.
843  return 1;
844  }
845  return BaseT::getCFInstrCost(Opcode, CostKind, I);
846 }
847 
848 InstructionCost
849 GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
850  bool IsPairwise,
851  TTI::TargetCostKind CostKind) {
852  EVT OrigTy = TLI->getValueType(DL, Ty);
853 
854  // Computes cost on targets that have packed math instructions (which
855  // support 16-bit types only).
856  if (IsPairwise ||
857  !ST->hasVOP3PInsts() ||
858  OrigTy.getScalarSizeInBits() != 16)
859  return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);
860 
861  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
862  return LT.first * getFullRateInstrCost();
863 }
864 
865 InstructionCost
866 GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
867  bool IsPairwise, bool IsUnsigned,
868  TTI::TargetCostKind CostKind) {
869  EVT OrigTy = TLI->getValueType(DL, Ty);
870 
871  // Computes cost on targets that have packed math instructions (which
872  // support 16-bit types only).
873  if (IsPairwise ||
874  !ST->hasVOP3PInsts() ||
875  OrigTy.getScalarSizeInBits() != 16)
876  return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
877  CostKind);
878 
879  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
880  return LT.first * getHalfRateInstrCost(CostKind);
881 }
882 
883 InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
884  unsigned Index) {
885  switch (Opcode) {
886  case Instruction::ExtractElement:
887  case Instruction::InsertElement: {
888  unsigned EltSize
889  = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
890  if (EltSize < 32) {
891  if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
892  return 0;
893  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
894  }
895 
896  // Extracts are just reads of a subregister, so are free. Inserts are
897  // considered free because we don't want to have any cost for scalarizing
898  // operations, and we don't have to copy into a different register class.
899 
900  // Dynamic indexing isn't free and is best avoided.
901  return Index == ~0u ? 2 : 0;
902  }
903  default:
904  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
905  }
906 }
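// Worked examples (illustrative): extracting element 0 of a <4 x i16> is free
// on subtargets with 16-bit instructions; any extract/insert of a 32-bit (or
// wider) element is free for a known index and costs 2 when the index is
// dynamic (Index == ~0u).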
907 
908 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
909 /// this is analyzing the collective result of all output registers. Otherwise,
910 /// this is only querying a specific result index if this returns multiple
911 /// registers in a struct.
912 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
913  const CallInst *CI, ArrayRef<unsigned> Indices) const {
914  // TODO: Handle complex extract indices
915  if (Indices.size() > 1)
916  return true;
917 
918  const DataLayout &DL = CI->getModule()->getDataLayout();
919  const SIRegisterInfo *TRI = ST->getRegisterInfo();
920  TargetLowering::AsmOperandInfoVector TargetConstraints =
921  TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
922 
923  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
924 
925  int OutputIdx = 0;
926  for (auto &TC : TargetConstraints) {
927  if (TC.Type != InlineAsm::isOutput)
928  continue;
929 
930  // Skip outputs we don't care about.
931  if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
932  continue;
933 
934  TLI->ComputeConstraintToUse(TC, SDValue());
935 
936  Register AssignedReg;
937  const TargetRegisterClass *RC;
938  std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
939  TRI, TC.ConstraintCode, TC.ConstraintVT);
940  if (AssignedReg) {
941  // FIXME: This is a workaround for getRegForInlineAsmConstraint
942  // returning VS_32
943  RC = TRI->getPhysRegClass(AssignedReg);
944  }
945 
946  // For AGPR constraints null is returned on subtargets without AGPRs, so
947  // assume divergent for null.
948  if (!RC || !TRI->isSGPRClass(RC))
949  return true;
950  }
951 
952  return false;
953 }
954 
955 /// \returns true if the new GPU divergence analysis is enabled.
956 bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
957  return !UseLegacyDA;
958 }
959 
960 /// \returns true if the result of the value could potentially be
961 /// different across workitems in a wavefront.
962 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
963  if (const Argument *A = dyn_cast<Argument>(V))
964  return !AMDGPU::isArgPassedInSGPR(A);
965 
966  // Loads from the private and flat address spaces are divergent, because
967  // threads can execute the load instruction with the same inputs and get
968  // different results.
969  //
970  // All other loads are not divergent, because if threads issue loads with the
971  // same arguments, they will always get the same result.
972  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
973  return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
974  Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
975 
976  // Atomics are divergent because they are executed sequentially: when an
977  // atomic operation refers to the same address in each thread, then each
978  // thread after the first sees the value written by the previous thread as
979  // original value.
980  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
981  return true;
982 
983  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
984  return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
985 
986  // Assume all function calls are a source of divergence.
987  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
988  if (CI->isInlineAsm())
989  return isInlineAsmSourceOfDivergence(CI);
990  return true;
991  }
992 
993  // Assume all function calls are a source of divergence.
994  if (isa<InvokeInst>(V))
995  return true;
996 
997  return false;
998 }
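// Illustrative examples (not from the original source): a load through a flat
// or private pointer, an atomicrmw result, and intrinsics such as
// llvm.amdgcn.workitem.id.x are all reported divergent here, while a load from
// global or constant memory with uniform operands is not.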
999 
1000 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1001  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
1002  switch (Intrinsic->getIntrinsicID()) {
1003  default:
1004  return false;
1005  case Intrinsic::amdgcn_readfirstlane:
1006  case Intrinsic::amdgcn_readlane:
1007  case Intrinsic::amdgcn_icmp:
1008  case Intrinsic::amdgcn_fcmp:
1009  case Intrinsic::amdgcn_ballot:
1010  case Intrinsic::amdgcn_if_break:
1011  return true;
1012  }
1013  }
1014 
1015  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1016  if (CI->isInlineAsm())
1017  return !isInlineAsmSourceOfDivergence(CI);
1018  return false;
1019  }
1020 
1021  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1022  if (!ExtValue)
1023  return false;
1024 
1025  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1026  if (!CI)
1027  return false;
1028 
1029  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1030  switch (Intrinsic->getIntrinsicID()) {
1031  default:
1032  return false;
1033  case Intrinsic::amdgcn_if:
1034  case Intrinsic::amdgcn_else: {
1035  ArrayRef<unsigned> Indices = ExtValue->getIndices();
1036  return Indices.size() == 1 && Indices[0] == 1;
1037  }
1038  }
1039  }
1040 
1041  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1042  // divergent for the overall struct return. We need to override it in the
1043  // case we're extracting an SGPR component here.
1044  if (CI->isInlineAsm())
1045  return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1046 
1047  return false;
1048 }
1049 
1050 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1051  Intrinsic::ID IID) const {
1052  switch (IID) {
1053  case Intrinsic::amdgcn_atomic_inc:
1054  case Intrinsic::amdgcn_atomic_dec:
1055  case Intrinsic::amdgcn_ds_fadd:
1056  case Intrinsic::amdgcn_ds_fmin:
1057  case Intrinsic::amdgcn_ds_fmax:
1058  case Intrinsic::amdgcn_is_shared:
1059  case Intrinsic::amdgcn_is_private:
1060  OpIndexes.push_back(0);
1061  return true;
1062  default:
1063  return false;
1064  }
1065 }
1066 
1067 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1068  Value *OldV,
1069  Value *NewV) const {
1070  auto IntrID = II->getIntrinsicID();
1071  switch (IntrID) {
1072  case Intrinsic::amdgcn_atomic_inc:
1073  case Intrinsic::amdgcn_atomic_dec:
1074  case Intrinsic::amdgcn_ds_fadd:
1075  case Intrinsic::amdgcn_ds_fmin:
1076  case Intrinsic::amdgcn_ds_fmax: {
1077  const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
1078  if (!IsVolatile->isZero())
1079  return nullptr;
1080  Module *M = II->getParent()->getParent()->getParent();
1081  Type *DestTy = II->getType();
1082  Type *SrcTy = NewV->getType();
1083  Function *NewDecl =
1084  Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
1085  II->setArgOperand(0, NewV);
1086  II->setCalledFunction(NewDecl);
1087  return II;
1088  }
1089  case Intrinsic::amdgcn_is_shared:
1090  case Intrinsic::amdgcn_is_private: {
1091  unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1092  AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1093  unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1094  LLVMContext &Ctx = NewV->getType()->getContext();
1095  ConstantInt *NewVal = (TrueAS == NewAS) ?
1096  ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1097  return NewVal;
1098  }
1099  case Intrinsic::ptrmask: {
1100  unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1101  unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1102  Value *MaskOp = II->getArgOperand(1);
1103  Type *MaskTy = MaskOp->getType();
1104 
1105  bool DoTruncate = false;
1106 
1107  const GCNTargetMachine &TM =
1108  static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1109  if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1110  // All valid 64-bit to 32-bit casts work by chopping off the high
1111  // bits. Any masking only clearing the low bits will also apply in the new
1112  // address space.
1113  if (DL.getPointerSizeInBits(OldAS) != 64 ||
1114  DL.getPointerSizeInBits(NewAS) != 32)
1115  return nullptr;
1116 
1117  // TODO: Do we need to thread more context in here?
1118  KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
1119  if (Known.countMinLeadingOnes() < 32)
1120  return nullptr;
1121 
1122  DoTruncate = true;
1123  }
1124 
1125  IRBuilder<> B(II);
1126  if (DoTruncate) {
1127  MaskTy = B.getInt32Ty();
1128  MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1129  }
1130 
1131  return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1132  {NewV, MaskOp});
1133  }
1134  default:
1135  return nullptr;
1136  }
1137 }
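// Illustrative example (a sketch of how this hook is typically exercised,
// e.g. by InferAddressSpaces): when the operand of llvm.amdgcn.is.shared is
// proven to be a local (LDS) pointer, the call folds to 'true' above; a
// llvm.ptrmask on a flat (64-bit) pointer being rewritten to a 32-bit address
// space keeps its mask only if the top 32 bits are known ones, and the mask is
// then truncated to i32.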
1138 
1139 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1140  VectorType *VT, ArrayRef<int> Mask,
1141  int Index, VectorType *SubTp) {
1142  Kind = improveShuffleKindFromMask(Kind, Mask);
1143  if (ST->hasVOP3PInsts()) {
1144  if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1145  DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1146  // With op_sel VOP3P instructions freely can access the low half or high
1147  // half of a register, so any swizzle is free.
1148 
1149  switch (Kind) {
1150  case TTI::SK_Broadcast:
1151  case TTI::SK_Reverse:
1152  case TTI::SK_PermuteSingleSrc:
1153  return 0;
1154  default:
1155  break;
1156  }
1157  }
1158  }
1159 
1160  return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp);
1161 }
1162 
1163 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1164  const Function *Callee) const {
1165  const TargetMachine &TM = getTLI()->getTargetMachine();
1166  const GCNSubtarget *CallerST
1167  = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1168  const GCNSubtarget *CalleeST
1169  = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1170 
1171  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1172  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1173 
1174  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1175  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1176  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1177  return false;
1178 
1179  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1180  // no way to support merge for backend defined attributes.
1181  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
1182  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
1183  if (!CallerMode.isInlineCompatible(CalleeMode))
1184  return false;
1185 
1186  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1187  Callee->hasFnAttribute(Attribute::InlineHint))
1188  return true;
1189 
1190  // Hack to make compile times reasonable.
1191  if (InlineMaxBB) {
1192  // Single BB does not increase total BB amount.
1193  if (Callee->size() == 1)
1194  return true;
1195  size_t BBSize = Caller->size() + Callee->size() - 1;
1196  return BBSize <= InlineMaxBB;
1197  }
1198 
1199  return true;
1200 }
1201 
1202 unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1203  // If we have a pointer to private array passed into a function
1204  // it will not be optimized out, leaving scratch usage.
1205  // Increase the inline threshold to allow inlining in this case.
1206  uint64_t AllocaSize = 0;
1207  SmallPtrSet<const AllocaInst *, 8> AIVisited;
1208  for (Value *PtrArg : CB->args()) {
1209  PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1210  if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
1211  Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
1212  continue;
1213 
1214  PtrArg = getUnderlyingObject(PtrArg);
1215  if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
1216  if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1217  continue;
1218  AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1219  // If the amount of stack memory is excessive we will not be able
1220  // to get rid of the scratch anyway, bail out.
1221  if (AllocaSize > ArgAllocaCutoff) {
1222  AllocaSize = 0;
1223  break;
1224  }
1225  }
1226  }
1227  if (AllocaSize)
1228  return ArgAllocaCost;
1229  return 0;
1230 }
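// Worked example (with the default flag values, illustrative): a call site
// that passes a pointer into a 64-byte static private alloca gets an extra
// ArgAllocaCost (4000) added to its inline threshold; once the combined alloca
// size passed to the callee exceeds ArgAllocaCutoff (256 bytes), no bonus is
// applied.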
1231 
1232 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1233  TTI::UnrollingPreferences &UP) {
1234  CommonTTI.getUnrollingPreferences(L, SE, UP);
1235 }
1236 
1237 void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1238  TTI::PeelingPreferences &PP) {
1239  CommonTTI.getPeelingPreferences(L, SE, PP);
1240 }
1241 
1242 int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1243  return ST->hasFullRate64Ops()
1244  ? getFullRateInstrCost()
1245  : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1246  : getQuarterRateInstrCost(CostKind);
1247 }
1248 
1249 R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
1250  : BaseT(TM, F.getParent()->getDataLayout()),
1251  ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
1252  TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}
1253 
1254 unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
1255  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
1256 }
1257 
1258 unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
1259  return getHardwareNumberOfRegisters(Vec);
1260 }
1261 
1262 TypeSize
1263 R600TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
1264  return TypeSize::getFixed(32);
1265 }
1266 
1267 unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
1268  return 32;
1269 }
1270 
1271 unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
1272  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
1273  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
1274  return 128;
1275  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1276  AddrSpace == AMDGPUAS::REGION_ADDRESS)
1277  return 64;
1278  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
1279  return 32;
1280 
1281  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
1282  AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
1283  (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
1284  AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
1285  return 128;
1286  llvm_unreachable("unhandled address space");
1287 }
1288 
1289 bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
1290  Align Alignment,
1291  unsigned AddrSpace) const {
1292  // We allow vectorization of flat stores, even though we may need to decompose
1293  // them later if they may access private memory. We don't have enough context
1294  // here, and legalization can handle it.
1295  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
1296 }
1297 
1298 bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1299  Align Alignment,
1300  unsigned AddrSpace) const {
1301  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1302 }
1303 
1304 bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1305  Align Alignment,
1306  unsigned AddrSpace) const {
1307  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1308 }
1309 
1310 unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
1311  // Disable unrolling if the loop is not vectorized.
1312  // TODO: Enable this again.
1313  if (VF == 1)
1314  return 1;
1315 
1316  return 8;
1317 }
1318 
1319 InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,
1320  TTI::TargetCostKind CostKind,
1321  const Instruction *I) {
1322  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
1323  return Opcode == Instruction::PHI ? 0 : 1;
1324 
1325  // XXX - For some reason this isn't called for switch.
1326  switch (Opcode) {
1327  case Instruction::Br:
1328  case Instruction::Ret:
1329  return 10;
1330  default:
1331  return BaseT::getCFInstrCost(Opcode, CostKind, I);
1332  }
1333 }
1334 
1335 InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
1336  unsigned Index) {
1337  switch (Opcode) {
1338  case Instruction::ExtractElement:
1339  case Instruction::InsertElement: {
1340  unsigned EltSize
1341  = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1342  if (EltSize < 32) {
1343  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1344  }
1345 
1346  // Extracts are just reads of a subregister, so are free. Inserts are
1347  // considered free because we don't want to have any cost for scalarizing
1348  // operations, and we don't have to copy into a different register class.
1349 
1350  // Dynamic indexing isn't free and is best avoided.
1351  return Index == ~0u ? 2 : 0;
1352  }
1353  default:
1354  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1355  }
1356 }
1357 
1358 void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1359  TTI::UnrollingPreferences &UP) {
1360  CommonTTI.getUnrollingPreferences(L, SE, UP);
1361 }
1362 
1363 void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1364  TTI::PeelingPreferences &PP) {
1365  CommonTTI.getPeelingPreferences(L, SE, PP);
1366 }
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:233
UseLegacyDA
static cl::opt< bool > UseLegacyDA("amdgpu-use-legacy-divergence-analysis", cl::desc("Enable legacy divergence analysis for AMDGPU"), cl::init(false), cl::Hidden)
llvm::InstructionCost
Definition: InstructionCost.h:26
llvm::R600TTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:1319
llvm::TargetTransformInfo::UnrollingPreferences::BEInsns
unsigned BEInsns
Definition: TargetTransformInfo.h:473
llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:29
Attrs
Function Attrs
Definition: README_ALTIVEC.txt:215
llvm::Type::isSized
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:272
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:38
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:480
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:210
llvm::Loop::isLoopInvariant
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:64
llvm::BasicTTIImplBase< GCNTTIImpl >::getOperandsScalarizationOverhead
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:696
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:448
llvm
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:66
llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1496
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:112
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:623
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:435
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1329
llvm::AMDGPUAS::PARAM_I_ADDRESS
@ PARAM_I_ADDRESS
Address space for indirect addressible parameter memory (VTX1).
Definition: AMDGPU.h:374
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:464
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
llvm::TargetOptions
Definition: TargetOptions.h:113
llvm::Function
Definition: Function.h:61
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:122
llvm::AMDGPUTargetLowering::isFNegFree
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
Definition: AMDGPUISelLowering.cpp:849
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
llvm::CallBase::setCalledFunction
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1432
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:655
llvm::BasicTTIImplBase< GCNTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1017
llvm::PatternMatch::m_FPOne
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:848
llvm::GCNTTIImpl::isSourceOfDivergence
bool isSourceOfDivergence(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:962
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1167
llvm::CallBase::isInlineAsm
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition: InstrTypes.h:1461
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:739
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:728
llvm::IRBuilder<>
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:149
llvm::PointerType::getAddressSpace
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:689
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:443
llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition: TargetTransformInfo.h:929
llvm::IntrinsicCostAttributes::getInst
const IntrinsicInst * getInst() const
Definition: TargetTransformInfo.h:148
ValueTracking.h
llvm::AtomicOrdering::SequentiallyConsistent
@ SequentiallyConsistent
llvm::R600TTIImpl::getHardwareNumberOfRegisters
unsigned getHardwareNumberOfRegisters(bool Vec) const
Definition: AMDGPUTargetTransformInfo.cpp:1254
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:190
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:213
llvm::GCNTTIImpl::getMemcpyLoopLoweringType
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign) const
Definition: AMDGPUTargetTransformInfo.cpp:411
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
llvm::GCNTTIImpl::isLegalToVectorizeMemChain
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:379
llvm::R600TTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index)
Definition: AMDGPUTargetTransformInfo.cpp:1335
llvm::R600TTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:1363
llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:34
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:476
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
dependsOnLocalPhi
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
Definition: AMDGPUTargetTransformInfo.cpp:78
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
llvm::GCNTTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: AMDGPUTargetTransformInfo.cpp:474
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:529
llvm::BasicTTIImplBase< GCNTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:801
llvm::SITargetLowering::getRegForInlineAsmConstraint
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
Definition: SIISelLowering.cpp:11532
llvm::AMDGPU::SIModeRegisterDefaults
Definition: AMDGPUBaseInfo.h:902
llvm::GCNTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: AMDGPUTargetTransformInfo.cpp:1163
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::GCNSubtarget
Definition: GCNSubtarget.h:38
llvm::VectorType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:424
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:398
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:361
llvm::TargetLowering::ComputeConstraintToUse
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
Definition: TargetLowering.cpp:4963
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:116
llvm::AMDGPU::SIModeRegisterDefaults::isInlineCompatible
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Definition: AMDGPUBaseInfo.h:987
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:160
llvm::AMDGPU::IsaInfo::getMaxNumVGPRs
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU)
Definition: AMDGPUBaseInfo.cpp:745
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:364
llvm::GCNSubtarget::hasPackedFP32Ops
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:841
llvm::AMDGPU::isIntrinsicSourceOfDivergence
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
Definition: AMDGPUBaseInfo.cpp:1976
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition: TargetTransformInfo.h:861
llvm::BasicTTIImplBase< GCNTTIImpl >::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsPairwise, bool IsUnsigned, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
Definition: BasicTTIImpl.h:2069
llvm::Type::getInt8Ty
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:195
TRI
const TargetRegisterInfo *TRI
Definition: MachineSink.cpp:1567
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:197
llvm::R600TTIImpl::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:1304
llvm::GCNSubtarget::getRegisterInfo
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:230
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:851
F
#define F(x, y, z)
Definition: MD5.cpp:56
KnownBits.h
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::IntrinsicCostAttributes::getFlags
FastMathFlags getFlags() const
Definition: TargetTransformInfo.h:150
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:124
llvm::MVT::SimpleValueType
SimpleValueType
Definition: MachineValueType.h:32
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:247
Context
LLVMContext & Context
Definition: NVVMIntrRange.cpp:66
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
llvm::AMDGPUTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: AMDGPUTargetTransformInfo.cpp:103
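On the consumer side, a pass holding a TargetTransformInfo handle reaches this hook roughly as follows (a sketch assuming TTI, a Loop *L and a ScalarEvolution &SE are in scope; real callers such as the loop unroller fill caller-side defaults first):

    TargetTransformInfo::UnrollingPreferences UP;
    // Caller-side defaults would normally be filled in here before asking the target.
    TTI.getUnrollingPreferences(L, SE, UP); // dispatches to the AMDGPU implementation
    unsigned Budget = UP.Threshold;         // e.g. raised when private or local memory is used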
llvm::IntrinsicCostAttributes::getArgTypes
const SmallVectorImpl< Type * > & getArgTypes() const
Definition: TargetTransformInfo.h:153
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:77
llvm::AllocaInst::isStaticAlloca
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Definition: Instructions.cpp:1374
llvm::LoopBase::getSubLoops
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
Definition: LoopInfo.h:143
llvm::MinAlign
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets. Return the minimum alignment that may be assumed after adding the two together.
Definition: MathExtras.h:672
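MinAlign returns the largest power of two that divides both operands, for example:

    static_assert(MinAlign(16, 4) == 4, "16 and 4 share 4-byte alignment");
    static_assert(MinAlign(8, 12) == 4, "12 is only 4-aligned");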
llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:27
llvm::GCNSubtarget::hasFastFMAF32
bool hasFastFMAF32() const
Definition: GCNSubtarget.h:304
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
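A typical PatternMatch use, binding the operands of a multiply-by-constant (illustrative only; V is assumed to be a Value * in scope):

    using namespace llvm::PatternMatch;
    Value *X;
    ConstantInt *C;
    if (match(V, m_Mul(m_Value(X), m_ConstantInt(C)))) {
      // V is 'mul X, C'; X and C are now bound to its operands.
    }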
llvm::AllocaInst::getAllocatedType
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:112
llvm::BasicTTIImplBase< GCNTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1070
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:850
llvm::R600TTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: AMDGPUTargetTransformInfo.cpp:1310
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
llvm::GCNTTIImpl::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:398
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:53
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
llvm::GCNTTIImpl::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const
Definition: AMDGPUTargetTransformInfo.cpp:331
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:46
llvm::ISD::SRA
@ SRA
Definition: ISDOpcodes.h:648
llvm::GCNTTIImpl::getMemcpyLoopResidualLoweringType
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign) const
Definition: AMDGPUTargetTransformInfo.cpp:439
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:235
llvm::TargetTransformInfo::UnrollingPreferences::MaxIterationsCountToAnalyze
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
Definition: TargetTransformInfo.h:502
llvm::GCNSubtarget::getMaxPrivateElementSize
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:282
llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:360
llvm::GCNTTIImpl::getLoadVectorFactor
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:343
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:147
Options
LLVMPassBuilderOptionsRef Options
Definition: PassBuilderBindings.cpp:48
llvm::AMDGPUAS::PARAM_D_ADDRESS
@ PARAM_D_ADDRESS
Address space for directly addressable parameter memory (CONST0).
Definition: AMDGPU.h:372
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
Definition: ValueTracking.cpp:4314
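A common heuristic pattern is to strip casts and GEPs from a pointer and inspect the base object, e.g. to size a promotable alloca (a sketch assuming Ptr and a DataLayout DL are in scope):

    const Value *Base = getUnderlyingObject(Ptr);
    if (const auto *AI = dyn_cast<AllocaInst>(Base)) {
      if (AI->isStaticAlloca()) {
        uint64_t Bytes = DL.getTypeAllocSize(AI->getAllocatedType());
        // e.g. compare Bytes against a cutoff before boosting a threshold
      }
    }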
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:29
llvm::GCNTTIImpl::isAlwaysUniform
bool isAlwaysUniform(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:1000
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:622
llvm::AMDGPUSubtarget::has16BitInsts
bool has16BitInsts() const
Definition: AMDGPUSubtarget.h:132
PatternMatch.h
llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition: TargetTransformInfo.h:929
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:644
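For example, building a <4 x i32> type (Ctx is an LLVMContext assumed in scope):

    auto *V4I32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 4); // <4 x i32>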
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::AMDGPUTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:260
llvm::LinearPolySize< TypeSize >::getFixed
static TypeSize getFixed(ScalarTy MinVal)
Definition: TypeSize.h:284
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:86
llvm::lltok::Kind
Kind
Definition: LLToken.h:18
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:117
llvm::AMDGPUAS::BUFFER_FAT_POINTER
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
Definition: AMDGPU.h:369
LoopInfo.h
InlineMaxBB
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
AMDGPUTargetTransformInfo.h
llvm::GCNTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: AMDGPUTargetTransformInfo.cpp:1232
llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:371
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:852
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:391
llvm::AtomicOrdering
AtomicOrdering
Atomic ordering for LLVM's memory model.
Definition: AtomicOrdering.h:56
llvm::cl::opt
Definition: CommandLine.h:1422
llvm::R600Subtarget
Definition: R600Subtarget.h:36
llvm::GCNTTIImpl::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
Definition: AMDGPUTargetTransformInfo.cpp:483
llvm::TargetRegisterInfo::getRegClass
const TargetRegisterClass * getRegClass(unsigned i) const
Returns the register class associated with the enumeration value.
Definition: TargetRegisterInfo.h:737
llvm::GCNTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:514
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
llvm::KnownBits::countMinLeadingOnes
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition: KnownBits.h:234
llvm::AMDGPUTTIImpl::AMDGPUTTIImpl
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:97
llvm::Instruction::hasAllowContract
bool hasAllowContract() const
Determine whether the allow-contract flag is set.
Definition: Instruction.cpp:253
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:572
llvm::R600TTIImpl::isLegalToVectorizeMemChain
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:1289
const
aarch64 promote const
Definition: AArch64PromoteConstant.cpp:232
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, bool IsPairwise, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction operations.
Definition: BasicTTIImpl.h:2006
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::TargetLoweringBase::isOperationLegalOrPromote
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
Definition: TargetLowering.h:1128
llvm::TargetLowering::ParseConstraints
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
Definition: TargetLowering.cpp:4613
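A simplified sketch of the usual constraint walk over an inline-asm call, as a divergence or cost heuristic might do it (assumes a const TargetLowering *TLI, const TargetRegisterInfo *TRI, a DataLayout DL and a CallInst *CI in scope; the real code also resolves each operand's ConstraintVT first):

    TargetLowering::AsmOperandInfoVector Constraints =
        TLI->ParseConstraints(DL, TRI, *CI);
    for (TargetLowering::AsmOperandInfo &OpInfo : Constraints) {
      if (OpInfo.Type != InlineAsm::isOutput)
        continue;                               // only asm outputs matter here
      TLI->ComputeConstraintToUse(OpInfo, SDValue());
      auto RCPair = TLI->getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
                                                      OpInfo.ConstraintVT);
      const TargetRegisterClass *RC = RCPair.second;
      // Inspect RC (e.g. an SGPR vs. a VGPR class) to classify the output.
    }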
llvm::AMDGPU::getIntegerAttribute
int getIntegerAttribute(const Function &F, StringRef Name, int Default)
Definition: AMDGPUBaseInfo.cpp:852
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:423
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:908
llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:931
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
llvm::AMDGPUSubtarget::hasMadMacF32Insts
bool hasMadMacF32Insts() const
Definition: AMDGPUSubtarget.h:140
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:634
llvm::BasicTTIImplBase< AMDGPUTTIImpl >
llvm::AMDGPUAS::CONSTANT_BUFFER_0
@ CONSTANT_BUFFER_0
Definition: AMDGPU.h:382
intrinsicHasPackedVectorBenefit
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
Definition: AMDGPUTargetTransformInfo.cpp:718
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:212
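A hedged sketch of a known-bits query (V and a DataLayout DL assumed in scope):

    KnownBits Known(V->getType()->getScalarSizeInBits());
    computeKnownBits(V, Known, DL);
    if (Known.countMinLeadingOnes() > 0) {
      // The top bit is known to be one, so V is known to be negative.
    }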
llvm::TargetMachine::Options
TargetOptions Options
Definition: TargetMachine.h:115
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
llvm::LoopBase::getLoopDepth
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:96
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::R600TTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: AMDGPUTargetTransformInfo.cpp:1358
llvm::GCNSubtarget::hasUnalignedScratchAccess
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:516
Mode
SI Whole Quad Mode
Definition: SIWholeQuadMode.cpp:262
llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
UnrollMaxBlockToAnalyze
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
llvm::MVT
Machine Value Type.
Definition: MachineValueType.h:30
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::MDNode
Metadata node.
Definition: Metadata.h:897
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:900
llvm::AMDGPUAS::CONSTANT_BUFFER_15
@ CONSTANT_BUFFER_15
Definition: AMDGPU.h:397
llvm::GCNTTIImpl::getHardwareNumberOfRegisters
unsigned getHardwareNumberOfRegisters(bool Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:299
UnrollThresholdPrivate
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
llvm::R600TTIImpl::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:1298
llvm::GCNTargetMachine
Definition: AMDGPUTargetMachine.h:95
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:261
llvm::AMDGPUSubtarget::hasVOP3PInsts
bool hasVOP3PInsts() const
Definition: AMDGPUSubtarget.h:152
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1489
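Typical range-based use (Tys assumed to be a container of Type *):

    bool HasVector = llvm::any_of(Tys, [](Type *T) { return T->isVectorTy(); });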
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:44
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:167
llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:345
llvm::BasicTTIImplBase< GCNTTIImpl >::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:660
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:214
llvm::GCNTTIImpl::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:392
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
llvm::GCNTTIImpl::isInlineAsmSourceOfDivergence
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
Definition: AMDGPUTargetTransformInfo.cpp:912
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:767
UnrollThresholdIf
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1756
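A sketch of the usual cost-model flow: map the IR opcode to an ISD opcode, legalize the type, then ask whether the operation stays cheap (TLI, DL and a Type *Ty assumed in scope; FMul is just an example opcode):

    int ISDOpc = TLI->InstructionOpcodeToISD(Instruction::FMul);
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
    if (TLI->isOperationLegalOrPromote(ISDOpc, LT.second)) {
      // Roughly LT.first legalized operations; otherwise expect expansion.
    }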
llvm::AMDGPU::isGraphics
bool isGraphics(CallingConv::ID cc)
Definition: AMDGPUBaseInfo.cpp:1362
llvm::findOptionMDForLoop
MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition: LoopInfo.cpp:1045
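For instance, to check whether a loop carries an unroll-disabling hint (L assumed to be a const Loop * in scope):

    if (MDNode *Disable = findOptionMDForLoop(L, "llvm.loop.unroll.disable")) {
      // The loop metadata asks that unrolling be skipped.
    }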
llvm::GCNTTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:319
llvm::ConstantInt::getSExtValue
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:146
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:548
llvm::AMDGPU::HSAMD::Kernel::Arg::Key::IsVolatile
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
Definition: AMDGPUMetadata.h:194
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:281
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
llvm::ISD::FMUL
@ FMUL
Definition: ISDOpcodes.h:373
llvm::CallBase::setArgOperand
void setArgOperand(unsigned i, Value *v)
Definition: InstrTypes.h:1346
UnrollThresholdLocal
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
llvm::ConstantInt::getFalse
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:853
llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:624
Callee
FunctionCallee Callee
Definition: AMDGPULibCalls.cpp:206
llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPU.h:359
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:339
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:165
ArgAllocaCost
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
llvm::R600TTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:1271
llvm::GCNTTIImpl::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
Definition: AMDGPUTargetTransformInfo.cpp:335
llvm::GCNTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:814
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:198
llvm::GCNTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(bool Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:305
llvm::AMDGPU::isArgPassedInSGPR
bool isArgPassedInSGPR(const Argument *A)
Definition: AMDGPUBaseInfo.cpp:1787
llvm::ConstantInt::getTrue
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:846
std
Definition: BitVector.h:838
llvm::KnownBits
Definition: KnownBits.h:23
llvm::SITargetLowering::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Definition: SIISelLowering.cpp:12312
llvm::TargetRegisterInfo::getRegSizeInBits
unsigned getRegSizeInBits(const TargetRegisterClass &RC) const
Return the size in bits of a register from class RC.
Definition: TargetRegisterInfo.h:274
llvm::GCNSubtarget::hasFullRate64Ops
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:312
llvm::GCNTTIImpl::collectFlatAddressOperands
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: AMDGPUTargetTransformInfo.cpp:1050
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:314
llvm::GCNTTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:364
llvm::ExtractValueInst
This instruction extracts a struct member or array element value from an aggregate value.
Definition: Instructions.h:2346
llvm::GCNTTIImpl::getStoreVectorFactor
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:354
llvm::TypeSize
Definition: TypeSize.h:417
llvm::R600TTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:1263
llvm::TargetLoweringBase::isOperationExpand
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
Definition: TargetLowering.h:1213
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::LinearPolySize< TypeSize >::getScalable
static TypeSize getScalable(ScalarTy MinVal)
Definition: TypeSize.h:287
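Hooks such as getRegisterBitWidth report either kind; illustrative values only, not the AMDGPU ones:

    TypeSize Fixed    = TypeSize::getFixed(128);    // known size in bits at compile time
    TypeSize Scalable = TypeSize::getScalable(128); // a multiple of the runtime vscale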
llvm::GCNTTIImpl::adjustInliningThreshold
unsigned adjustInliningThreshold(const CallBase *CB) const
Definition: AMDGPUTargetTransformInfo.cpp:1202
llvm::SDValue
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
Definition: SelectionDAGNodes.h:138
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:232
llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition: TargetTransformInfo.h:929
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::InstructionCost::getInvalid
static InstructionCost getInvalid(CostType Val=0)
Definition: InstructionCost.h:54
llvm::GCNTTIImpl::rewriteIntrinsicWithAddressSpace
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: AMDGPUTargetTransformInfo.cpp:1067
llvm::R600TTIImpl::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const
Definition: AMDGPUTargetTransformInfo.cpp:1267
llvm::InlineAsm::isOutput
@ isOutput
Definition: InlineAsm.h:95
llvm::FPOpFusion::Fast
@ Fast
Definition: TargetOptions.h:37
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
llvm::ISD::FSUB
@ FSUB
Definition: ISDOpcodes.h:372
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:147
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:647
llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition: TargetTransformInfo.h:929
llvm::ISD::FREM
@ FREM
Definition: ISDOpcodes.h:375
llvm::GCNTTIImpl::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
Definition: AMDGPUTargetTransformInfo.cpp:866
llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:365
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:234
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:51
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1341
llvm::BasicTTIImplBase< GCNTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: BasicTTIImpl.h:833
llvm::BasicTTIImplBase< GCNTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1287
llvm::R600TTIImpl::R600TTIImpl
R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:1249
llvm::TargetTransformInfo::UnrollingPreferences::Threshold
unsigned Threshold
The cost threshold for the unrolled loop.
Definition: TargetTransformInfo.h:431
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::ISD::SRL
@ SRL
Definition: ISDOpcodes.h:649
llvm::AMDGPU::IsaInfo::getWavesPerEUForWorkGroup
unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize)
Definition: AMDGPUBaseInfo.cpp:571
llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
llvm::GCNTTIImpl::GCNTTIImpl
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:285
llvm::PHINode
Definition: Instructions.h:2600
Threshold
static cl::opt< unsigned > Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden)
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:43
llvm::MemIntrinsicInfo
Information about a load/store intrinsic defined by the target.
Definition: TargetTransformInfo.h:69
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1164
llvm::Type::getInt16Ty
static IntegerType * getInt16Ty(LLVMContext &C)
Definition: Type.cpp:196
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:397
TM
LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::DataLayout::getPointerSizeInBits
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:404
llvm::AMDGPU::HSAMD::Kernel::CodeProps::Key::NumVGPRs
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
Definition: AMDGPUMetadata.h:255
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:42
llvm::LoopBase::isLoopExiting
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
Definition: LoopInfo.h:225
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1478
llvm::BasicTTIImplBase< GCNTTIImpl >::getTypeBasedIntrinsicInstrCost
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
Definition: BasicTTIImpl.h:1462
llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:856
BB
Definition: README.txt:39
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:171
llvm::GCNTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, bool IsPairwise, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
Definition: AMDGPUTargetTransformInfo.cpp:849
llvm::GCNTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: AMDGPUTargetTransformInfo.cpp:1139
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::GCNTTIImpl::useGPUDivergenceAnalysis
bool useGPUDivergenceAnalysis() const
Definition: AMDGPUTargetTransformInfo.cpp:956
llvm::AllocaInst
an instruction to allocate memory on the stack
Definition: Instructions.h:61
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:414
llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:363
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1396
ArgAllocaCutoff
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
llvm::IntrinsicCostAttributes::isTypeBasedOnly
bool isTypeBasedOnly() const
Definition: TargetTransformInfo.h:155
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3035
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:84
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:52
UnrollRuntimeLocal
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
llvm::ExtractValueInst::getIndices
ArrayRef< unsigned > getIndices() const
Definition: Instructions.h:2411
llvm::R600TTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(bool Vec) const
Definition: AMDGPUTargetTransformInfo.cpp:1258
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:211
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:367
llvm::GCNTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:734
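A hedged sketch of querying an intrinsic's cost by types only (Ctx and a TargetTransformInfo handle TTI assumed in scope; Intrinsic::fma is just an example):

    Type *F32 = Type::getFloatTy(Ctx);
    IntrinsicCostAttributes ICA(Intrinsic::fma, F32, {F32, F32, F32});
    InstructionCost Cost =
        TTI.getIntrinsicInstrCost(ICA, TargetTransformInfo::TCK_RecipThroughput);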
llvm::IntrinsicCostAttributes::getArgs
const SmallVectorImpl< const Value * > & getArgs() const
Definition: TargetTransformInfo.h:152
AMDGPUTargetMachine.h
llvm::CallBase::args
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1322
llvm::GCNTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index)
Definition: AMDGPUTargetTransformInfo.cpp:883
llvm::TargetLowering::AsmOperandInfoVector
std::vector< AsmOperandInfo > AsmOperandInfoVector
Definition: TargetLowering.h:4175
llvm::GCNSubtarget::hasUsableDivScaleConditionOutput
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:410
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:498
llvm::ISD::FDIV
@ FDIV
Definition: ISDOpcodes.h:374
llvm::GCNTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:1237