LLVM 13.0.0git
AMDGPUTargetTransformInfo.cpp
1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "llvm/Analysis/LoopInfo.h"
20 #include "llvm/Analysis/ValueTracking.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/IR/PatternMatch.h"
23 #include "llvm/Support/KnownBits.h"
24 
25 using namespace llvm;
26 
27 #define DEBUG_TYPE "AMDGPUtti"
28 
29 static cl::opt<unsigned> UnrollThresholdPrivate(
30     "amdgpu-unroll-threshold-private",
31     cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
32     cl::init(2700), cl::Hidden);
33 
34 static cl::opt<unsigned> UnrollThresholdLocal(
35     "amdgpu-unroll-threshold-local",
36     cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
37     cl::init(1000), cl::Hidden);
38 
39 static cl::opt<unsigned> UnrollThresholdIf(
40     "amdgpu-unroll-threshold-if",
41     cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
42     cl::init(200), cl::Hidden);
43 
44 static cl::opt<bool> UnrollRuntimeLocal(
45     "amdgpu-unroll-runtime-local",
46     cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
47     cl::init(true), cl::Hidden);
48 
49 static cl::opt<bool> UseLegacyDA(
50     "amdgpu-use-legacy-divergence-analysis",
51     cl::desc("Enable legacy divergence analysis for AMDGPU"),
52     cl::init(false), cl::Hidden);
53 
54 static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
55     "amdgpu-unroll-max-block-to-analyze",
56     cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
57     cl::init(32), cl::Hidden);
58 
59 static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
60  cl::Hidden, cl::init(4000),
61  cl::desc("Cost of alloca argument"));
62 
63 // If the amount of scratch memory to eliminate exceeds our ability to allocate
64 // it into registers, we gain nothing by aggressively inlining functions for
65 // that heuristic.
66 static cl::opt<unsigned>
67  ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
68  cl::init(256),
69  cl::desc("Maximum alloca size to use for inline cost"));
70 
71 // Inliner constraint to achieve reasonable compilation time.
72 static cl::opt<size_t> InlineMaxBB(
73     "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
74  cl::desc("Maximum number of BBs allowed in a function after inlining"
75  " (compile time constraint)"));
76 
77 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
78  unsigned Depth = 0) {
79  const Instruction *I = dyn_cast<Instruction>(Cond);
80  if (!I)
81  return false;
82 
83  for (const Value *V : I->operand_values()) {
84  if (!L->contains(I))
85  continue;
86  if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
87  if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
88  return SubLoop->contains(PHI); }))
89  return true;
90  } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
91  return true;
92  }
93  return false;
94 }
95 
96 AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
97     : BaseT(TM, F.getParent()->getDataLayout()),
98  TargetTriple(TM->getTargetTriple()),
99  ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
100  TLI(ST->getTargetLowering()) {}
101 
102 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
103                                             TTI::UnrollingPreferences &UP) {
104   const Function &F = *L->getHeader()->getParent();
105   UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
106   UP.MaxCount = std::numeric_limits<unsigned>::max();
107   UP.Partial = true;
108 
109   // A conditional branch in a loop back edge needs 3 additional exec
110   // manipulations on average.
111  UP.BEInsns += 3;
112 
113  // TODO: Do we want runtime unrolling?
114 
115   // Maximum alloca size that can fit in registers. Reserve 16 registers.
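  // In other words, with 256 32-bit VGPRs and 16 of them reserved, the constant
  // below works out to (256 - 16) * 4 = 960 bytes.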
116  const unsigned MaxAlloca = (256 - 16) * 4;
117  unsigned ThresholdPrivate = UnrollThresholdPrivate;
118  unsigned ThresholdLocal = UnrollThresholdLocal;
119 
120  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
121  // provided threshold value as the default for Threshold
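  // For example (illustrative IR, not from this file), a loop whose latch
  // branch carries:
  //   !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}
  // would use 100 as its unroll (and partial unroll) threshold here.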
122  if (MDNode *LoopUnrollThreshold =
123  findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
124  if (LoopUnrollThreshold->getNumOperands() == 2) {
125  ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
126  LoopUnrollThreshold->getOperand(1));
127  if (MetaThresholdValue) {
128  // We will also use the supplied value for PartialThreshold for now.
129  // We may introduce additional metadata if it becomes necessary in the
130  // future.
131  UP.Threshold = MetaThresholdValue->getSExtValue();
132  UP.PartialThreshold = UP.Threshold;
133  ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
134  ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
135  }
136  }
137  }
138 
139  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
140  for (const BasicBlock *BB : L->getBlocks()) {
141  const DataLayout &DL = BB->getModule()->getDataLayout();
142  unsigned LocalGEPsSeen = 0;
143 
144  if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
145  return SubLoop->contains(BB); }))
146  continue; // Block belongs to an inner loop.
147 
148  for (const Instruction &I : *BB) {
149       // Unroll a loop which contains an "if" statement whose condition is
150       // defined by a PHI belonging to the loop. This may help to eliminate
151       // the if region and potentially even the PHI itself, saving on both
152       // divergence and registers used for the PHI.
153       // Add a small bonus for each such "if" statement.
154  if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
155  if (UP.Threshold < MaxBoost && Br->isConditional()) {
156  BasicBlock *Succ0 = Br->getSuccessor(0);
157  BasicBlock *Succ1 = Br->getSuccessor(1);
158  if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
159  (L->contains(Succ1) && L->isLoopExiting(Succ1)))
160  continue;
161           if (dependsOnLocalPhi(L, Br->getCondition())) {
162             UP.Threshold += UnrollThresholdIf;
163             LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
164  << " for loop:\n"
165  << *L << " due to " << *Br << '\n');
166  if (UP.Threshold >= MaxBoost)
167  return;
168  }
169  }
170  continue;
171  }
172 
173  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
174  if (!GEP)
175  continue;
176 
177  unsigned AS = GEP->getAddressSpace();
178  unsigned Threshold = 0;
179  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
180  Threshold = ThresholdPrivate;
181  else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
182  Threshold = ThresholdLocal;
183  else
184  continue;
185 
186  if (UP.Threshold >= Threshold)
187  continue;
188 
189  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
190  const Value *Ptr = GEP->getPointerOperand();
191  const AllocaInst *Alloca =
192  dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
193  if (!Alloca || !Alloca->isStaticAlloca())
194  continue;
195  Type *Ty = Alloca->getAllocatedType();
196  unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
197  if (AllocaSize > MaxAlloca)
198  continue;
199  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
200  AS == AMDGPUAS::REGION_ADDRESS) {
201  LocalGEPsSeen++;
202         // Inhibit unrolling for local memory if we have seen addressing not
203         // to a variable; most likely we will be unable to combine it.
204         // Do not unroll too-deep inner loops for local memory, to give a
205         // chance to unroll an outer loop for a more important reason.
206  if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
207  (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
208  !isa<Argument>(GEP->getPointerOperand())))
209  continue;
210  LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
211                           << *L << " due to LDS use.\n");
212         UP.Runtime = UnrollRuntimeLocal;
213       }
214 
215  // Check if GEP depends on a value defined by this loop itself.
216  bool HasLoopDef = false;
217  for (const Value *Op : GEP->operands()) {
218  const Instruction *Inst = dyn_cast<Instruction>(Op);
219  if (!Inst || L->isLoopInvariant(Op))
220  continue;
221 
222  if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
223  return SubLoop->contains(Inst); }))
224  continue;
225  HasLoopDef = true;
226  break;
227  }
228  if (!HasLoopDef)
229  continue;
230 
231  // We want to do whatever we can to limit the number of alloca
232       // instructions that make it through to the code generator. Allocas
233  // require us to use indirect addressing, which is slow and prone to
234  // compiler bugs. If this loop does an address calculation on an
235  // alloca ptr, then we want to use a higher than normal loop unroll
236  // threshold. This will give SROA a better chance to eliminate these
237  // allocas.
238  //
239  // We also want to have more unrolling for local memory to let ds
240  // instructions with different offsets combine.
241  //
242  // Don't use the maximum allowed value here as it will make some
243  // programs way too big.
244  UP.Threshold = Threshold;
245  LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
246  << " for loop:\n"
247  << *L << " due to " << *GEP << '\n');
248  if (UP.Threshold >= MaxBoost)
249  return;
250  }
251 
252     // If we found a GEP in a small BB from an inner loop, increase the
253     // maximum trip count to analyze, for a better cost estimate when unrolling.
254     if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
255       UP.MaxIterationsCountToAnalyze = 32;
256  }
257 }
258 
259 void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
260                                           TTI::PeelingPreferences &PP) {
261   BaseT::getPeelingPreferences(L, SE, PP);
262 }
263 
264 const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
265  // Codegen control options which don't matter.
266  AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
267  AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
268  AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
269  AMDGPU::FeatureUnalignedAccessMode,
270 
271  AMDGPU::FeatureAutoWaitcntBeforeBarrier,
272 
273  // Property of the kernel/environment which can't actually differ.
274  AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
275  AMDGPU::FeatureTrapHandler,
276 
277     // The default assumption needs to be that ECC is enabled, but no directly
278     // exposed operations depend on it, so it can be safely inlined.
279  AMDGPU::FeatureSRAMECC,
280 
281  // Perf-tuning features
282  AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
283 
284 GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
285     : BaseT(TM, F.getParent()->getDataLayout()),
286  ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
287  TLI(ST->getTargetLowering()), CommonTTI(TM, F),
288  IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
289  MaxVGPRs(ST->getMaxNumVGPRs(
290           std::max(ST->getWavesPerEU(F).first,
291                    ST->getWavesPerEUForWorkGroup(
292                        ST->getFlatWorkGroupSizes(F).second)))) {
293   AMDGPU::SIModeRegisterDefaults Mode(F);
294   HasFP32Denormals = Mode.allFP32Denormals();
295  HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
296 }
297 
298 unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
299   // The concept of vector registers doesn't really exist. Some packed vector
300  // operations operate on the normal 32-bit registers.
301  return MaxVGPRs;
302 }
303 
304 unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
305  // This is really the number of registers to fill when vectorizing /
306  // interleaving loops, so we lie to avoid trying to use all registers.
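  // In other words, report one eighth of the hardware register count, e.g. 32
  // when 256 VGPRs are available.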
307  return getHardwareNumberOfRegisters(Vec) >> 3;
308 }
309 
310 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
311  const SIRegisterInfo *TRI = ST->getRegisterInfo();
312  const TargetRegisterClass *RC = TRI->getRegClass(RCID);
313  unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
314  return getHardwareNumberOfRegisters(false) / NumVGPRs;
315 }
316 
317 TypeSize
318 GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
319   switch (K) {
320   case TargetTransformInfo::RGK_Scalar:
321     return TypeSize::getFixed(32);
322   case TargetTransformInfo::RGK_FixedWidthVector:
323     return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
324   case TargetTransformInfo::RGK_ScalableVector:
325     return TypeSize::getScalable(0);
326  }
327  llvm_unreachable("Unsupported register kind");
328 }
329 
330 unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
331   return 32;
332 }
333 
334 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
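  // For loads and stores the formula below caps the result at 128 bits per
  // lane; e.g. i8 elements give a maximum VF of 32 * 4 / 8 = 16.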
335  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
336  return 32 * 4 / ElemWidth;
337  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
338  : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
339  : 1;
340 }
341 
342 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
343  unsigned ChainSizeInBytes,
344  VectorType *VecTy) const {
345  unsigned VecRegBitWidth = VF * LoadSize;
346  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
347  // TODO: Support element-size less than 32bit?
348  return 128 / LoadSize;
349 
350  return VF;
351 }
352 
353 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
354  unsigned ChainSizeInBytes,
355  VectorType *VecTy) const {
356  unsigned VecRegBitWidth = VF * StoreSize;
357  if (VecRegBitWidth > 128)
358  return 128 / StoreSize;
359 
360  return VF;
361 }
362 
363 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
364  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
365  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
366  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
367  AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
368  return 512;
369  }
370 
371  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
372  return 8 * ST->getMaxPrivateElementSize();
373 
374   // Common to flat, global, local and region. Assume the same for an unknown
375   // address space.
375  return 128;
376 }
377 
378 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
379  Align Alignment,
380  unsigned AddrSpace) const {
381  // We allow vectorization of flat stores, even though we may need to decompose
382  // them later if they may access private memory. We don't have enough context
383  // here, and legalization can handle it.
384  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
385  return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
386  ChainSizeInBytes <= ST->getMaxPrivateElementSize();
387  }
388  return true;
389 }
390 
391 bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
392  Align Alignment,
393  unsigned AddrSpace) const {
394  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
395 }
396 
397 bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
398  Align Alignment,
399  unsigned AddrSpace) const {
400  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
401 }
402 
403 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
404 // iteration. Should we report a larger size and let it legalize?
405 //
406 // FIXME: Should we use narrower types for local/region, or account for when
407 // unaligned access is legal?
408 //
409 // FIXME: This could use fine tuning and microbenchmarks.
410 Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
411                                             unsigned SrcAddrSpace,
412  unsigned DestAddrSpace,
413  unsigned SrcAlign,
414  unsigned DestAlign) const {
415  unsigned MinAlign = std::min(SrcAlign, DestAlign);
416 
417  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
418  // hardware into byte accesses. If you assume all alignments are equally
419  // probable, it's more efficient on average to use short accesses for this
420  // case.
421  if (MinAlign == 2)
422  return Type::getInt16Ty(Context);
423 
424  // Not all subtargets have 128-bit DS instructions, and we currently don't
425  // form them by default.
426  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
427  SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
428  DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
429       DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
430     return FixedVectorType::get(Type::getInt32Ty(Context), 2);
431   }
432 
433  // Global memory works best with 16-byte accesses. Private memory will also
434   // hit this, although they'll be decomposed.
435   return FixedVectorType::get(Type::getInt32Ty(Context), 4);
436 }
437 
438 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
439     SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
440     unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
441  unsigned SrcAlign, unsigned DestAlign) const {
442  assert(RemainingBytes < 16);
443 
444  unsigned MinAlign = std::min(SrcAlign, DestAlign);
445 
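  // The loops below decompose the residue greedily; e.g. 15 remaining bytes
  // with MinAlign != 2 become i64 + i32 + i16 + i8 (8 + 4 + 2 + 1 bytes).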
446  if (MinAlign != 2) {
447  Type *I64Ty = Type::getInt64Ty(Context);
448  while (RemainingBytes >= 8) {
449  OpsOut.push_back(I64Ty);
450  RemainingBytes -= 8;
451  }
452 
453  Type *I32Ty = Type::getInt32Ty(Context);
454  while (RemainingBytes >= 4) {
455  OpsOut.push_back(I32Ty);
456  RemainingBytes -= 4;
457  }
458  }
459 
460  Type *I16Ty = Type::getInt16Ty(Context);
461  while (RemainingBytes >= 2) {
462  OpsOut.push_back(I16Ty);
463  RemainingBytes -= 2;
464  }
465 
466  Type *I8Ty = Type::getInt8Ty(Context);
467  while (RemainingBytes) {
468  OpsOut.push_back(I8Ty);
469  --RemainingBytes;
470  }
471 }
472 
473 unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
474  // Disable unrolling if the loop is not vectorized.
475  // TODO: Enable this again.
476  if (VF == 1)
477  return 1;
478 
479  return 8;
480 }
481 
482 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
483                                     MemIntrinsicInfo &Info) const {
484  switch (Inst->getIntrinsicID()) {
485  case Intrinsic::amdgcn_atomic_inc:
486  case Intrinsic::amdgcn_atomic_dec:
487  case Intrinsic::amdgcn_ds_ordered_add:
488  case Intrinsic::amdgcn_ds_ordered_swap:
489  case Intrinsic::amdgcn_ds_fadd:
490  case Intrinsic::amdgcn_ds_fmin:
491  case Intrinsic::amdgcn_ds_fmax: {
492  auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
493  auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
494  if (!Ordering || !Volatile)
495  return false; // Invalid.
496 
497  unsigned OrderingVal = Ordering->getZExtValue();
498  if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
499  return false;
500 
501  Info.PtrVal = Inst->getArgOperand(0);
502  Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
503  Info.ReadMem = true;
504  Info.WriteMem = true;
505  Info.IsVolatile = !Volatile->isNullValue();
506  return true;
507  }
508  default:
509  return false;
510  }
511 }
512 
513 int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
514                                        TTI::TargetCostKind CostKind,
515                                        TTI::OperandValueKind Opd1Info,
516  TTI::OperandValueKind Opd2Info,
517  TTI::OperandValueProperties Opd1PropInfo,
518  TTI::OperandValueProperties Opd2PropInfo,
519                                        ArrayRef<const Value *> Args,
520                                        const Instruction *CxtI) {
521  EVT OrigTy = TLI->getValueType(DL, Ty);
522  if (!OrigTy.isSimple()) {
523  // FIXME: We're having to query the throughput cost so that the basic
524  // implementation tries to generate legalize and scalarization costs. Maybe
525  // we could hoist the scalarization code here?
526     if (CostKind != TTI::TCK_CodeSize)
527       return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
528                                            Opd1Info, Opd2Info, Opd1PropInfo,
529  Opd2PropInfo, Args, CxtI);
530  // Scalarization
531 
532  // Check if any of the operands are vector operands.
533  int ISD = TLI->InstructionOpcodeToISD(Opcode);
534  assert(ISD && "Invalid opcode");
535 
536  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
537 
538  bool IsFloat = Ty->isFPOrFPVectorTy();
539  // Assume that floating point arithmetic operations cost twice as much as
540  // integer operations.
541  unsigned OpCost = (IsFloat ? 2 : 1);
542 
543  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
544  // The operation is legal. Assume it costs 1.
545  // TODO: Once we have extract/insert subvector cost we need to use them.
546  return LT.first * OpCost;
547  }
548 
549  if (!TLI->isOperationExpand(ISD, LT.second)) {
550  // If the operation is custom lowered, then assume that the code is twice
551  // as expensive.
552  return LT.first * 2 * OpCost;
553  }
554 
555  // Else, assume that we need to scalarize this op.
556  // TODO: If one of the types get legalized by splitting, handle this
557  // similarly to what getCastInstrCost() does.
558  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
559  unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
560  unsigned Cost = getArithmeticInstrCost(
561  Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
562  Opd1PropInfo, Opd2PropInfo, Args, CxtI);
563  // Return the cost of multiple scalar invocation plus the cost of
564  // inserting and extracting the values.
565  SmallVector<Type *> Tys(Args.size(), Ty);
566  return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
567  }
568 
569  // We don't know anything about this scalar instruction.
570  return OpCost;
571  }
572 
573  // Legalize the type.
574  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
575  int ISD = TLI->InstructionOpcodeToISD(Opcode);
576 
577  // Because we don't have any legal vector operations, but the legal types, we
578  // need to account for split vectors.
579  unsigned NElts = LT.second.isVector() ?
580  LT.second.getVectorNumElements() : 1;
581 
582  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
583 
584  switch (ISD) {
585  case ISD::SHL:
586  case ISD::SRL:
587  case ISD::SRA:
588  if (SLT == MVT::i64)
589  return get64BitInstrCost(CostKind) * LT.first * NElts;
590 
591  if (ST->has16BitInsts() && SLT == MVT::i16)
592  NElts = (NElts + 1) / 2;
593 
594  // i32
595  return getFullRateInstrCost() * LT.first * NElts;
596  case ISD::ADD:
597  case ISD::SUB:
598  case ISD::AND:
599  case ISD::OR:
600  case ISD::XOR:
601  if (SLT == MVT::i64) {
602  // and, or and xor are typically split into 2 VALU instructions.
603  return 2 * getFullRateInstrCost() * LT.first * NElts;
604  }
605 
606  if (ST->has16BitInsts() && SLT == MVT::i16)
607  NElts = (NElts + 1) / 2;
608 
609  return LT.first * NElts * getFullRateInstrCost();
610  case ISD::MUL: {
611  const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
612  if (SLT == MVT::i64) {
613  const int FullRateCost = getFullRateInstrCost();
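      // Reading of the formula below (not stated in the original): roughly four
      // 32-bit partial multiplies at quarter rate plus four full-rate add/carry
      // ops for one 64-bit multiply.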
614  return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
615  }
616 
617  if (ST->has16BitInsts() && SLT == MVT::i16)
618  NElts = (NElts + 1) / 2;
619 
620  // i32
621  return QuarterRateCost * NElts * LT.first;
622  }
623  case ISD::FMUL:
624     // Check for a possible fusion of {fadd|fsub}(a, fmul(b, c)) and return a
625     // zero cost for the fmul(b, c), assuming the fadd|fsub will be charged
626     // the estimated cost of the whole fused operation.
627  if (CxtI && CxtI->hasOneUse())
628  if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
629  const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
630  if (OPC == ISD::FADD || OPC == ISD::FSUB) {
631           if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
632             return TargetTransformInfo::TCC_Free;
633           if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
634             return TargetTransformInfo::TCC_Free;
635 
636  // Estimate all types may be fused with contract/unsafe flags
637  const TargetOptions &Options = TLI->getTargetMachine().Options;
638  if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
639  Options.UnsafeFPMath ||
640               (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
641             return TargetTransformInfo::TCC_Free;
642         }
643       }
644     LLVM_FALLTHROUGH;
645   case ISD::FADD:
646  case ISD::FSUB:
647  if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
648  NElts = (NElts + 1) / 2;
649  if (SLT == MVT::f64)
650  return LT.first * NElts * get64BitInstrCost(CostKind);
651 
652  if (ST->has16BitInsts() && SLT == MVT::f16)
653  NElts = (NElts + 1) / 2;
654 
655  if (SLT == MVT::f32 || SLT == MVT::f16)
656  return LT.first * NElts * getFullRateInstrCost();
657  break;
658  case ISD::FDIV:
659  case ISD::FREM:
660  // FIXME: frem should be handled separately. The fdiv in it is most of it,
661  // but the current lowering is also not entirely correct.
662  if (SLT == MVT::f64) {
663  int Cost = 7 * get64BitInstrCost(CostKind) +
664  getQuarterRateInstrCost(CostKind) +
665  3 * getHalfRateInstrCost(CostKind);
666       // Add cost of workaround.
667       if (!ST->hasUsableDivScaleConditionOutput())
668         Cost += 3 * getFullRateInstrCost();
669 
670  return LT.first * Cost * NElts;
671  }
672 
673  if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
674  // TODO: This is more complicated, unsafe flags etc.
675  if ((SLT == MVT::f32 && !HasFP32Denormals) ||
676  (SLT == MVT::f16 && ST->has16BitInsts())) {
677  return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
678  }
679  }
680 
681  if (SLT == MVT::f16 && ST->has16BitInsts()) {
682  // 2 x v_cvt_f32_f16
683  // f32 rcp
684  // f32 fmul
685  // v_cvt_f16_f32
686  // f16 div_fixup
687  int Cost =
688  4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
689  return LT.first * Cost * NElts;
690  }
691 
692  if (SLT == MVT::f32 || SLT == MVT::f16) {
693  // 4 more v_cvt_* insts without f16 insts support
694  int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
695  1 * getQuarterRateInstrCost(CostKind);
696 
697  if (!HasFP32Denormals) {
698  // FP mode switches.
699  Cost += 2 * getFullRateInstrCost();
700  }
701 
702  return LT.first * NElts * Cost;
703  }
704  break;
705  case ISD::FNEG:
706     // Use the backend's estimation. If fneg is not free, each element will
707     // cost one additional instruction.
708  return TLI->isFNegFree(SLT) ? 0 : NElts;
709  default:
710  break;
711  }
712 
713  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
714  Opd1PropInfo, Opd2PropInfo, Args, CxtI);
715 }
716 
717 // Return true if there's a potential benefit from using v2f16/v2i16
718 // instructions for an intrinsic, even if it requires nontrivial legalization.
719 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
720   switch (ID) {
721  case Intrinsic::fma: // TODO: fmuladd
722  // There's a small benefit to using vector ops in the legalized code.
723  case Intrinsic::round:
724  case Intrinsic::uadd_sat:
725  case Intrinsic::usub_sat:
726  case Intrinsic::sadd_sat:
727  case Intrinsic::ssub_sat:
728  return true;
729  default:
730  return false;
731  }
732 }
733 
734 InstructionCost
735 GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
736                                   TTI::TargetCostKind CostKind) {
737   if (ICA.getID() == Intrinsic::fabs)
738  return 0;
739 
740   if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
741     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
742 
743  Type *RetTy = ICA.getReturnType();
744  EVT OrigTy = TLI->getValueType(DL, RetTy);
745   if (!OrigTy.isSimple()) {
746     if (CostKind != TTI::TCK_CodeSize)
747       return BaseT::getIntrinsicInstrCost(ICA, CostKind);
748 
749  // TODO: Combine these two logic paths.
750  if (ICA.isTypeBasedOnly())
751       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
752 
753  unsigned RetVF =
754  (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
755  : 1);
756  const IntrinsicInst *I = ICA.getInst();
757     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
758     FastMathFlags FMF = ICA.getFlags();
759  // Assume that we need to scalarize this intrinsic.
760 
761  // Compute the scalarization overhead based on Args for a vector
762  // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
763  // CostModel will pass a vector RetTy and VF is 1.
764  unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
765  if (RetVF > 1) {
766  ScalarizationCost = 0;
767  if (!RetTy->isVoidTy())
768  ScalarizationCost +=
769  getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
770  ScalarizationCost +=
771           getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
772     }
773 
774  IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
775  ScalarizationCost);
776     return getIntrinsicInstrCost(Attrs, CostKind);
777   }
778 
779  // Legalize the type.
780  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
781 
782  unsigned NElts = LT.second.isVector() ?
783  LT.second.getVectorNumElements() : 1;
784 
785  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
786 
787  if (SLT == MVT::f64)
788  return LT.first * NElts * get64BitInstrCost(CostKind);
789 
790  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
791  (ST->hasPackedFP32Ops() && SLT == MVT::f32))
792  NElts = (NElts + 1) / 2;
793 
794  // TODO: Get more refined intrinsic costs?
795  unsigned InstRate = getQuarterRateInstrCost(CostKind);
796 
797  switch (ICA.getID()) {
798  case Intrinsic::fma:
799  InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
800  : getQuarterRateInstrCost(CostKind);
801  break;
802  case Intrinsic::uadd_sat:
803  case Intrinsic::usub_sat:
804  case Intrinsic::sadd_sat:
805  case Intrinsic::ssub_sat:
806  static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
807  if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
808  NElts = 1;
809  break;
810  }
811 
812  return LT.first * NElts * InstRate;
813 }
814 
815 unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
816                                     TTI::TargetCostKind CostKind,
817                                     const Instruction *I) {
818  assert((I == nullptr || I->getOpcode() == Opcode) &&
819  "Opcode should reflect passed instruction.");
820  const bool SCost =
821       (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
822   const int CBrCost = SCost ? 5 : 7;
823  switch (Opcode) {
824  case Instruction::Br: {
825  // Branch instruction takes about 4 slots on gfx900.
826  auto BI = dyn_cast_or_null<BranchInst>(I);
827  if (BI && BI->isUnconditional())
828  return SCost ? 1 : 4;
829     // Suppose a conditional branch takes an additional 3 exec-manipulation
830     // instructions on average.
831  return CBrCost;
832  }
833  case Instruction::Switch: {
834  auto SI = dyn_cast_or_null<SwitchInst>(I);
835     // Each case (including default) takes 1 cmp + 1 cbr instruction on
836     // average.
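    // E.g. at a throughput cost kind, a switch with 3 cases is estimated as
    // (3 + 1) * (7 + 1) = 32.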
837  return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
838  }
839  case Instruction::Ret:
840  return SCost ? 1 : 10;
841  case Instruction::PHI:
842  // TODO: 1. A prediction phi won't be eliminated?
843  // 2. Estimate data copy instructions in this case.
844  return 1;
845  }
846  return BaseT::getCFInstrCost(Opcode, CostKind, I);
847 }
848 
849 InstructionCost
850 GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
851                                        bool IsPairwise,
852                                        TTI::TargetCostKind CostKind) {
853   EVT OrigTy = TLI->getValueType(DL, Ty);
854 
855   // Computes the cost on targets that have packed math instructions (which
856   // support 16-bit types only).
857  if (IsPairwise ||
858  !ST->hasVOP3PInsts() ||
859  OrigTy.getScalarSizeInBits() != 16)
860  return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);
861 
862  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
863  return LT.first * getFullRateInstrCost();
864 }
865 
866 InstructionCost
867 GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
868                                    bool IsPairwise, bool IsUnsigned,
869                                    TTI::TargetCostKind CostKind) {
870   EVT OrigTy = TLI->getValueType(DL, Ty);
871 
872   // Computes the cost on targets that have packed math instructions (which
873   // support 16-bit types only).
874  if (IsPairwise ||
875  !ST->hasVOP3PInsts() ||
876  OrigTy.getScalarSizeInBits() != 16)
877  return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
878  CostKind);
879 
880  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
881  return LT.first * getHalfRateInstrCost(CostKind);
882 }
883 
884 int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
885  unsigned Index) {
886  switch (Opcode) {
887  case Instruction::ExtractElement:
888  case Instruction::InsertElement: {
889  unsigned EltSize
890  = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
891  if (EltSize < 32) {
892  if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
893  return 0;
894  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
895  }
896 
897  // Extracts are just reads of a subregister, so are free. Inserts are
898  // considered free because we don't want to have any cost for scalarizing
899  // operations, and we don't have to copy into a different register class.
900 
901  // Dynamic indexing isn't free and is best avoided.
902  return Index == ~0u ? 2 : 0;
903  }
904  default:
905  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
906  }
907 }
908 
909 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
910 /// this is analyzing the collective result of all output registers. Otherwise,
911 /// this is only querying a specific result index if this returns multiple
912 /// registers in a struct.
913 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
914     const CallInst *CI, ArrayRef<unsigned> Indices) const {
915  // TODO: Handle complex extract indices
916  if (Indices.size() > 1)
917  return true;
918 
919  const DataLayout &DL = CI->getModule()->getDataLayout();
920  const SIRegisterInfo *TRI = ST->getRegisterInfo();
921  TargetLowering::AsmOperandInfoVector TargetConstraints =
922  TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
923 
924  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
925 
926  int OutputIdx = 0;
927  for (auto &TC : TargetConstraints) {
928  if (TC.Type != InlineAsm::isOutput)
929  continue;
930 
931  // Skip outputs we don't care about.
932  if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
933  continue;
934 
935  TLI->ComputeConstraintToUse(TC, SDValue());
936 
937  Register AssignedReg;
938  const TargetRegisterClass *RC;
939  std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
940  TRI, TC.ConstraintCode, TC.ConstraintVT);
941  if (AssignedReg) {
942  // FIXME: This is a workaround for getRegForInlineAsmConstraint
943  // returning VS_32
944  RC = TRI->getPhysRegClass(AssignedReg);
945  }
946 
947  // For AGPR constraints null is returned on subtargets without AGPRs, so
948  // assume divergent for null.
949  if (!RC || !TRI->isSGPRClass(RC))
950  return true;
951  }
952 
953  return false;
954 }
955 
956 /// \returns true if the new GPU divergence analysis is enabled.
957 bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
958   return !UseLegacyDA;
959 }
960 
961 /// \returns true if the result of the value could potentially be
962 /// different across workitems in a wavefront.
963 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
964   if (const Argument *A = dyn_cast<Argument>(V))
965  return !AMDGPU::isArgPassedInSGPR(A);
966 
967  // Loads from the private and flat address spaces are divergent, because
968  // threads can execute the load instruction with the same inputs and get
969  // different results.
970  //
971  // All other loads are not divergent, because if threads issue loads with the
972  // same arguments, they will always get the same result.
973  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
974  return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
975  Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
976 
977   // Atomics are divergent because they are executed sequentially: when an
978   // atomic operation refers to the same address in each thread, then each
979   // thread after the first sees the value written by the previous thread as
980   // the original value.
981  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
982  return true;
983 
984  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
985  return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
986 
987  // Assume all function calls are a source of divergence.
988  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
989     if (CI->isInlineAsm())
990       return isInlineAsmSourceOfDivergence(CI);
991     return true;
992  }
993 
994  // Assume all function calls are a source of divergence.
995  if (isa<InvokeInst>(V))
996  return true;
997 
998  return false;
999 }
1000 
1001 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1002  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
1003  switch (Intrinsic->getIntrinsicID()) {
1004  default:
1005  return false;
1006  case Intrinsic::amdgcn_readfirstlane:
1007  case Intrinsic::amdgcn_readlane:
1008  case Intrinsic::amdgcn_icmp:
1009  case Intrinsic::amdgcn_fcmp:
1010  case Intrinsic::amdgcn_ballot:
1011  case Intrinsic::amdgcn_if_break:
1012  return true;
1013  }
1014  }
1015 
1016  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1017  if (CI->isInlineAsm())
1018  return !isInlineAsmSourceOfDivergence(CI);
1019  return false;
1020  }
1021 
1022  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1023  if (!ExtValue)
1024  return false;
1025 
1026  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1027  if (!CI)
1028  return false;
1029 
1030  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1031  switch (Intrinsic->getIntrinsicID()) {
1032  default:
1033  return false;
1034  case Intrinsic::amdgcn_if:
1035  case Intrinsic::amdgcn_else: {
1036  ArrayRef<unsigned> Indices = ExtValue->getIndices();
1037  return Indices.size() == 1 && Indices[0] == 1;
1038  }
1039  }
1040  }
1041 
1042  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1043  // divergent for the overall struct return. We need to override it in the
1044  // case we're extracting an SGPR component here.
1045  if (CI->isInlineAsm())
1046  return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1047 
1048  return false;
1049 }
1050 
1051 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1052                                             Intrinsic::ID IID) const {
1053  switch (IID) {
1054  case Intrinsic::amdgcn_atomic_inc:
1055  case Intrinsic::amdgcn_atomic_dec:
1056  case Intrinsic::amdgcn_ds_fadd:
1057  case Intrinsic::amdgcn_ds_fmin:
1058  case Intrinsic::amdgcn_ds_fmax:
1059  case Intrinsic::amdgcn_is_shared:
1060  case Intrinsic::amdgcn_is_private:
1061  OpIndexes.push_back(0);
1062  return true;
1063  default:
1064  return false;
1065  }
1066 }
1067 
1068 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1069                                                     Value *OldV,
1070  Value *NewV) const {
1071  auto IntrID = II->getIntrinsicID();
1072  switch (IntrID) {
1073  case Intrinsic::amdgcn_atomic_inc:
1074  case Intrinsic::amdgcn_atomic_dec:
1075  case Intrinsic::amdgcn_ds_fadd:
1076  case Intrinsic::amdgcn_ds_fmin:
1077  case Intrinsic::amdgcn_ds_fmax: {
1078  const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
1079  if (!IsVolatile->isZero())
1080  return nullptr;
1081  Module *M = II->getParent()->getParent()->getParent();
1082  Type *DestTy = II->getType();
1083  Type *SrcTy = NewV->getType();
1084  Function *NewDecl =
1085  Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
1086  II->setArgOperand(0, NewV);
1087  II->setCalledFunction(NewDecl);
1088  return II;
1089  }
1090  case Intrinsic::amdgcn_is_shared:
1091  case Intrinsic::amdgcn_is_private: {
1092     unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1093       AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1094     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1095  LLVMContext &Ctx = NewV->getType()->getContext();
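    // The query folds to a constant: e.g. is_shared of a pointer now known to
    // be in the LDS address space becomes true, otherwise false.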
1096     ConstantInt *NewVal = (TrueAS == NewAS) ?
1097       ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1098     return NewVal;
1099  }
1100  case Intrinsic::ptrmask: {
1101  unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1102  unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1103  Value *MaskOp = II->getArgOperand(1);
1104  Type *MaskTy = MaskOp->getType();
1105 
1106  bool DoTruncate = false;
1107 
1108  const GCNTargetMachine &TM =
1109  static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1110  if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1111  // All valid 64-bit to 32-bit casts work by chopping off the high
1112  // bits. Any masking only clearing the low bits will also apply in the new
1113  // address space.
1114  if (DL.getPointerSizeInBits(OldAS) != 64 ||
1115  DL.getPointerSizeInBits(NewAS) != 32)
1116  return nullptr;
1117 
1118  // TODO: Do we need to thread more context in here?
1119  KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
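      // E.g. a constant mask of 0xFFFFFFFFFFFFFF00 has 56 known leading ones,
      // so it only clears low bits and survives truncation to 32 bits.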
1120  if (Known.countMinLeadingOnes() < 32)
1121  return nullptr;
1122 
1123  DoTruncate = true;
1124  }
1125 
1126  IRBuilder<> B(II);
1127  if (DoTruncate) {
1128  MaskTy = B.getInt32Ty();
1129  MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1130  }
1131 
1132  return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1133  {NewV, MaskOp});
1134  }
1135  default:
1136  return nullptr;
1137  }
1138 }
1139 
1140 int GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
1141                                ArrayRef<int> Mask, int Index,
1142  VectorType *SubTp) {
1143  if (ST->hasVOP3PInsts()) {
1144  if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1145  DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1146       // With op_sel, VOP3P instructions can freely access the low or high half
1147       // of a register, so any swizzle is free.
1148 
1149  switch (Kind) {
1150  case TTI::SK_Broadcast:
1151       case TTI::SK_Reverse:
1152       case TTI::SK_PermuteSingleSrc:
1153         return 0;
1154  default:
1155  break;
1156  }
1157  }
1158  }
1159 
1160  return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp);
1161 }
1162 
1163 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1164                                      const Function *Callee) const {
1165  const TargetMachine &TM = getTLI()->getTargetMachine();
1166  const GCNSubtarget *CallerST
1167  = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1168  const GCNSubtarget *CalleeST
1169  = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1170 
1171  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1172  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1173 
1174  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1175  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1176  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1177  return false;
1178 
1179  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1180  // no way to support merge for backend defined attributes.
1181  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
1182   AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
1183   if (!CallerMode.isInlineCompatible(CalleeMode))
1184  return false;
1185 
1186  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1187  Callee->hasFnAttribute(Attribute::InlineHint))
1188  return true;
1189 
1190  // Hack to make compile times reasonable.
1191  if (InlineMaxBB) {
1192  // Single BB does not increase total BB amount.
1193  if (Callee->size() == 1)
1194  return true;
1195  size_t BBSize = Caller->size() + Callee->size() - 1;
1196  return BBSize <= InlineMaxBB;
1197  }
1198 
1199  return true;
1200 }
1201 
1202 unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1203   // If a pointer to a private array is passed into a function,
1204   // it will not be optimized out, leaving scratch usage.
1205   // Increase the inline threshold to allow inlining in this case.
1206   uint64_t AllocaSize = 0;
1207   SmallPtrSet<const AllocaInst *, 8> AIVisited;
1208  for (Value *PtrArg : CB->args()) {
1209  PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1210     if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
1211                 Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
1212       continue;
1213 
1214  PtrArg = getUnderlyingObject(PtrArg);
1215  if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
1216  if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1217  continue;
1218  AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1219  // If the amount of stack memory is excessive we will not be able
1220  // to get rid of the scratch anyway, bail out.
1221  if (AllocaSize > ArgAllocaCutoff) {
1222  AllocaSize = 0;
1223  break;
1224  }
1225  }
1226  }
1227  if (AllocaSize)
1228  return ArgAllocaCost;
1229  return 0;
1230 }
1231 
1232 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1233                                          TTI::UnrollingPreferences &UP) {
1234   CommonTTI.getUnrollingPreferences(L, SE, UP);
1235 }
1236 
1237 void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1238                                        TTI::PeelingPreferences &PP) {
1239   CommonTTI.getPeelingPreferences(L, SE, PP);
1240 }
1241 
1242 int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1243  return ST->hasFullRate64Ops()
1244  ? getFullRateInstrCost()
1245  : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1246  : getQuarterRateInstrCost(CostKind);
1247 }
1248 
1249 R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
1250     : BaseT(TM, F.getParent()->getDataLayout()),
1251  ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
1252  TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}
1253 
1254 unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
1255   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
1256 }
1257 
1258 unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
1259  return getHardwareNumberOfRegisters(Vec);
1260 }
1261 
1262 TypeSize
1263 R600TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
1264   return TypeSize::getFixed(32);
1265 }
1266 
1267 unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
1268   return 32;
1269 }
1270 
1271 unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
1272  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
1273  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
1274  return 128;
1275  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1276  AddrSpace == AMDGPUAS::REGION_ADDRESS)
1277  return 64;
1278  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
1279  return 32;
1280 
1281  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
1282  AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
1283  (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
1284  AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
1285  return 128;
1286  llvm_unreachable("unhandled address space");
1287 }
1288 
1289 bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
1290  Align Alignment,
1291  unsigned AddrSpace) const {
1292  // We allow vectorization of flat stores, even though we may need to decompose
1293  // them later if they may access private memory. We don't have enough context
1294  // here, and legalization can handle it.
1295  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
1296 }
1297 
1298 bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1299  Align Alignment,
1300  unsigned AddrSpace) const {
1301  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1302 }
1303 
1304 bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1305  Align Alignment,
1306  unsigned AddrSpace) const {
1307  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1308 }
1309 
1310 unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
1311   // Disable unrolling if the loop is not vectorized.
1312  // TODO: Enable this again.
1313  if (VF == 1)
1314  return 1;
1315 
1316  return 8;
1317 }
1318 
1319 unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
1320                                      TTI::TargetCostKind CostKind,
1321                                      const Instruction *I) {
1322   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
1323     return Opcode == Instruction::PHI ? 0 : 1;
1324 
1325  // XXX - For some reason this isn't called for switch.
1326  switch (Opcode) {
1327  case Instruction::Br:
1328  case Instruction::Ret:
1329  return 10;
1330  default:
1331  return BaseT::getCFInstrCost(Opcode, CostKind, I);
1332  }
1333 }
1334 
1335 int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
1336  unsigned Index) {
1337  switch (Opcode) {
1338  case Instruction::ExtractElement:
1339  case Instruction::InsertElement: {
1340  unsigned EltSize
1341  = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1342  if (EltSize < 32) {
1343  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1344  }
1345 
1346  // Extracts are just reads of a subregister, so are free. Inserts are
1347  // considered free because we don't want to have any cost for scalarizing
1348  // operations, and we don't have to copy into a different register class.
1349 
1350  // Dynamic indexing isn't free and is best avoided.
1351  return Index == ~0u ? 2 : 0;
1352  }
1353  default:
1354  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1355  }
1356 }
1357 
1358 void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1359                                           TTI::UnrollingPreferences &UP) {
1360   CommonTTI.getUnrollingPreferences(L, SE, UP);
1361 }
1362 
1363 void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1364                                         TTI::PeelingPreferences &PP) {
1365   CommonTTI.getPeelingPreferences(L, SE, PP);
1366 }
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:233
UseLegacyDA
static cl::opt< bool > UseLegacyDA("amdgpu-use-legacy-divergence-analysis", cl::desc("Enable legacy divergence analysis for AMDGPU"), cl::init(false), cl::Hidden)
llvm::InstructionCost
Definition: InstructionCost.h:26
llvm::EngineKind::Kind
Kind
Definition: ExecutionEngine.h:524
llvm::TargetTransformInfo::UnrollingPreferences::BEInsns
unsigned BEInsns
Definition: TargetTransformInfo.h:473
llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:29
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticInstrCost
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:688
Attrs
Function Attrs
Definition: README_ALTIVEC.txt:215
llvm::Type::isSized
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:272
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:38
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:480
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:211
llvm::Loop::isLoopInvariant
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:63
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:448
llvm
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:66
llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1519
llvm::BasicTTIImplBase< GCNTTIImpl >::getCFInstrCost
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:926
llvm::SystemZISD::TM
@ TM
Definition: SystemZISelLowering.h:65
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:112
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:618
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:447
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1291
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:464
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
llvm::TargetOptions
Definition: TargetOptions.h:123
llvm::Function
Definition: Function.h:61
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:529
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:122
llvm::AMDGPUTargetLowering::isFNegFree
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
Definition: AMDGPUISelLowering.cpp:847
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
llvm::CallBase::setCalledFunction
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1432
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:674
llvm::PatternMatch::m_FPOne
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:799
llvm::GCNTTIImpl::isSourceOfDivergence
bool isSourceOfDivergence(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:963
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::AMDGPUAS::PARAM_D_ADDRESS
@ PARAM_D_ADDRESS
Address space for direct addressible parameter memory (CONST0).
Definition: AMDGPU.h:385
llvm::CallBase::isInlineAsm
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition: InstrTypes.h:1462
llvm::BasicTTIImplBase< GCNTTIImpl >::getOperandsScalarizationOverhead
unsigned getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:643
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:693
llvm::IRBuilder<>
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:148
llvm::PointerType::getAddressSpace
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:662
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:443
llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition: TargetTransformInfo.h:924
llvm::R600TTIImpl::getCFInstrCost
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:1319
llvm::IntrinsicCostAttributes::getInst
const IntrinsicInst * getInst() const
Definition: TargetTransformInfo.h:147
ValueTracking.h
llvm::AtomicOrdering::SequentiallyConsistent
@ SequentiallyConsistent
llvm::R600TTIImpl::getHardwareNumberOfRegisters
unsigned getHardwareNumberOfRegisters(bool Vec) const
Definition: AMDGPUTargetTransformInfo.cpp:1254
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:190
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:214
llvm::GCNTTIImpl::getMemcpyLoopLoweringType
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign) const
Definition: AMDGPUTargetTransformInfo.cpp:410
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:140
llvm::GCNTTIImpl::isLegalToVectorizeMemChain
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:378
llvm::R600TTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:1363
llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:34
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:476
dependsOnLocalPhi
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
Definition: AMDGPUTargetTransformInfo.cpp:77
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
llvm::GCNTTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: AMDGPUTargetTransformInfo.cpp:473
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:529
llvm::SITargetLowering::getRegForInlineAsmConstraint
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
Definition: SIISelLowering.cpp:11444
llvm::AMDGPU::SIModeRegisterDefaults
Definition: AMDGPUBaseInfo.h:885
llvm::GCNTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: AMDGPUTargetTransformInfo.cpp:1163
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::GCNSubtarget
Definition: GCNSubtarget.h:38
llvm::VectorType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:424
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:410
llvm::TargetLowering::ComputeConstraintToUse
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
Definition: TargetLowering.cpp:4957
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:116
llvm::AMDGPU::SIModeRegisterDefaults::isInlineCompatible
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Definition: AMDGPUBaseInfo.h:970
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:160
llvm::AMDGPU::IsaInfo::getMaxNumVGPRs
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU)
Definition: AMDGPUBaseInfo.cpp:711
llvm::GCNSubtarget::hasPackedFP32Ops
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:837
llvm::AMDGPU::isIntrinsicSourceOfDivergence
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
Definition: AMDGPUBaseInfo.cpp:1938
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition: TargetTransformInfo.h:856
llvm::BasicTTIImplBase< GCNTTIImpl >::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsPairwise, bool IsUnsigned, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
Definition: BasicTTIImpl.h:2003
llvm::Type::getInt8Ty
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:202
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1567
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:204
llvm::R600TTIImpl::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:1304
llvm::GCNSubtarget::getRegisterInfo
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:226
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:158
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:846
F
#define F(x, y, z)
Definition: MD5.cpp:56
KnownBits.h
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::IntrinsicCostAttributes::getFlags
FastMathFlags getFlags() const
Definition: TargetTransformInfo.h:149
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:124
llvm::MVT::SimpleValueType
SimpleValueType
Definition: MachineValueType.h:32
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:247
Context
LLVMContext & Context
Definition: NVVMIntrRange.cpp:66
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
llvm::AMDGPUTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: AMDGPUTargetTransformInfo.cpp:102
llvm::IntrinsicCostAttributes::getArgTypes
const SmallVectorImpl< Type * > & getArgTypes() const
Definition: TargetTransformInfo.h:152
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:77
llvm::AllocaInst::isStaticAlloca
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Definition: Instructions.cpp:1374
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:380
llvm::LoopBase::getSubLoops
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
Definition: LoopInfo.h:143
llvm::MinAlign
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets. Return the minimum alignment that may be assumed after adding the two together.
Definition: MathExtras.h:673
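For orientation, a minimal standalone sketch of the documented result (the helper name and the checks are illustrative, not part of LLVM): the answer is the largest power of two dividing both arguments, i.e. the lowest set bit of (A | B).

#include <cstdint>

// Lowest set bit of (A | B): the largest power of two dividing both A and B.
constexpr uint64_t minAlignSketch(uint64_t A, uint64_t B) {
  return (A | B) & (1 + ~(A | B));
}

static_assert(minAlignSketch(16, 24) == 8, "common alignment of 16 and 24 is 8");
static_assert(minAlignSketch(32, 0) == 32, "a zero offset preserves the alignment");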
llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:27
llvm::GCNSubtarget::hasFastFMAF32
bool hasFastFMAF32() const
Definition: GCNSubtarget.h:300
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
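A usage sketch, assuming standard PatternMatch matchers; the helper name is hypothetical:

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true if V is an fmul whose second operand is a constant, binding the
// other operand to X.
static bool isFMulByConstant(Value *V, Value *&X) {
  Constant *C;
  return match(V, m_FMul(m_Value(X), m_Constant(C)));
}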
llvm::AllocaInst::getAllocatedType
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:112
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:845
llvm::R600TTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: AMDGPUTargetTransformInfo.cpp:1310
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
llvm::GCNTTIImpl::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:397
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:53
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
llvm::SITargetLowering::getTypeLegalizationCost
std::pair< int, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Definition: SIISelLowering.cpp:12204
llvm::GCNTTIImpl::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const
Definition: AMDGPUTargetTransformInfo.cpp:330
SI
@ SI
Definition: SIInstrInfo.cpp:7342
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:46
llvm::ISD::SRA
@ SRA
Definition: ISDOpcodes.h:643
llvm::GCNTTIImpl::getMemcpyLoopResidualLoweringType
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign) const
Definition: AMDGPUTargetTransformInfo.cpp:438
llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:373
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:235
llvm::TargetTransformInfo::UnrollingPreferences::MaxIterationsCountToAnalyze
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
Definition: TargetTransformInfo.h:502
llvm::GCNSubtarget::getMaxPrivateElementSize
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:278
llvm::GCNTTIImpl::getLoadVectorFactor
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:342
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:154
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
Definition: ValueTracking.cpp:4296
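A hedged usage sketch (helper name illustrative): strip GEPs and casts from a pointer and inspect the base object, here checking for a static entry-block alloca (cf. AllocaInst::isStaticAlloca elsewhere in this index).

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// True if Ptr ultimately points into a constant-size alloca in the entry block.
static bool pointsToStaticAlloca(const Value *Ptr) {
  const Value *Obj = getUnderlyingObject(Ptr);
  if (const auto *AI = dyn_cast<AllocaInst>(Obj))
    return AI->isStaticAlloca();
  return false;
}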
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:26
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:29
llvm::GCNTTIImpl::isAlwaysUniform
bool isAlwaysUniform(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:1001
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:617
llvm::AMDGPUSubtarget::has16BitInsts
bool has16BitInsts() const
Definition: AMDGPUSubtarget.h:132
PatternMatch.h
llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition: TargetTransformInfo.h:924
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:650
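A small sketch of building a fixed-width vector type such as the <4 x i16> case the cost hooks reason about (function name is illustrative):

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Construct the IR type <4 x i16> in the given context.
static VectorType *makeV4I16(LLVMContext &Ctx) {
  return FixedVectorType::get(Type::getInt16Ty(Ctx), 4);
}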
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::AMDGPUAS::CONSTANT_BUFFER_0
@ CONSTANT_BUFFER_0
Definition: AMDGPU.h:395
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:374
llvm::AMDGPUTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:259
llvm::LinearPolySize< TypeSize >::getFixed
static TypeSize getFixed(ScalarTy MinVal)
Definition: TypeSize.h:284
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:86
llvm::AMDGPUAS::CONSTANT_BUFFER_15
@ CONSTANT_BUFFER_15
Definition: AMDGPU.h:410
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:116
LoopInfo.h
InlineMaxBB
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:378
AMDGPUTargetTransformInfo.h
llvm::GCNTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: AMDGPUTargetTransformInfo.cpp:1232
llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:371
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:847
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:391
llvm::AtomicOrdering
AtomicOrdering
Atomic ordering for LLVM's memory model.
Definition: AtomicOrdering.h:56
llvm::cl::opt
Definition: CommandLine.h:1419
llvm::R600Subtarget
Definition: R600Subtarget.h:36
llvm::GCNTTIImpl::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
Definition: AMDGPUTargetTransformInfo.cpp:482
llvm::TargetRegisterInfo::getRegClass
const TargetRegisterClass * getRegClass(unsigned i) const
Returns the register class associated with the enumeration value.
Definition: TargetRegisterInfo.h:723
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
llvm::KnownBits::countMinLeadingOnes
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition: KnownBits.h:234
llvm::AMDGPUTTIImpl::AMDGPUTTIImpl
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:96
llvm::Instruction::hasAllowContract
bool hasAllowContract() const
Determine whether the allow-contract flag is set.
Definition: Instruction.cpp:249
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:572
llvm::R600TTIImpl::isLegalToVectorizeMemChain
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:1289
const
aarch64 promote const
Definition: AArch64PromoteConstant.cpp:232
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, bool IsPairwise, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction operations.
Definition: BasicTTIImpl.h:1940
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::TargetLoweringBase::isOperationLegalOrPromote
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
Definition: TargetLowering.h:1114
llvm::TargetLowering::ParseConstraints
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
Definition: TargetLowering.cpp:4607
llvm::AMDGPU::getIntegerAttribute
int getIntegerAttribute(const Function &F, StringRef Name, int Default)
Definition: AMDGPUBaseInfo.cpp:818
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:423
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:903
llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:905
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:440
llvm::AMDGPUSubtarget::hasMadMacF32Insts
bool hasMadMacF32Insts() const
Definition: AMDGPUSubtarget.h:140
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:634
llvm::BasicTTIImplBase< AMDGPUTTIImpl >
intrinsicHasPackedVectorBenefit
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
Definition: AMDGPUTargetTransformInfo.cpp:719
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:238
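A hedged usage sketch combining this with KnownBits::countMinLeadingOnes from this index; the helper name is hypothetical:

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// How many leading one bits of V are provable from its defining instructions.
static unsigned knownLeadingOnes(const Value *V, const DataLayout &DL) {
  KnownBits Known(V->getType()->getScalarSizeInBits());
  computeKnownBits(V, Known, DL);
  return Known.countMinLeadingOnes();
}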
llvm::GCNTTIImpl::getArithmeticInstrCost
int getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:513
llvm::TargetMachine::Options
TargetOptions Options
Definition: TargetMachine.h:115
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
llvm::LoopBase::getLoopDepth
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:96
llvm::R600TTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: AMDGPUTargetTransformInfo.cpp:1358
llvm::GCNSubtarget::hasUnalignedScratchAccess
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:512
Mode
SI Whole Quad Mode
Definition: SIWholeQuadMode.cpp:262
llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
UnrollMaxBlockToAnalyze
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
llvm::MVT
Machine Value Type.
Definition: MachineValueType.h:30
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::MDNode
Metadata node.
Definition: Metadata.h:897
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:895
llvm::GCNTTIImpl::getHardwareNumberOfRegisters
unsigned getHardwareNumberOfRegisters(bool Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:298
UnrollThresholdPrivate
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
llvm::R600TTIImpl::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:1298
llvm::GCNTargetMachine
Definition: AMDGPUTargetMachine.h:95
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:262
llvm::AMDGPUAS::PARAM_I_ADDRESS
@ PARAM_I_ADDRESS
Address space for indirect addressible parameter memory (VTX1).
Definition: AMDGPU.h:387
llvm::AMDGPUSubtarget::hasVOP3PInsts
bool hasVOP3PInsts() const
Definition: AMDGPUSubtarget.h:152
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1512
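A usage sketch of the range form, in the spirit of the loop-shape checks in the unrolling heuristics; the helper name and predicate are illustrative:

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

// True if any block of L can branch out of the loop.
static bool hasExitingBlock(const Loop *L) {
  return any_of(L->getBlocks(),
                [L](const BasicBlock *BB) { return L->isLoopExiting(BB); });
}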
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:44
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:167
llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:345
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:215
llvm::GCNTTIImpl::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:391
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
llvm::GCNTTIImpl::isInlineAsmSourceOfDivergence
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
Definition: AMDGPUTargetTransformInfo.cpp:913
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:759
UnrollThresholdIf
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1731
llvm::AMDGPU::isGraphics
bool isGraphics(CallingConv::ID cc)
Definition: AMDGPUBaseInfo.cpp:1328
llvm::findOptionMDForLoop
MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition: LoopInfo.cpp:1038
llvm::GCNTTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:318
llvm::ConstantInt::getSExtValue
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:146
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:496
llvm::AMDGPU::HSAMD::Kernel::Arg::Key::IsVolatile
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
Definition: AMDGPUMetadata.h:194
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:281
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
llvm::ISD::FMUL
@ FMUL
Definition: ISDOpcodes.h:373
llvm::CallBase::setArgOperand
void setArgOperand(unsigned i, Value *v)
Definition: InstrTypes.h:1346
UnrollThresholdLocal
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::ConstantInt::getFalse
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:840
llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:619
Callee
amdgpu Simplify well known AMD library false FunctionCallee Callee
Definition: AMDGPULibCalls.cpp:205
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:335
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:165
ArgAllocaCost
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
llvm::R600TTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:1271
llvm::GCNTTIImpl::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
Definition: AMDGPUTargetTransformInfo.cpp:334
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:205
llvm::GCNTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(bool Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:304
llvm::AMDGPU::isArgPassedInSGPR
bool isArgPassedInSGPR(const Argument *A)
Definition: AMDGPUBaseInfo.cpp:1749
llvm::ConstantInt::getTrue
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:833
std
Definition: BitVector.h:955
llvm::KnownBits
Definition: KnownBits.h:23
llvm::TargetRegisterInfo::getRegSizeInBits
unsigned getRegSizeInBits(const TargetRegisterClass &RC) const
Return the size in bits of a register from class RC.
Definition: TargetRegisterInfo.h:274
llvm::GCNSubtarget::hasFullRate64Ops
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:308
llvm::GCNTTIImpl::collectFlatAddressOperands
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: AMDGPUTargetTransformInfo.cpp:1051
llvm::GCNTTIImpl::getShuffleCost
unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: AMDGPUTargetTransformInfo.cpp:1140
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:314
llvm::GCNTTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:363
llvm::GCNTTIImpl::getCFInstrCost
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:815
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:377
llvm::ExtractValueInst
This instruction extracts a struct member or array element value from an aggregate value.
Definition: Instructions.h:2318
llvm::GCNTTIImpl::getStoreVectorFactor
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:353
llvm::TypeSize
Definition: TypeSize.h:417
llvm::R600TTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:1263
llvm::TargetLoweringBase::isOperationExpand
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
Definition: TargetLowering.h:1199
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::LinearPolySize< TypeSize >::getScalable
static TypeSize getScalable(ScalarTy MinVal)
Definition: TypeSize.h:287
llvm::GCNTTIImpl::adjustInliningThreshold
unsigned adjustInliningThreshold(const CallBase *CB) const
Definition: AMDGPUTargetTransformInfo.cpp:1202
llvm::SDValue
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
Definition: SelectionDAGNodes.h:138
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:232
llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition: TargetTransformInfo.h:924
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::GCNTTIImpl::rewriteIntrinsicWithAddressSpace
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: AMDGPUTargetTransformInfo.cpp:1068
llvm::R600TTIImpl::getVectorInstrCost
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index)
Definition: AMDGPUTargetTransformInfo.cpp:1335
llvm::R600TTIImpl::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const
Definition: AMDGPUTargetTransformInfo.cpp:1267
llvm::InlineAsm::isOutput
@ isOutput
Definition: InlineAsm.h:93
llvm::FPOpFusion::Fast
@ Fast
Definition: TargetOptions.h:37
llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
llvm::ISD::FSUB
@ FSUB
Definition: ISDOpcodes.h:372
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:146
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:642
llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition: TargetTransformInfo.h:924
llvm::ISD::FREM
@ FREM
Definition: ISDOpcodes.h:375
llvm::GCNTTIImpl::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
Definition: AMDGPUTargetTransformInfo.cpp:867
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:234
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:51
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1341
llvm::BasicTTIImplBase< GCNTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1226
llvm::R600TTIImpl::R600TTIImpl
R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:1249
llvm::TargetTransformInfo::UnrollingPreferences::Threshold
unsigned Threshold
The cost threshold for the unrolled loop.
Definition: TargetTransformInfo.h:431
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::ISD::SRL
@ SRL
Definition: ISDOpcodes.h:644
llvm::AMDGPU::IsaInfo::getWavesPerEUForWorkGroup
unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize)
Definition: AMDGPUBaseInfo.cpp:537
llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:163
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:350
llvm::GCNTTIImpl::GCNTTIImpl
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:284
llvm::PHINode
Definition: Instructions.h:2572
Threshold
static cl::opt< unsigned > Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden)
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:43
llvm::TargetOptions::UnsafeFPMath
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
Definition: TargetOptions.h:159
llvm::MemIntrinsicInfo
Information about a load/store intrinsic defined by the target.
Definition: TargetTransformInfo.h:68
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1164
llvm::Type::getInt16Ty
static IntegerType * getInt16Ty(LLVMContext &C)
Definition: Type.cpp:203
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:397
llvm::DataLayout::getPointerSizeInBits
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits. FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:404
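A minimal sketch of querying the pointer width for an address space from the module's data layout (helper name illustrative; AS 0 is the flat/generic address space on AMDGPU targets):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Pointer width, in bits, of address space AS per the module's data layout.
static unsigned pointerBits(const Module &M, unsigned AS) {
  return M.getDataLayout().getPointerSizeInBits(AS);
}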
llvm::AMDGPU::HSAMD::Kernel::CodeProps::Key::NumVGPRs
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
Definition: AMDGPUMetadata.h:255
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:42
llvm::LoopBase::isLoopExiting
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
Definition: LoopInfo.h:225
llvm::BasicTTIImplBase< GCNTTIImpl >::getScalarizationOverhead
unsigned getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:608
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1450
llvm::BasicTTIImplBase< GCNTTIImpl >::getTypeBasedIntrinsicInstrCost
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
Definition: BasicTTIImpl.h:1401
llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:851
BB
Definition: README.txt:39
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:171
llvm::GCNTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, bool IsPairwise, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
Definition: AMDGPUTargetTransformInfo.cpp:850
llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPU.h:372
llvm::BasicTTIImplBase< GCNTTIImpl >::getShuffleCost
unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: BasicTTIImpl.h:746
llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:376
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::GCNTTIImpl::useGPUDivergenceAnalysis
bool useGPUDivergenceAnalysis() const
Definition: AMDGPUTargetTransformInfo.cpp:957
llvm::AllocaInst
an instruction to allocate memory on the stack
Definition: Instructions.h:61
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:411
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1382
ArgAllocaCutoff
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
llvm::IntrinsicCostAttributes::isTypeBasedOnly
bool isTypeBasedOnly() const
Definition: TargetTransformInfo.h:154
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3005
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:84
llvm::GCNTTIImpl::getVectorInstrCost
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index)
Definition: AMDGPUTargetTransformInfo.cpp:884
llvm::TargetOptions::AllowFPOpFusion
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fuse-fp-ops=xxx option.
Definition: TargetOptions.h:368
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:52
UnrollRuntimeLocal
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
llvm::ExtractValueInst::getIndices
ArrayRef< unsigned > getIndices() const
Definition: Instructions.h:2383
llvm::R600TTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(bool Vec) const
Definition: AMDGPUTargetTransformInfo.cpp:1258
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:212
llvm::GCNTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:735
llvm::IntrinsicCostAttributes::getArgs
const SmallVectorImpl< const Value * > & getArgs() const
Definition: TargetTransformInfo.h:151
AMDGPUTargetMachine.h
llvm::CallBase::args
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1322
llvm::TargetLowering::AsmOperandInfoVector
std::vector< AsmOperandInfo > AsmOperandInfoVector
Definition: TargetLowering.h:4158
llvm::AMDGPUAS::BUFFER_FAT_POINTER
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
Definition: AMDGPU.h:382
llvm::GCNSubtarget::hasUsableDivScaleConditionOutput
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:406
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
llvm::BasicTTIImplBase< GCNTTIImpl >::getVectorInstrCost
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:978
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:498
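A hedged sketch of the byte footprint of an alloca's element type, the kind of quantity heuristics compare against thresholds such as amdgpu-inline-arg-alloca-cutoff (helper name illustrative):

#include <cstdint>
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Allocation size in bytes, including padding, of the type AI allocates.
static uint64_t allocaBytes(const DataLayout &DL, const AllocaInst &AI) {
  return DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize();
}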
llvm::ISD::FDIV
@ FDIV
Definition: ISDOpcodes.h:374
llvm::GCNTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:1237