AMDGPUTargetTransformInfo.cpp
1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
18 #include "AMDGPUTargetMachine.h"
20 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/IR/IRBuilder.h"
23 #include "llvm/IR/IntrinsicsAMDGPU.h"
24 #include "llvm/IR/PatternMatch.h"
25 #include "llvm/Support/KnownBits.h"
26 
27 using namespace llvm;
28 
29 #define DEBUG_TYPE "AMDGPUtti"
30 
32  "amdgpu-unroll-threshold-private",
33  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
34  cl::init(2700), cl::Hidden);
35 
37  "amdgpu-unroll-threshold-local",
38  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
39  cl::init(1000), cl::Hidden);
40 
42  "amdgpu-unroll-threshold-if",
43  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
44  cl::init(200), cl::Hidden);
45 
47  "amdgpu-unroll-runtime-local",
48  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
49  cl::init(true), cl::Hidden);
50 
52  "amdgpu-use-legacy-divergence-analysis",
53  cl::desc("Enable legacy divergence analysis for AMDGPU"),
54  cl::init(false), cl::Hidden);
55 
57  "amdgpu-unroll-max-block-to-analyze",
58  cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
59  cl::init(32), cl::Hidden);
60 
61 static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
62  cl::Hidden, cl::init(4000),
63  cl::desc("Cost of alloca argument"));
64 
65 // If the amount of scratch memory to eliminate exceeds our ability to allocate
66 // it into registers we gain nothing by aggressively inlining functions for that
67 // heuristic.
68 static cl::opt<unsigned>
69  ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
70  cl::init(256),
71  cl::desc("Maximum alloca size to use for inline cost"));
72 
73 // Inliner constraint to achieve reasonable compilation time.
75  "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
76  cl::desc("Maximum number of BBs allowed in a function after inlining"
77  " (compile time constraint)"));
78 
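// Returns true if the branch condition Cond (followed transitively through its
// operands, up to a small recursion depth) uses a PHI defined in loop L itself
// rather than in one of its subloops. Such "if" conditions are the ones the
// unrolling heuristic below rewards, since unrolling can fold away the PHI and
// the branch.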
79 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
80  unsigned Depth = 0) {
81  const Instruction *I = dyn_cast<Instruction>(Cond);
82  if (!I)
83  return false;
84 
85  for (const Value *V : I->operand_values()) {
86  if (!L->contains(I))
87  continue;
88  if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
89  if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
90  return SubLoop->contains(PHI); }))
91  return true;
92  } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
93  return true;
94  }
95  return false;
96 }
97 
98 AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
99  : BaseT(TM, F.getParent()->getDataLayout()),
100  TargetTriple(TM->getTargetTriple()),
101  ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
102  TLI(ST->getTargetLowering()) {}
103 
104 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
105                                             TTI::UnrollingPreferences &UP,
106                                             OptimizationRemarkEmitter *ORE) {
107  const Function &F = *L->getHeader()->getParent();
108  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
109  UP.MaxCount = std::numeric_limits<unsigned>::max();
110  UP.Partial = true;
111 
112  // Conditional branch in a loop back edge needs 3 additional exec
113  // manipulations on average.
114  UP.BEInsns += 3;
115 
116  // TODO: Do we want runtime unrolling?
117 
118  // Maximum alloca size that can fit in registers. Reserve 16 registers.
119  const unsigned MaxAlloca = (256 - 16) * 4;
120  unsigned ThresholdPrivate = UnrollThresholdPrivate;
121  unsigned ThresholdLocal = UnrollThresholdLocal;
122 
123  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
124  // provided threshold value as the default for Threshold
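  // For example, a loop whose !llvm.loop metadata carries the operand
  //   !{!"amdgpu.loop.unroll.threshold", i32 100}
  // starts out here with Threshold = PartialThreshold = 100.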
125  if (MDNode *LoopUnrollThreshold =
126  findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
127  if (LoopUnrollThreshold->getNumOperands() == 2) {
128  ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
129  LoopUnrollThreshold->getOperand(1));
130  if (MetaThresholdValue) {
131  // We will also use the supplied value for PartialThreshold for now.
132  // We may introduce additional metadata if it becomes necessary in the
133  // future.
134  UP.Threshold = MetaThresholdValue->getSExtValue();
135  UP.PartialThreshold = UP.Threshold;
136  ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
137  ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
138  }
139  }
140  }
141 
142  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
143  for (const BasicBlock *BB : L->getBlocks()) {
144  const DataLayout &DL = BB->getModule()->getDataLayout();
145  unsigned LocalGEPsSeen = 0;
146 
147  if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
148  return SubLoop->contains(BB); }))
149  continue; // Block belongs to an inner loop.
150 
151  for (const Instruction &I : *BB) {
152  // Unroll a loop which contains an "if" statement whose condition is
153  // defined by a PHI belonging to the loop. This may help to eliminate the
154  // if region and potentially even the PHI itself, saving on both divergence
155  // and registers used for the PHI.
156  // Add a small bonus for each such "if" statement.
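  // Each such branch bumps UP.Threshold by UnrollThresholdIf (200 by default),
  // and the boosting stops once the threshold reaches MaxBoost.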
157  if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
158  if (UP.Threshold < MaxBoost && Br->isConditional()) {
159  BasicBlock *Succ0 = Br->getSuccessor(0);
160  BasicBlock *Succ1 = Br->getSuccessor(1);
161  if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
162  (L->contains(Succ1) && L->isLoopExiting(Succ1)))
163  continue;
164  if (dependsOnLocalPhi(L, Br->getCondition())) {
165  UP.Threshold += UnrollThresholdIf;
166  LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
167  << " for loop:\n"
168  << *L << " due to " << *Br << '\n');
169  if (UP.Threshold >= MaxBoost)
170  return;
171  }
172  }
173  continue;
174  }
175 
176  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
177  if (!GEP)
178  continue;
179 
180  unsigned AS = GEP->getAddressSpace();
181  unsigned Threshold = 0;
182  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
183  Threshold = ThresholdPrivate;
184  else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
185  Threshold = ThresholdLocal;
186  else
187  continue;
188 
189  if (UP.Threshold >= Threshold)
190  continue;
191 
192  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
193  const Value *Ptr = GEP->getPointerOperand();
194  const AllocaInst *Alloca =
195  dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
196  if (!Alloca || !Alloca->isStaticAlloca())
197  continue;
198  Type *Ty = Alloca->getAllocatedType();
199  unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
200  if (AllocaSize > MaxAlloca)
201  continue;
202  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
203  AS == AMDGPUAS::REGION_ADDRESS) {
204  LocalGEPsSeen++;
205  // Inhibit unroll for local memory if we have seen addressing not based on
206  // a variable; most likely we will be unable to combine it.
207  // Do not unroll too deep inner loops for local memory to give a chance
208  // to unroll an outer loop for a more important reason.
209  if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
210  (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
211  !isa<Argument>(GEP->getPointerOperand())))
212  continue;
213  LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
214  << *L << " due to LDS use.\n");
216  }
217 
218  // Check if GEP depends on a value defined by this loop itself.
219  bool HasLoopDef = false;
220  for (const Value *Op : GEP->operands()) {
221  const Instruction *Inst = dyn_cast<Instruction>(Op);
222  if (!Inst || L->isLoopInvariant(Op))
223  continue;
224 
225  if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
226  return SubLoop->contains(Inst); }))
227  continue;
228  HasLoopDef = true;
229  break;
230  }
231  if (!HasLoopDef)
232  continue;
233 
234  // We want to do whatever we can to limit the number of alloca
235  // instructions that make it through to the code generator. allocas
236  // require us to use indirect addressing, which is slow and prone to
237  // compiler bugs. If this loop does an address calculation on an
238  // alloca ptr, then we want to use a higher than normal loop unroll
239  // threshold. This will give SROA a better chance to eliminate these
240  // allocas.
241  //
242  // We also want to have more unrolling for local memory to let ds
243  // instructions with different offsets combine.
244  //
245  // Don't use the maximum allowed value here as it will make some
246  // programs way too big.
247  UP.Threshold = Threshold;
248  LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
249  << " for loop:\n"
250  << *L << " due to " << *GEP << '\n');
251  if (UP.Threshold >= MaxBoost)
252  return;
253  }
254 
255  // If we got a GEP in a small BB from inner loop then increase max trip
256  // count to analyze for better estimation cost in unroll
257  if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
258  UP.MaxIterationsCountToAnalyze = 32;
259  }
260 }
261 
262 void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
263                                           TTI::PeelingPreferences &PP) {
264  BaseT::getPeelingPreferences(L, SE, PP);
265 }
266 
267 const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
268  // Codegen control options which don't matter.
269  AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
270  AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
271  AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
272  AMDGPU::FeatureUnalignedAccessMode,
273 
274  AMDGPU::FeatureAutoWaitcntBeforeBarrier,
275 
276  // Property of the kernel/environment which can't actually differ.
277  AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
278  AMDGPU::FeatureTrapHandler,
279 
280  // The default assumption needs to be that ECC is enabled, but no directly
281  // exposed operations depend on it, so it can be safely inlined.
282  AMDGPU::FeatureSRAMECC,
283 
284  // Perf-tuning features
285  AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
286 
287 GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
288  : BaseT(TM, F.getParent()->getDataLayout()),
289  ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
290  TLI(ST->getTargetLowering()), CommonTTI(TM, F),
291  IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
292  MaxVGPRs(ST->getMaxNumVGPRs(
293  std::max(ST->getWavesPerEU(F).first,
294  ST->getWavesPerEUForWorkGroup(
295  ST->getFlatWorkGroupSizes(F).second)))) {
296  AMDGPU::SIModeRegisterDefaults Mode(F);
297  HasFP32Denormals = Mode.allFP32Denormals();
298  HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
299 }
300 
301 unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
302  // The concept of vector registers doesn't really exist. Some packed vector
303  // operations operate on the normal 32-bit registers.
304  return MaxVGPRs;
305 }
306 
307 unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
308  // This is really the number of registers to fill when vectorizing /
309  // interleaving loops, so we lie to avoid trying to use all registers.
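  // E.g. with the full 256-VGPR file available this reports 32, leaving
  // headroom for values that are live outside the vectorized loop.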
310  return getHardwareNumberOfRegisters(Vec) >> 3;
311 }
312 
313 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
314  const SIRegisterInfo *TRI = ST->getRegisterInfo();
315  const TargetRegisterClass *RC = TRI->getRegClass(RCID);
316  unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
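  // E.g. a 64-bit register class occupies (64 + 31) / 32 = 2 VGPRs, so only
  // half as many registers of that class are reported.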
317  return getHardwareNumberOfRegisters(false) / NumVGPRs;
318 }
319 
320 TypeSize
321 GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
322  switch (K) {
323  case TargetTransformInfo::RGK_Scalar:
324  return TypeSize::getFixed(32);
325  case TargetTransformInfo::RGK_FixedWidthVector:
326  return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
327  case TargetTransformInfo::RGK_ScalableVector:
328  return TypeSize::getScalable(0);
329  }
330  llvm_unreachable("Unsupported register kind");
331 }
332 
333 unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
334  return 32;
335 }
336 
337 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
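  // Loads and stores may be vectorized up to a 128-bit access, e.g. 16 x i8 or
  // 4 x i32. Other opcodes get a VF of 2 only for 16-bit elements (or 32-bit
  // elements on subtargets with packed FP32 ops), otherwise 1.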
338  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
339  return 32 * 4 / ElemWidth;
340  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
341  : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
342  : 1;
343 }
344 
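// Clamp the vectorization factor so that a chain of sub-dword elements does
// not exceed a single 128-bit load, e.g. 32 x i8 (256 bits) is reduced to
// 128 / 8 = 16 elements.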
345 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
346  unsigned ChainSizeInBytes,
347  VectorType *VecTy) const {
348  unsigned VecRegBitWidth = VF * LoadSize;
349  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
350  // TODO: Support element-size less than 32bit?
351  return 128 / LoadSize;
352 
353  return VF;
354 }
355 
356 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
357  unsigned ChainSizeInBytes,
358  VectorType *VecTy) const {
359  unsigned VecRegBitWidth = VF * StoreSize;
360  if (VecRegBitWidth > 128)
361  return 128 / StoreSize;
362 
363  return VF;
364 }
365 
366 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
367  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
368  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
369  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
370  AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
371  return 512;
372  }
373 
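  // Private (scratch) accesses are limited by the subtarget's maximum private
  // element size, e.g. a 4-byte limit yields 32-bit vector accesses here.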
374  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
375  return 8 * ST->getMaxPrivateElementSize();
376 
377  // Common to flat, global, local and region. Assume for unknown addrspace.
378  return 128;
379 }
380 
381 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
382  Align Alignment,
383  unsigned AddrSpace) const {
384  // We allow vectorization of flat stores, even though we may need to decompose
385  // them later if they may access private memory. We don't have enough context
386  // here, and legalization can handle it.
387  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
388  return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
389  ChainSizeInBytes <= ST->getMaxPrivateElementSize();
390  }
391  return true;
392 }
393 
394 bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
395  Align Alignment,
396  unsigned AddrSpace) const {
397  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
398 }
399 
400 bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
401  Align Alignment,
402  unsigned AddrSpace) const {
403  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
404 }
405 
406 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
407 // iteration. Should we report a larger size and let it legalize?
408 //
409 // FIXME: Should we use narrower types for local/region, or account for when
410 // unaligned access is legal?
411 //
412 // FIXME: This could use fine tuning and microbenchmarks.
413 Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
414  unsigned SrcAddrSpace,
415  unsigned DestAddrSpace,
416  unsigned SrcAlign,
417  unsigned DestAlign) const {
418  unsigned MinAlign = std::min(SrcAlign, DestAlign);
419 
420  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
421  // hardware into byte accesses. If you assume all alignments are equally
422  // probable, it's more efficient on average to use short accesses for this
423  // case.
424  if (MinAlign == 2)
425  return Type::getInt16Ty(Context);
426 
427  // Not all subtargets have 128-bit DS instructions, and we currently don't
428  // form them by default.
429  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
430  SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
431  DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
432  DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
433  return FixedVectorType::get(Type::getInt32Ty(Context), 2);
434  }
435 
436  // Global memory works best with 16-byte accesses. Private memory will also
437  // hit this, although they'll be decomposed.
438  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
439 }
440 
441 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
442  SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
443  unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
444  unsigned SrcAlign, unsigned DestAlign) const {
445  assert(RemainingBytes < 16);
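  // E.g. a 15-byte residual with dword alignment is emitted as i64 + i32 +
  // i16 + i8, while with 2-byte alignment it becomes 7 x i16 + i8.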
446 
447  unsigned MinAlign = std::min(SrcAlign, DestAlign);
448 
449  if (MinAlign != 2) {
450  Type *I64Ty = Type::getInt64Ty(Context);
451  while (RemainingBytes >= 8) {
452  OpsOut.push_back(I64Ty);
453  RemainingBytes -= 8;
454  }
455 
456  Type *I32Ty = Type::getInt32Ty(Context);
457  while (RemainingBytes >= 4) {
458  OpsOut.push_back(I32Ty);
459  RemainingBytes -= 4;
460  }
461  }
462 
463  Type *I16Ty = Type::getInt16Ty(Context);
464  while (RemainingBytes >= 2) {
465  OpsOut.push_back(I16Ty);
466  RemainingBytes -= 2;
467  }
468 
469  Type *I8Ty = Type::getInt8Ty(Context);
470  while (RemainingBytes) {
471  OpsOut.push_back(I8Ty);
472  --RemainingBytes;
473  }
474 }
475 
476 unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
477  // Disable unrolling if the loop is not vectorized.
478  // TODO: Enable this again.
479  if (VF == 1)
480  return 1;
481 
482  return 8;
483 }
484 
485 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
486  MemIntrinsicInfo &Info) const {
487  switch (Inst->getIntrinsicID()) {
488  case Intrinsic::amdgcn_atomic_inc:
489  case Intrinsic::amdgcn_atomic_dec:
490  case Intrinsic::amdgcn_ds_ordered_add:
491  case Intrinsic::amdgcn_ds_ordered_swap:
492  case Intrinsic::amdgcn_ds_fadd:
493  case Intrinsic::amdgcn_ds_fmin:
494  case Intrinsic::amdgcn_ds_fmax: {
495  auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
496  auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
497  if (!Ordering || !Volatile)
498  return false; // Invalid.
499 
500  unsigned OrderingVal = Ordering->getZExtValue();
501  if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
502  return false;
503 
504  Info.PtrVal = Inst->getArgOperand(0);
505  Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
506  Info.ReadMem = true;
507  Info.WriteMem = true;
508  Info.IsVolatile = !Volatile->isZero();
509  return true;
510  }
511  default:
512  return false;
513  }
514 }
515 
516 InstructionCost GCNTTIImpl::getArithmeticInstrCost(
517  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
518  TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
519  TTI::OperandValueProperties Opd1PropInfo,
520  TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
521  const Instruction *CxtI) {
522  EVT OrigTy = TLI->getValueType(DL, Ty);
523  if (!OrigTy.isSimple()) {
524  // FIXME: We're having to query the throughput cost so that the basic
525  // implementation tries to generate legalize and scalarization costs. Maybe
526  // we could hoist the scalarization code here?
527  if (CostKind != TTI::TCK_CodeSize)
528  return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
529  Opd1Info, Opd2Info, Opd1PropInfo,
530  Opd2PropInfo, Args, CxtI);
531  // Scalarization
532 
533  // Check if any of the operands are vector operands.
534  int ISD = TLI->InstructionOpcodeToISD(Opcode);
535  assert(ISD && "Invalid opcode");
536 
537  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
538 
539  bool IsFloat = Ty->isFPOrFPVectorTy();
540  // Assume that floating point arithmetic operations cost twice as much as
541  // integer operations.
542  unsigned OpCost = (IsFloat ? 2 : 1);
543 
544  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
545  // The operation is legal. Assume it costs 1.
546  // TODO: Once we have extract/insert subvector cost we need to use them.
547  return LT.first * OpCost;
548  }
549 
550  if (!TLI->isOperationExpand(ISD, LT.second)) {
551  // If the operation is custom lowered, then assume that the code is twice
552  // as expensive.
553  return LT.first * 2 * OpCost;
554  }
555 
556  // Else, assume that we need to scalarize this op.
557  // TODO: If one of the types get legalized by splitting, handle this
558  // similarly to what getCastInstrCost() does.
559  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
560  unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
561  InstructionCost Cost = getArithmeticInstrCost(
562  Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
563  Opd1PropInfo, Opd2PropInfo, Args, CxtI);
564  // Return the cost of multiple scalar invocation plus the cost of
565  // inserting and extracting the values.
566  SmallVector<Type *> Tys(Args.size(), Ty);
567  return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
568  }
569 
570  // We don't know anything about this scalar instruction.
571  return OpCost;
572  }
573 
574  // Legalize the type.
575  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
576  int ISD = TLI->InstructionOpcodeToISD(Opcode);
577 
578  // Because we don't have any legal vector operations, but the legal types, we
579  // need to account for split vectors.
580  unsigned NElts = LT.second.isVector() ?
581  LT.second.getVectorNumElements() : 1;
582 
583  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
584 
585  switch (ISD) {
586  case ISD::SHL:
587  case ISD::SRL:
588  case ISD::SRA:
589  if (SLT == MVT::i64)
590  return get64BitInstrCost(CostKind) * LT.first * NElts;
591 
592  if (ST->has16BitInsts() && SLT == MVT::i16)
593  NElts = (NElts + 1) / 2;
594 
595  // i32
596  return getFullRateInstrCost() * LT.first * NElts;
597  case ISD::ADD:
598  case ISD::SUB:
599  case ISD::AND:
600  case ISD::OR:
601  case ISD::XOR:
602  if (SLT == MVT::i64) {
603  // and, or and xor are typically split into 2 VALU instructions.
604  return 2 * getFullRateInstrCost() * LT.first * NElts;
605  }
606 
607  if (ST->has16BitInsts() && SLT == MVT::i16)
608  NElts = (NElts + 1) / 2;
609 
610  return LT.first * NElts * getFullRateInstrCost();
611  case ISD::MUL: {
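  // The i64 case below is modeled as roughly four quarter-rate 32-bit
  // multiplies plus four full-rate adds/carries to combine the partial
  // products.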
612  const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
613  if (SLT == MVT::i64) {
614  const int FullRateCost = getFullRateInstrCost();
615  return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
616  }
617 
618  if (ST->has16BitInsts() && SLT == MVT::i16)
619  NElts = (NElts + 1) / 2;
620 
621  // i32
622  return QuarterRateCost * NElts * LT.first;
623  }
624  case ISD::FMUL:
625  // Check for a possible fusion {fadd|fsub}(a, fmul(b,c)) and return zero
626  // cost for the fmul(b,c), supposing the fadd|fsub will get the estimated
627  // cost of the whole fused operation.
628  if (CxtI && CxtI->hasOneUse())
629  if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
630  const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
631  if (OPC == ISD::FADD || OPC == ISD::FSUB) {
632  if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
633  return TargetTransformInfo::TCC_Free;
634  if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
635  return TargetTransformInfo::TCC_Free;
636 
637  // Estimate all types may be fused with contract/unsafe flags
638  const TargetOptions &Options = TLI->getTargetMachine().Options;
639  if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
640  Options.UnsafeFPMath ||
641  (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
642  return TargetTransformInfo::TCC_Free;
643  }
644  }
645  LLVM_FALLTHROUGH;
646  case ISD::FADD:
647  case ISD::FSUB:
648  if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
649  NElts = (NElts + 1) / 2;
650  if (SLT == MVT::f64)
651  return LT.first * NElts * get64BitInstrCost(CostKind);
652 
653  if (ST->has16BitInsts() && SLT == MVT::f16)
654  NElts = (NElts + 1) / 2;
655 
656  if (SLT == MVT::f32 || SLT == MVT::f16)
657  return LT.first * NElts * getFullRateInstrCost();
658  break;
659  case ISD::FDIV:
660  case ISD::FREM:
661  // FIXME: frem should be handled separately. The fdiv in it is most of it,
662  // but the current lowering is also not entirely correct.
663  if (SLT == MVT::f64) {
664  int Cost = 7 * get64BitInstrCost(CostKind) +
665  getQuarterRateInstrCost(CostKind) +
666  3 * getHalfRateInstrCost(CostKind);
667  // Add cost of workaround.
668  if (!ST->hasUsableDivScaleConditionOutput())
669  Cost += 3 * getFullRateInstrCost();
670 
671  return LT.first * Cost * NElts;
672  }
673 
674  if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
675  // TODO: This is more complicated, unsafe flags etc.
676  if ((SLT == MVT::f32 && !HasFP32Denormals) ||
677  (SLT == MVT::f16 && ST->has16BitInsts())) {
678  return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
679  }
680  }
681 
682  if (SLT == MVT::f16 && ST->has16BitInsts()) {
683  // 2 x v_cvt_f32_f16
684  // f32 rcp
685  // f32 fmul
686  // v_cvt_f16_f32
687  // f16 div_fixup
688  int Cost =
689  4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
690  return LT.first * Cost * NElts;
691  }
692 
693  if (SLT == MVT::f32 || SLT == MVT::f16) {
694  // 4 more v_cvt_* insts without f16 insts support
695  int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
696  1 * getQuarterRateInstrCost(CostKind);
697 
698  if (!HasFP32Denormals) {
699  // FP mode switches.
700  Cost += 2 * getFullRateInstrCost();
701  }
702 
703  return LT.first * NElts * Cost;
704  }
705  break;
706  case ISD::FNEG:
707  // Use the backend's estimation. If fneg is not free each element will cost
708  // one additional instruction.
709  return TLI->isFNegFree(SLT) ? 0 : NElts;
710  default:
711  break;
712  }
713 
714  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
715  Opd1PropInfo, Opd2PropInfo, Args, CxtI);
716 }
717 
718 // Return true if there's a potential benefit from using v2f16/v2i16
719 // instructions for an intrinsic, even if it requires nontrivial legalization.
720 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
721  switch (ID) {
722  case Intrinsic::fma: // TODO: fmuladd
723  // There's a small benefit to using vector ops in the legalized code.
724  case Intrinsic::round:
725  case Intrinsic::uadd_sat:
726  case Intrinsic::usub_sat:
727  case Intrinsic::sadd_sat:
728  case Intrinsic::ssub_sat:
729  return true;
730  default:
731  return false;
732  }
733 }
734 
735 InstructionCost
736 GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
737  TTI::TargetCostKind CostKind) {
738  if (ICA.getID() == Intrinsic::fabs)
739  return 0;
740 
741  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
742  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
743 
744  Type *RetTy = ICA.getReturnType();
745  EVT OrigTy = TLI->getValueType(DL, RetTy);
746  if (!OrigTy.isSimple()) {
747  if (CostKind != TTI::TCK_CodeSize)
748  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
749 
750  // TODO: Combine these two logic paths.
751  if (ICA.isTypeBasedOnly())
752  return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
753 
754  unsigned RetVF =
755  (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
756  : 1);
757  const IntrinsicInst *I = ICA.getInst();
758  const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
759  FastMathFlags FMF = ICA.getFlags();
760  // Assume that we need to scalarize this intrinsic.
761 
762  // Compute the scalarization overhead based on Args for a vector
763  // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
764  // CostModel will pass a vector RetTy and VF is 1.
765  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
766  if (RetVF > 1) {
767  ScalarizationCost = 0;
768  if (!RetTy->isVoidTy())
769  ScalarizationCost +=
770  getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
771  ScalarizationCost +=
772  getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
773  }
774 
775  IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
776  ScalarizationCost);
777  return getIntrinsicInstrCost(Attrs, CostKind);
778  }
779 
780  // Legalize the type.
781  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
782 
783  unsigned NElts = LT.second.isVector() ?
784  LT.second.getVectorNumElements() : 1;
785 
786  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
787 
788  if (SLT == MVT::f64)
789  return LT.first * NElts * get64BitInstrCost(CostKind);
790 
791  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
792  (ST->hasPackedFP32Ops() && SLT == MVT::f32))
793  NElts = (NElts + 1) / 2;
794 
795  // TODO: Get more refined intrinsic costs?
796  unsigned InstRate = getQuarterRateInstrCost(CostKind);
797 
798  switch (ICA.getID()) {
799  case Intrinsic::fma:
800  InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
801  : getQuarterRateInstrCost(CostKind);
802  break;
803  case Intrinsic::uadd_sat:
804  case Intrinsic::usub_sat:
805  case Intrinsic::sadd_sat:
806  case Intrinsic::ssub_sat:
807  static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
808  if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
809  NElts = 1;
810  break;
811  }
812 
813  return LT.first * NElts * InstRate;
814 }
815 
816 InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
817  TTI::TargetCostKind CostKind,
818  const Instruction *I) {
819  assert((I == nullptr || I->getOpcode() == Opcode) &&
820  "Opcode should reflect passed instruction.");
821  const bool SCost =
822  (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
823  const int CBrCost = SCost ? 5 : 7;
824  switch (Opcode) {
825  case Instruction::Br: {
826  // Branch instruction takes about 4 slots on gfx900.
827  auto BI = dyn_cast_or_null<BranchInst>(I);
828  if (BI && BI->isUnconditional())
829  return SCost ? 1 : 4;
830  // Suppose a conditional branch takes 3 additional exec manipulation
831  // instructions on average.
832  return CBrCost;
833  }
834  case Instruction::Switch: {
835  auto SI = dyn_cast_or_null<SwitchInst>(I);
836  // Each case (including default) takes 1 cmp + 1 cbr instruction on
837  // average.
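  // E.g. a switch with 3 cases is costed as (3 + 1) * (CBrCost + 1).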
838  return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
839  }
840  case Instruction::Ret:
841  return SCost ? 1 : 10;
842  }
843  return BaseT::getCFInstrCost(Opcode, CostKind, I);
844 }
845 
846 InstructionCost
847 GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
848  Optional<FastMathFlags> FMF,
849  TTI::TargetCostKind CostKind) {
850  if (TTI::requiresOrderedReduction(FMF))
851  return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
852 
853  EVT OrigTy = TLI->getValueType(DL, Ty);
854 
855  // Computes cost on targets that have packed math instructions (which support
856  // 16-bit types only).
857  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
858  return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
859 
860  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
861  return LT.first * getFullRateInstrCost();
862 }
863 
864 InstructionCost
865 GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
866  bool IsUnsigned,
867  TTI::TargetCostKind CostKind) {
868  EVT OrigTy = TLI->getValueType(DL, Ty);
869 
870  // Computes cost on targets that have packed math instructions (which support
871  // 16-bit types only).
872  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
873  return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
874 
875  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
876  return LT.first * getHalfRateInstrCost(CostKind);
877 }
878 
879 InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
880  unsigned Index) {
881  switch (Opcode) {
882  case Instruction::ExtractElement:
883  case Instruction::InsertElement: {
884  unsigned EltSize
885  = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
886  if (EltSize < 32) {
887  if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
888  return 0;
889  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
890  }
891 
892  // Extracts are just reads of a subregister, so are free. Inserts are
893  // considered free because we don't want to have any cost for scalarizing
894  // operations, and we don't have to copy into a different register class.
895 
896  // Dynamic indexing isn't free and is best avoided.
897  return Index == ~0u ? 2 : 0;
898  }
899  default:
900  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
901  }
902 }
903 
904 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
905 /// this is analyzing the collective result of all output registers. Otherwise,
906 /// this is only querying a specific result index if this returns multiple
907 /// registers in a struct.
908 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
909  const CallInst *CI, ArrayRef<unsigned> Indices) const {
910  // TODO: Handle complex extract indices
911  if (Indices.size() > 1)
912  return true;
913 
914  const DataLayout &DL = CI->getModule()->getDataLayout();
915  const SIRegisterInfo *TRI = ST->getRegisterInfo();
916  TargetLowering::AsmOperandInfoVector TargetConstraints =
917  TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
918 
919  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
920 
921  int OutputIdx = 0;
922  for (auto &TC : TargetConstraints) {
923  if (TC.Type != InlineAsm::isOutput)
924  continue;
925 
926  // Skip outputs we don't care about.
927  if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
928  continue;
929 
930  TLI->ComputeConstraintToUse(TC, SDValue());
931 
932  Register AssignedReg;
933  const TargetRegisterClass *RC;
934  std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
935  TRI, TC.ConstraintCode, TC.ConstraintVT);
936  if (AssignedReg) {
937  // FIXME: This is a workaround for getRegForInlineAsmConstraint
938  // returning VS_32
939  RC = TRI->getPhysRegClass(AssignedReg);
940  }
941 
942  // For AGPR constraints null is returned on subtargets without AGPRs, so
943  // assume divergent for null.
944  if (!RC || !TRI->isSGPRClass(RC))
945  return true;
946  }
947 
948  return false;
949 }
950 
951 /// \returns true if the new GPU divergence analysis is enabled.
952 bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
953  return !UseLegacyDA;
954 }
955 
956 /// \returns true if the result of the value could potentially be
957 /// different across workitems in a wavefront.
958 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
959  if (const Argument *A = dyn_cast<Argument>(V))
960  return !AMDGPU::isArgPassedInSGPR(A);
961 
962  // Loads from the private and flat address spaces are divergent, because
963  // threads can execute the load instruction with the same inputs and get
964  // different results.
965  //
966  // All other loads are not divergent, because if threads issue loads with the
967  // same arguments, they will always get the same result.
968  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
969  return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
970  Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
971 
972  // Atomics are divergent because they are executed sequentially: when an
973  // atomic operation refers to the same address in each thread, then each
974  // thread after the first sees the value written by the previous thread as
975  // the original value.
976  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
977  return true;
978 
979  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
980  return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
981 
982  // Assume all function calls are a source of divergence.
983  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
984  if (CI->isInlineAsm())
985  return isInlineAsmSourceOfDivergence(CI);
986  return true;
987  }
988 
989  // Assume all function calls are a source of divergence.
990  if (isa<InvokeInst>(V))
991  return true;
992 
993  return false;
994 }
995 
996 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
997  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
998  switch (Intrinsic->getIntrinsicID()) {
999  default:
1000  return false;
1001  case Intrinsic::amdgcn_readfirstlane:
1002  case Intrinsic::amdgcn_readlane:
1003  case Intrinsic::amdgcn_icmp:
1004  case Intrinsic::amdgcn_fcmp:
1005  case Intrinsic::amdgcn_ballot:
1006  case Intrinsic::amdgcn_if_break:
1007  return true;
1008  }
1009  }
1010 
1011  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1012  if (CI->isInlineAsm())
1013  return !isInlineAsmSourceOfDivergence(CI);
1014  return false;
1015  }
1016 
1017  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1018  if (!ExtValue)
1019  return false;
1020 
1021  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1022  if (!CI)
1023  return false;
1024 
1025  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1026  switch (Intrinsic->getIntrinsicID()) {
1027  default:
1028  return false;
1029  case Intrinsic::amdgcn_if:
1030  case Intrinsic::amdgcn_else: {
1031  ArrayRef<unsigned> Indices = ExtValue->getIndices();
1032  return Indices.size() == 1 && Indices[0] == 1;
1033  }
1034  }
1035  }
1036 
1037  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1038  // divergent for the overall struct return. We need to override it in the
1039  // case we're extracting an SGPR component here.
1040  if (CI->isInlineAsm())
1041  return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1042 
1043  return false;
1044 }
1045 
1046 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1047  Intrinsic::ID IID) const {
1048  switch (IID) {
1049  case Intrinsic::amdgcn_atomic_inc:
1050  case Intrinsic::amdgcn_atomic_dec:
1051  case Intrinsic::amdgcn_ds_fadd:
1052  case Intrinsic::amdgcn_ds_fmin:
1053  case Intrinsic::amdgcn_ds_fmax:
1054  case Intrinsic::amdgcn_is_shared:
1055  case Intrinsic::amdgcn_is_private:
1056  OpIndexes.push_back(0);
1057  return true;
1058  default:
1059  return false;
1060  }
1061 }
1062 
1063 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1064  Value *OldV,
1065  Value *NewV) const {
1066  auto IntrID = II->getIntrinsicID();
1067  switch (IntrID) {
1068  case Intrinsic::amdgcn_atomic_inc:
1069  case Intrinsic::amdgcn_atomic_dec:
1070  case Intrinsic::amdgcn_ds_fadd:
1071  case Intrinsic::amdgcn_ds_fmin:
1072  case Intrinsic::amdgcn_ds_fmax: {
1073  const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
1074  if (!IsVolatile->isZero())
1075  return nullptr;
1076  Module *M = II->getParent()->getParent()->getParent();
1077  Type *DestTy = II->getType();
1078  Type *SrcTy = NewV->getType();
1079  Function *NewDecl =
1080  Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
1081  II->setArgOperand(0, NewV);
1082  II->setCalledFunction(NewDecl);
1083  return II;
1084  }
1085  case Intrinsic::amdgcn_is_shared:
1086  case Intrinsic::amdgcn_is_private: {
1087  unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1088  AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1089  unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1090  LLVMContext &Ctx = NewV->getType()->getContext();
1091  ConstantInt *NewVal = (TrueAS == NewAS) ?
1092  ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1093  return NewVal;
1094  }
1095  case Intrinsic::ptrmask: {
1096  unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1097  unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1098  Value *MaskOp = II->getArgOperand(1);
1099  Type *MaskTy = MaskOp->getType();
1100 
1101  bool DoTruncate = false;
1102 
1103  const GCNTargetMachine &TM =
1104  static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1105  if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1106  // All valid 64-bit to 32-bit casts work by chopping off the high
1107  // bits. Any masking only clearing the low bits will also apply in the new
1108  // address space.
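  // E.g. a mask of 0xffffffffffff0000 (at least 32 known leading ones) only
  // clears low bits, so it remains correct after truncation to 32 bits.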
1109  if (DL.getPointerSizeInBits(OldAS) != 64 ||
1110  DL.getPointerSizeInBits(NewAS) != 32)
1111  return nullptr;
1112 
1113  // TODO: Do we need to thread more context in here?
1114  KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
1115  if (Known.countMinLeadingOnes() < 32)
1116  return nullptr;
1117 
1118  DoTruncate = true;
1119  }
1120 
1121  IRBuilder<> B(II);
1122  if (DoTruncate) {
1123  MaskTy = B.getInt32Ty();
1124  MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1125  }
1126 
1127  return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1128  {NewV, MaskOp});
1129  }
1130  default:
1131  return nullptr;
1132  }
1133 }
1134 
1135 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1136  VectorType *VT, ArrayRef<int> Mask,
1137  int Index, VectorType *SubTp) {
1138  Kind = improveShuffleKindFromMask(Kind, Mask);
1139  if (ST->hasVOP3PInsts()) {
1140  if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1141  DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1142  // With op_sel, VOP3P instructions can freely access the low half or high
1143  // half of a register, so any swizzle is free.
1144 
1145  switch (Kind) {
1146  case TTI::SK_Broadcast:
1147  case TTI::SK_Reverse:
1148  case TTI::SK_PermuteSingleSrc:
1149  return 0;
1150  default:
1151  break;
1152  }
1153  }
1154  }
1155 
1156  return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp);
1157 }
1158 
1159 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1160  const Function *Callee) const {
1161  const TargetMachine &TM = getTLI()->getTargetMachine();
1162  const GCNSubtarget *CallerST
1163  = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1164  const GCNSubtarget *CalleeST
1165  = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1166 
1167  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1168  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1169 
1170  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1171  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1172  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1173  return false;
1174 
1175  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1176  // no way to support merge for backend defined attributes.
1177  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
1178  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
1179  if (!CallerMode.isInlineCompatible(CalleeMode))
1180  return false;
1181 
1182  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1183  Callee->hasFnAttribute(Attribute::InlineHint))
1184  return true;
1185 
1186  // Hack to make compile times reasonable.
1187  if (InlineMaxBB) {
1188  // Single BB does not increase total BB amount.
1189  if (Callee->size() == 1)
1190  return true;
1191  size_t BBSize = Caller->size() + Callee->size() - 1;
1192  return BBSize <= InlineMaxBB;
1193  }
1194 
1195  return true;
1196 }
1197 
1198 unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1199  // If we have a pointer to a private array passed into a function
1200  // it will not be optimized out, leaving scratch usage.
1201  // Increase the inline threshold to allow inlining in this case.
1202  uint64_t AllocaSize = 0;
1203  SmallPtrSet<const AllocaInst *, 8> AIVisited;
1204  for (Value *PtrArg : CB->args()) {
1205  PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1206  if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
1207  Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
1208  continue;
1209 
1210  PtrArg = getUnderlyingObject(PtrArg);
1211  if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
1212  if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1213  continue;
1214  AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1215  // If the amount of stack memory is excessive we will not be able
1216  // to get rid of the scratch anyway, bail out.
1217  if (AllocaSize > ArgAllocaCutoff) {
1218  AllocaSize = 0;
1219  break;
1220  }
1221  }
1222  }
1223  if (AllocaSize)
1224  return ArgAllocaCost;
1225  return 0;
1226 }
1227 
1228 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1229  TTI::UnrollingPreferences &UP,
1230  OptimizationRemarkEmitter *ORE) {
1231  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1232 }
1233 
1234 void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1235  TTI::PeelingPreferences &PP) {
1236  CommonTTI.getPeelingPreferences(L, SE, PP);
1237 }
1238 
1239 int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1240  return ST->hasFullRate64Ops()
1241  ? getFullRateInstrCost()
1242  : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1243  : getQuarterRateInstrCost(CostKind);
1244 }
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:240
UseLegacyDA
static cl::opt< bool > UseLegacyDA("amdgpu-use-legacy-divergence-analysis", cl::desc("Enable legacy divergence analysis for AMDGPU"), cl::init(false), cl::Hidden)
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::TargetTransformInfo::UnrollingPreferences::BEInsns
unsigned BEInsns
Definition: TargetTransformInfo.h:478
llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:29
Attrs
Function Attrs
Definition: README_ALTIVEC.txt:215
llvm::Type::isSized
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:274
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:38
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:485
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:211
llvm::Loop::isLoopInvariant
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:64
llvm::BasicTTIImplBase< GCNTTIImpl >::getOperandsScalarizationOverhead
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:707
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:453
llvm
---------------------— PointerInfo ------------------------------------—
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:66
llvm::GCNTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: AMDGPUTargetTransformInfo.cpp:1228
llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1561
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:112
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:633
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:435
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1379
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:469
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
llvm::TargetOptions
Definition: TargetOptions.h:124
llvm::Function
Definition: Function.h:61
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:122
llvm::AMDGPUTargetLowering::isFNegFree
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
Definition: AMDGPUISelLowering.cpp:894
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
llvm::CallBase::setCalledFunction
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1429
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:655
llvm::BasicTTIImplBase< GCNTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1055
llvm::PatternMatch::m_FPOne
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:848
llvm::GCNTTIImpl::isSourceOfDivergence
bool isSourceOfDivergence(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:958
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::CallBase::isInlineAsm
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition: InstrTypes.h:1458
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:750
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:734
llvm::IRBuilder<>
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:150
llvm::PointerType::getAddressSpace
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:687
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:461
llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition: TargetTransformInfo.h:907
llvm::IntrinsicCostAttributes::getInst
const IntrinsicInst * getInst() const
Definition: TargetTransformInfo.h:149
ValueTracking.h
llvm::AtomicOrdering::SequentiallyConsistent
@ SequentiallyConsistent
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:189
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:214
llvm::GCNTTIImpl::getMemcpyLoopLoweringType
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign) const
Definition: AMDGPUTargetTransformInfo.cpp:413
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
llvm::GCNTTIImpl::isLegalToVectorizeMemChain
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:381
llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:36
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:481
dependsOnLocalPhi
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
Definition: AMDGPUTargetTransformInfo.cpp:79
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::GCNTTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: AMDGPUTargetTransformInfo.cpp:476
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:535
llvm::BasicTTIImplBase< GCNTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:833
llvm::SITargetLowering::getRegForInlineAsmConstraint
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
Definition: SIISelLowering.cpp:11585
llvm::Optional
Definition: APInt.h:33
llvm::AMDGPU::SIModeRegisterDefaults
Definition: AMDGPUBaseInfo.h:911
llvm::GCNTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: AMDGPUTargetTransformInfo.cpp:1159
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
llvm::VectorType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:422
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:398
llvm::TargetLowering::ComputeConstraintToUse
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
Definition: TargetLowering.cpp:5010
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:116
llvm::AMDGPU::SIModeRegisterDefaults::isInlineCompatible
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Definition: AMDGPUBaseInfo.h:996
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:161
llvm::AMDGPU::IsaInfo::getMaxNumVGPRs
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU)
Definition: AMDGPUBaseInfo.cpp:745
llvm::GCNSubtarget::hasPackedFP32Ops
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:839
llvm::AMDGPU::isIntrinsicSourceOfDivergence
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
Definition: AMDGPUBaseInfo.cpp:1998
llvm::GCNTTIImpl::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
Definition: AMDGPUTargetTransformInfo.cpp:865
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition: TargetTransformInfo.h:870
llvm::Type::getInt8Ty
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:201
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1567
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:203
llvm::GCNSubtarget::getRegisterInfo
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:225
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:860
F
#define F(x, y, z)
Definition: MD5.cpp:56
KnownBits.h
llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(Optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1160
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::IntrinsicCostAttributes::getFlags
FastMathFlags getFlags() const
Definition: TargetTransformInfo.h:151
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:130
llvm::MVT::SimpleValueType
SimpleValueType
Definition: MachineValueType.h:33
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
Context
LLVMContext & Context
Definition: NVVMIntrRange.cpp:66
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::IntrinsicCostAttributes::getArgTypes
const SmallVectorImpl< Type * > & getArgTypes() const
Definition: TargetTransformInfo.h:154
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::AllocaInst::isStaticAlloca
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Definition: Instructions.cpp:1385
llvm::LoopBase::getSubLoops
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
Definition: LoopInfo.h:143
llvm::MinAlign
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition: MathExtras.h:672
llvm::AMDGPUAS::BUFFER_FAT_POINTER
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
Definition: AMDGPU.h:359
llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:29
llvm::GCNSubtarget::hasFastFMAF32
bool hasFastFMAF32() const
Definition: GCNSubtarget.h:304
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::AllocaInst::getAllocatedType
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:113
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:357
llvm::BasicTTIImplBase< GCNTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1108
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:859
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
llvm::GCNTTIImpl::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:400
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:56
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2119
llvm::GCNTTIImpl::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const
Definition: AMDGPUTargetTransformInfo.cpp:333
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:46
llvm::ISD::SRA
@ SRA
Definition: ISDOpcodes.h:658
llvm::GCNTTIImpl::getMemcpyLoopResidualLoweringType
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign) const
Definition: AMDGPUTargetTransformInfo.cpp:441
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:237
llvm::TargetTransformInfo::UnrollingPreferences::MaxIterationsCountToAnalyze
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
Definition: TargetTransformInfo.h:507
llvm::GCNSubtarget::getMaxPrivateElementSize
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:277
llvm::GCNTTIImpl::getLoadVectorFactor
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:345
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
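For example (an illustrative helper, not part of this file), loop heuristics commonly walk the blocks returned by this accessor; a sketch that counts the exiting blocks of a loop:

#include "llvm/Analysis/LoopInfo.h"

// Counts how many blocks of L can branch out of the loop, combining
// getBlocks() with isLoopExiting().
static unsigned countExitingBlocks(const llvm::Loop &L) {
  unsigned N = 0;
  for (const llvm::BasicBlock *BB : L.getBlocks())
    if (L.isLoopExiting(BB))
      ++N;
  return N;
}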
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:153
Options
const char LLVMTargetMachineRef LLVMPassBuilderOptionsRef Options
Definition: PassBuilderBindings.cpp:48
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
Definition: ValueTracking.cpp:4356
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:28
llvm::GCNTTIImpl::isAlwaysUniform
bool isAlwaysUniform(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:996
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:632
llvm::AMDGPUSubtarget::has16BitInsts
bool has16BitInsts() const
Definition: AMDGPUSubtarget.h:133
PatternMatch.h
llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition: TargetTransformInfo.h:907
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:648
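A small sketch (hypothetical helper, not from this file) of building a fixed-width vector type, e.g. the <4 x i16> shape that corresponds to MVT::v4i16 in the cost model:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"

// Builds the IR type <4 x i16> in the given context.
static llvm::FixedVectorType *getV4I16(llvm::LLVMContext &Ctx) {
  return llvm::FixedVectorType::get(llvm::Type::getInt16Ty(Ctx), 4);
}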
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::AMDGPUTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:262
llvm::LinearPolySize< TypeSize >::getFixed
static TypeSize getFixed(ScalarTy MinVal)
Definition: TypeSize.h:284
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:91
llvm::lltok::Kind
Kind
Definition: LLToken.h:18
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:118
LoopInfo.h
InlineMaxBB
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
AMDGPUTargetTransformInfo.h
llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:377
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:861
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::AtomicOrdering
AtomicOrdering
Atomic ordering for LLVM's memory model.
Definition: AtomicOrdering.h:56
llvm::cl::opt
Definition: CommandLine.h:1434
llvm::GCNTTIImpl::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
Definition: AMDGPUTargetTransformInfo.cpp:485
AMDGPUMCTargetDesc.h
llvm::TargetRegisterInfo::getRegClass
const TargetRegisterClass * getRegClass(unsigned i) const
Returns the register class associated with the enumeration value.
Definition: TargetRegisterInfo.h:739
llvm::GCNTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:516
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
llvm::KnownBits::countMinLeadingOnes
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition: KnownBits.h:241
uint64_t
llvm::AMDGPUTTIImpl::AMDGPUTTIImpl
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:98
llvm::Instruction::hasAllowContract
bool hasAllowContract() const
Determine whether the allow-contract flag is set.
Definition: Instruction.cpp:270
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:572
const
aarch64 promote const
Definition: AArch64PromoteConstant.cpp:232
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::TargetLoweringBase::isOperationLegalOrPromote
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
Definition: TargetLowering.h:1128
llvm::TargetLowering::ParseConstraints
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
Definition: TargetLowering.cpp:4659
llvm::AMDGPU::getIntegerAttribute
int getIntegerAttribute(const Function &F, StringRef Name, int Default)
Definition: AMDGPUBaseInfo.cpp:852
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:428
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:886
llvm::GetElementPtrInst
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:928
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
llvm::AMDGPUSubtarget::hasMadMacF32Insts
bool hasMadMacF32Insts() const
Definition: AMDGPUSubtarget.h:141
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:632
llvm::BasicTTIImplBase< AMDGPUTTIImpl >
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:354
intrinsicHasPackedVectorBenefit
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
Definition: AMDGPUTargetTransformInfo.cpp:720
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:213
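A minimal sketch (hypothetical helper; it assumes V is an integer-typed value) of the KnownBits query in the form documented here:

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"

// Computes the known bits of an integer value V and reports whether its
// sign bit is provably set (at least one known leading one).
static bool hasKnownSignBit(const llvm::Value *V, const llvm::DataLayout &DL) {
  llvm::KnownBits Known(V->getType()->getScalarSizeInBits());
  llvm::computeKnownBits(V, Known, DL);
  return Known.countMinLeadingOnes() > 0;
}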
llvm::TargetMachine::Options
TargetOptions Options
Definition: TargetMachine.h:120
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:79
llvm::LoopBase::getLoopDepth
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:96
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::GCNSubtarget::hasUnalignedScratchAccess
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:516
Mode
SI Whole Quad Mode
Definition: SIWholeQuadMode.cpp:262
llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:138
UnrollMaxBlockToAnalyze
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
llvm::MVT
Machine Value Type.
Definition: MachineValueType.h:31
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::MDNode
Metadata node.
Definition: Metadata.h:901
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:878
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory (GDS).
Definition: AMDGPU.h:351
llvm::GCNTTIImpl::getHardwareNumberOfRegisters
unsigned getHardwareNumberOfRegisters(bool Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:301
UnrollThresholdPrivate
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
llvm::GCNTargetMachine
Definition: AMDGPUTargetMachine.h:72
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:262
llvm::AMDGPUSubtarget::hasVOP3PInsts
bool hasVOP3PInsts() const
Definition: AMDGPUSubtarget.h:153
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1554
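For instance (an illustrative helper, not part of this file), the range form reads:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

// Returns true if BB contains at least one load, without spelling out
// begin()/end() iterators.
static bool blockHasLoad(const llvm::BasicBlock &BB) {
  return llvm::any_of(BB, [](const llvm::Instruction &I) {
    return llvm::isa<llvm::LoadInst>(I);
  });
}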
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:47
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:179
llvm::AMDGPUTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: AMDGPUTargetTransformInfo.cpp:104
llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:353
llvm::BasicTTIImplBase< GCNTTIImpl >::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:671
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:215
llvm::GCNTTIImpl::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:394
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:353
llvm::GCNTTIImpl::isInlineAsmSourceOfDivergence
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
Definition: AMDGPUTargetTransformInfo.cpp:908
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:776
UnrollThresholdIf
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1762
llvm::AMDGPU::isGraphics
bool isGraphics(CallingConv::ID cc)
Definition: AMDGPUBaseInfo.cpp:1373
llvm::findOptionMDForLoop
MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition: LoopInfo.cpp:1045
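A short sketch (hypothetical helper; the metadata name is the standard unroll hint, used here only as an example of a loop option string):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Metadata.h"

// Returns true if the loop carries an explicit llvm.loop.unroll.disable hint.
static bool hasUnrollDisableHint(const llvm::Loop *L) {
  return llvm::findOptionMDForLoop(L, "llvm.loop.unroll.disable") != nullptr;
}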
llvm::GCNTTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:321
llvm::ConstantInt::getSExtValue
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:148
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:559
llvm::AMDGPU::HSAMD::Kernel::Arg::Key::IsVolatile
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
Definition: AMDGPUMetadata.h:194
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:273
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:127
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:175
llvm::ISD::FMUL
@ FMUL
Definition: ISDOpcodes.h:379
llvm::CallBase::setArgOperand
void setArgOperand(unsigned i, Value *v)
Definition: InstrTypes.h:1343
UnrollThresholdLocal
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
llvm::ConstantInt::getFalse
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:855
llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:634
llvm::GCNTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
Definition: AMDGPUTargetTransformInfo.cpp:847
Callee
Definition: AMDGPULibCalls.cpp:206
llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPU.h:349
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:339
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:165
ArgAllocaCost
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:355
llvm::GCNTTIImpl::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
Definition: AMDGPUTargetTransformInfo.cpp:337
llvm::GCNTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:816
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:204
llvm::GCNTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(bool Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:307
llvm::AMDGPU::isArgPassedInSGPR
bool isArgPassedInSGPR(const Argument *A)
Definition: AMDGPUBaseInfo.cpp:1809
llvm::ConstantInt::getTrue
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:848
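A small sketch (hypothetical helper, not from this file) pairing this with ConstantInt::getFalse above to materialize an i1 constant:

#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"

// Returns the i1 constant 'true' or 'false' for the given context.
static llvm::ConstantInt *getBoolConstant(llvm::LLVMContext &Ctx, bool Val) {
  return Val ? llvm::ConstantInt::getTrue(Ctx) : llvm::ConstantInt::getFalse(Ctx);
}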
std
Definition: BitVector.h:838
llvm::KnownBits
Definition: KnownBits.h:23
llvm::SITargetLowering::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Definition: SIISelLowering.cpp:12388
llvm::TargetRegisterInfo::getRegSizeInBits
unsigned getRegSizeInBits(const TargetRegisterClass &RC) const
Return the size in bits of a register from class RC.
Definition: TargetRegisterInfo.h:276
llvm::GCNSubtarget::hasFullRate64Ops
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:312
llvm::GCNTTIImpl::collectFlatAddressOperands
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: AMDGPUTargetTransformInfo.cpp:1046
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:321
llvm::GCNTTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:366
llvm::ExtractValueInst
This instruction extracts a struct member or array element value from an aggregate value.
Definition: Instructions.h:2372
llvm::GCNTTIImpl::getStoreVectorFactor
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:356
llvm::TypeSize
Definition: TypeSize.h:417
llvm::TargetLoweringBase::isOperationExpand
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
Definition: TargetLowering.h:1213
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::LinearPolySize< TypeSize >::getScalable
static TypeSize getScalable(ScalarTy MinVal)
Definition: TypeSize.h:287
llvm::GCNTTIImpl::adjustInliningThreshold
unsigned adjustInliningThreshold(const CallBase *CB) const
Definition: AMDGPUTargetTransformInfo.cpp:1198
llvm::SDValue
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
Definition: SelectionDAGNodes.h:138
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition: TargetTransformInfo.h:907
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::InstructionCost::getInvalid
static InstructionCost getInvalid(CostType Val=0)
Definition: InstructionCost.h:73
llvm::GCNTTIImpl::rewriteIntrinsicWithAddressSpace
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: AMDGPUTargetTransformInfo.cpp:1063
llvm::InlineAsm::isOutput
@ isOutput
Definition: InlineAsm.h:95
llvm::FPOpFusion::Fast
@ Fast
Definition: TargetOptions.h:37
llvm::BasicTTIImplBase< GCNTTIImpl >::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
Definition: BasicTTIImpl.h:2129
llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
llvm::ISD::FSUB
@ FSUB
Definition: ISDOpcodes.h:378
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:148
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:657
llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition: TargetTransformInfo.h:907
llvm::ISD::FREM
@ FREM
Definition: ISDOpcodes.h:381
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:54
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1338
llvm::BasicTTIImplBase< GCNTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: BasicTTIImpl.h:866
llvm::BasicTTIImplBase< GCNTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1332
llvm::TargetTransformInfo::UnrollingPreferences::Threshold
unsigned Threshold
The cost threshold for the unrolled loop.
Definition: TargetTransformInfo.h:436
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::ISD::SRL
@ SRL
Definition: ISDOpcodes.h:659
llvm::AMDGPU::IsaInfo::getWavesPerEUForWorkGroup
unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize)
Definition: AMDGPUBaseInfo.cpp:571
llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
llvm::GCNTTIImpl::GCNTTIImpl
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:287
llvm::PHINode
Definition: Instructions.h:2625
Threshold
static cl::opt< unsigned > Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden)
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:43
llvm::MemIntrinsicInfo
Information about a load/store intrinsic defined by the target.
Definition: TargetTransformInfo.h:70
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
Definition: InstrTypes.h:1161
llvm::Type::getInt16Ty
static IntegerType * getInt16Ty(LLVMContext &C)
Definition: Type.cpp:202
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:401
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::DataLayout::getPointerSizeInBits
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits. FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:404
llvm::AMDGPU::HSAMD::Kernel::CodeProps::Key::NumVGPRs
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
Definition: AMDGPUMetadata.h:255
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:45
llvm::LoopBase::isLoopExiting
bool isLoopExiting(const BlockT *BB) const
True if the terminator in the block can branch to another block that is outside of the current loop.
Definition: LoopInfo.h:225
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1475
llvm::BasicTTIImplBase< GCNTTIImpl >::getTypeBasedIntrinsicInstrCost
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
Definition: BasicTTIImpl.h:1513
llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:866
BB
Definition: README.txt:39
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:172
llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:350
llvm::GCNTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: AMDGPUTargetTransformInfo.cpp:1135
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::GCNTTIImpl::useGPUDivergenceAnalysis
bool useGPUDivergenceAnalysis() const
Definition: AMDGPUTargetTransformInfo.cpp:952
llvm::AllocaInst
An instruction to allocate memory on the stack.
Definition: Instructions.h:62
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:414
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1409
ArgAllocaCutoff
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
llvm::IntrinsicCostAttributes::isTypeBasedOnly
bool isTypeBasedOnly() const
Definition: TargetTransformInfo.h:156
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3060
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:89
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:55
UnrollRuntimeLocal
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
llvm::ExtractValueInst::getIndices
ArrayRef< unsigned > getIndices() const
Definition: Instructions.h:2437
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:212
llvm::GCNTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:736
llvm::IntrinsicCostAttributes::getArgs
const SmallVectorImpl< const Value * > & getArgs() const
Definition: TargetTransformInfo.h:153
AMDGPUTargetMachine.h
llvm::CallBase::args
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1319
llvm::GCNTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index)
Definition: AMDGPUTargetTransformInfo.cpp:879
llvm::TargetLowering::AsmOperandInfoVector
std::vector< AsmOperandInfo > AsmOperandInfoVector
Definition: TargetLowering.h:4214
llvm::GCNSubtarget::hasUsableDivScaleConditionOutput
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:410
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:498
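As an illustration (hypothetical helper; the cutoff comparison mirrors the idea behind amdgpu-inline-arg-alloca-cutoff rather than quoting this file's code):

#include <cstdint>
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"

// Reports whether a static alloca's in-memory size, in bytes, stays under
// a given byte threshold.
static bool allocaFitsCutoff(const llvm::AllocaInst &AI,
                             const llvm::DataLayout &DL, uint64_t CutoffBytes) {
  if (!AI.isStaticAlloca())
    return false;
  return DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize() <= CutoffBytes;
}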
llvm::ISD::FDIV
@ FDIV
Definition: ISDOpcodes.h:380
llvm::GCNTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:1234