LLVM 15.0.0git
AMDGPUTargetTransformInfo.cpp
1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/Analysis/LoopInfo.h"
21 #include "llvm/Analysis/ValueTracking.h"
22 #include "llvm/IR/IRBuilder.h"
23 #include "llvm/IR/IntrinsicsAMDGPU.h"
24 #include "llvm/IR/PatternMatch.h"
25 #include "llvm/Support/KnownBits.h"
26 
27 using namespace llvm;
28 
29 #define DEBUG_TYPE "AMDGPUtti"
30 
31 static cl::opt<unsigned> UnrollThresholdPrivate(
32  "amdgpu-unroll-threshold-private",
33  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
34  cl::init(2700), cl::Hidden);
35 
36 static cl::opt<unsigned> UnrollThresholdLocal(
37  "amdgpu-unroll-threshold-local",
38  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
39  cl::init(1000), cl::Hidden);
40 
41 static cl::opt<unsigned> UnrollThresholdIf(
42  "amdgpu-unroll-threshold-if",
43  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
44  cl::init(200), cl::Hidden);
45 
46 static cl::opt<bool> UnrollRuntimeLocal(
47  "amdgpu-unroll-runtime-local",
48  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
49  cl::init(true), cl::Hidden);
50 
51 static cl::opt<bool> UseLegacyDA(
52  "amdgpu-use-legacy-divergence-analysis",
53  cl::desc("Enable legacy divergence analysis for AMDGPU"),
54  cl::init(false), cl::Hidden);
55 
56 static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
57  "amdgpu-unroll-max-block-to-analyze",
58  cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
59  cl::init(32), cl::Hidden);
60 
61 static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
62  cl::Hidden, cl::init(4000),
63  cl::desc("Cost of alloca argument"));
64 
65 // If the amount of scratch memory to eliminate exceeds our ability to allocate
66 // it into registers, we gain nothing by aggressively inlining functions for
67 // that heuristic.
68 static cl::opt<unsigned>
69  ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
70  cl::init(256),
71  cl::desc("Maximum alloca size to use for inline cost"));
72 
73 // Inliner constraint to achieve reasonable compilation time.
74 static cl::opt<size_t> InlineMaxBB(
75  "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
76  cl::desc("Maximum number of BBs allowed in a function after inlining"
77  " (compile time constraint)"));
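These are hidden cl::opt flags, so they do not appear in --help, but with a typical LLVM build they can still be set when running the backend tools directly, or forwarded from the clang driver with -mllvm. A minimal sketch using the flags declared above (the values are arbitrary, for illustration only):

  llc -mtriple=amdgcn-amd-amdhsa -amdgpu-unroll-threshold-private=3000 input.ll
  clang -mllvm -amdgpu-inline-max-bb=40 ...   # when compiling for an AMDGPU target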
78 
79 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
80  unsigned Depth = 0) {
81  const Instruction *I = dyn_cast<Instruction>(Cond);
82  if (!I)
83  return false;
84 
85  for (const Value *V : I->operand_values()) {
86  if (!L->contains(I))
87  continue;
88  if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
89  if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
90  return SubLoop->contains(PHI); }))
91  return true;
92  } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
93  return true;
94  }
95  return false;
96 }
97 
98 AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
99  : BaseT(TM, F.getParent()->getDataLayout()),
100  TargetTriple(TM->getTargetTriple()),
101  ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
102  TLI(ST->getTargetLowering()) {}
103 
104 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
105  TTI::UnrollingPreferences &UP,
106  OptimizationRemarkEmitter *ORE) {
107  const Function &F = *L->getHeader()->getParent();
108  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
109  UP.MaxCount = std::numeric_limits<unsigned>::max();
110  UP.Partial = true;
111 
112  // A conditional branch in a loop back edge needs 3 additional exec
113  // manipulations on average.
114  UP.BEInsns += 3;
115 
116  // TODO: Do we want runtime unrolling?
117 
118  // Maximum alloca size that can fit in registers. Reserve 16 registers.
119  const unsigned MaxAlloca = (256 - 16) * 4;
120  unsigned ThresholdPrivate = UnrollThresholdPrivate;
121  unsigned ThresholdLocal = UnrollThresholdLocal;
122 
123  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
124  // provided threshold value as the default for Threshold
125  if (MDNode *LoopUnrollThreshold =
126  findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
127  if (LoopUnrollThreshold->getNumOperands() == 2) {
128  ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
129  LoopUnrollThreshold->getOperand(1));
130  if (MetaThresholdValue) {
131  // We will also use the supplied value for PartialThreshold for now.
132  // We may introduce additional metadata if it becomes necessary in the
133  // future.
134  UP.Threshold = MetaThresholdValue->getSExtValue();
135  UP.PartialThreshold = UP.Threshold;
136  ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
137  ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
138  }
139  }
140  }
141 
142  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
143  for (const BasicBlock *BB : L->getBlocks()) {
144  const DataLayout &DL = BB->getModule()->getDataLayout();
145  unsigned LocalGEPsSeen = 0;
146 
147  if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
148  return SubLoop->contains(BB); }))
149  continue; // Block belongs to an inner loop.
150 
151  for (const Instruction &I : *BB) {
152  // Unroll a loop which contains an "if" statement whose condition is
153  // defined by a PHI belonging to the loop. This may help to eliminate the
154  // if region and potentially even the PHI itself, saving on both divergence
155  // and registers used for the PHI.
156  // Add a small bonus for each such "if" statement.
157  if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
158  if (UP.Threshold < MaxBoost && Br->isConditional()) {
159  BasicBlock *Succ0 = Br->getSuccessor(0);
160  BasicBlock *Succ1 = Br->getSuccessor(1);
161  if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
162  (L->contains(Succ1) && L->isLoopExiting(Succ1)))
163  continue;
164  if (dependsOnLocalPhi(L, Br->getCondition())) {
165  UP.Threshold += UnrollThresholdIf;
166  LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
167  << " for loop:\n"
168  << *L << " due to " << *Br << '\n');
169  if (UP.Threshold >= MaxBoost)
170  return;
171  }
172  }
173  continue;
174  }
175 
176  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
177  if (!GEP)
178  continue;
179 
180  unsigned AS = GEP->getAddressSpace();
181  unsigned Threshold = 0;
182  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
183  Threshold = ThresholdPrivate;
184  else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
185  Threshold = ThresholdLocal;
186  else
187  continue;
188 
189  if (UP.Threshold >= Threshold)
190  continue;
191 
192  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
193  const Value *Ptr = GEP->getPointerOperand();
194  const AllocaInst *Alloca =
195  dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
196  if (!Alloca || !Alloca->isStaticAlloca())
197  continue;
198  Type *Ty = Alloca->getAllocatedType();
199  unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
200  if (AllocaSize > MaxAlloca)
201  continue;
202  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
203  AS == AMDGPUAS::REGION_ADDRESS) {
204  LocalGEPsSeen++;
205  // Inhibit unroll for local memory if we have seen addressing not to
206  // a variable; most likely we will be unable to combine it.
207  // Do not unroll too-deep inner loops for local memory, to give a chance
208  // to unroll an outer loop for a more important reason.
209  if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
210  (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
211  !isa<Argument>(GEP->getPointerOperand())))
212  continue;
213  LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
214  << *L << " due to LDS use.\n");
215  UP.Runtime = UnrollRuntimeLocal;
216  }
217 
218  // Check if GEP depends on a value defined by this loop itself.
219  bool HasLoopDef = false;
220  for (const Value *Op : GEP->operands()) {
221  const Instruction *Inst = dyn_cast<Instruction>(Op);
222  if (!Inst || L->isLoopInvariant(Op))
223  continue;
224 
225  if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
226  return SubLoop->contains(Inst); }))
227  continue;
228  HasLoopDef = true;
229  break;
230  }
231  if (!HasLoopDef)
232  continue;
233 
234  // We want to do whatever we can to limit the number of alloca
235  // instructions that make it through to the code generator. allocas
236  // require us to use indirect addressing, which is slow and prone to
237  // compiler bugs. If this loop does an address calculation on an
238  // alloca ptr, then we want to use a higher than normal loop unroll
239  // threshold. This will give SROA a better chance to eliminate these
240  // allocas.
241  //
242  // We also want to have more unrolling for local memory to let ds
243  // instructions with different offsets combine.
244  //
245  // Don't use the maximum allowed value here as it will make some
246  // programs way too big.
247  UP.Threshold = Threshold;
248  LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
249  << " for loop:\n"
250  << *L << " due to " << *GEP << '\n');
251  if (UP.Threshold >= MaxBoost)
252  return;
253  }
254 
255  // If we got a GEP in a small BB from an inner loop then increase the max
256  // trip count to analyze, for a better cost estimation in unroll.
257  if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
258  UP.MaxIterationsCountToAnalyze = 32;
259  }
260 }
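The amdgpu.loop.unroll.threshold handling above expects the loop ID to carry an option node with exactly two operands: the name string and an integer threshold. A minimal sketch, not part of this file, of how a front end or a custom pass could attach such metadata; `Latch` stands for the loop's latch terminator and `Ctx` for its LLVMContext, both placeholder names:

  Metadata *ThresholdOps[] = {
      MDString::get(Ctx, "amdgpu.loop.unroll.threshold"),
      ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 500))};
  MDNode *Threshold = MDNode::get(Ctx, ThresholdOps);
  // Loop IDs are distinct nodes whose first operand refers back to the node itself.
  Metadata *LoopIDOps[] = {nullptr, Threshold};
  MDNode *LoopID = MDNode::getDistinct(Ctx, LoopIDOps);
  LoopID->replaceOperandWith(0, LoopID);
  Latch->setMetadata(LLVMContext::MD_loop, LoopID);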
261 
262 void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
263  TTI::PeelingPreferences &PP) {
264  BaseT::getPeelingPreferences(L, SE, PP);
265 }
266 
267 const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
268  // Codegen control options which don't matter.
269  AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
270  AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
271  AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
272  AMDGPU::FeatureUnalignedAccessMode,
273 
274  AMDGPU::FeatureAutoWaitcntBeforeBarrier,
275 
276  // Property of the kernel/environment which can't actually differ.
277  AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
278  AMDGPU::FeatureTrapHandler,
279 
280  // The default assumption needs to be ecc is enabled, but no directly
281  // exposed operations depend on it, so it can be safely inlined.
282  AMDGPU::FeatureSRAMECC,
283 
284  // Perf-tuning features
285  AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
286 
287 GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
288  : BaseT(TM, F.getParent()->getDataLayout()),
289  ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
290  TLI(ST->getTargetLowering()), CommonTTI(TM, F),
291  IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
292  AMDGPU::SIModeRegisterDefaults Mode(F);
293  HasFP32Denormals = Mode.allFP32Denormals();
294  HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
295 }
296 
297 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
298  // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
299  // registers. See getRegisterClassForType for the implementation.
300  // In this case vector registers are not vector in terms of
301  // VGPRs, but those which can hold multiple values.
302 
303  // This is really the number of registers to fill when vectorizing /
304  // interleaving loops, so we lie to avoid trying to use all registers.
305  return 4;
306 }
307 
308 TypeSize
309 GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
310  switch (K) {
311  case TargetTransformInfo::RGK_Scalar:
312  return TypeSize::getFixed(32);
313  case TargetTransformInfo::RGK_FixedWidthVector:
314  return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
315  case TargetTransformInfo::RGK_ScalableVector:
316  return TypeSize::getScalable(0);
317  }
318  llvm_unreachable("Unsupported register kind");
319 }
320 
321 unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
322  return 32;
323 }
324 
325 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
326  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
327  return 32 * 4 / ElemWidth;
328  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
329  : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
330  : 1;
331 }
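Reading getMaximumVF above: for loads and stores the result is 128 bits divided by the element width (8 for 16-bit and 4 for 32-bit elements), while for other opcodes it is 2 only when the element width matches an available packed instruction form (16-bit ops, or 32-bit with packed FP32) and 1 otherwise.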
332 
333 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
334  unsigned ChainSizeInBytes,
335  VectorType *VecTy) const {
336  unsigned VecRegBitWidth = VF * LoadSize;
337  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
338  // TODO: Support element-size less than 32bit?
339  return 128 / LoadSize;
340 
341  return VF;
342 }
343 
344 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
345  unsigned ChainSizeInBytes,
346  VectorType *VecTy) const {
347  unsigned VecRegBitWidth = VF * StoreSize;
348  if (VecRegBitWidth > 128)
349  return 128 / StoreSize;
350 
351  return VF;
352 }
353 
354 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
355  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
356  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
357  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
358  AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
359  return 512;
360  }
361 
362  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
363  return 8 * ST->getMaxPrivateElementSize();
364 
365  // Common to flat, global, local and region. Assume for unknown addrspace.
366  return 128;
367 }
368 
369 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
370  Align Alignment,
371  unsigned AddrSpace) const {
372  // We allow vectorization of flat stores, even though we may need to decompose
373  // them later if they may access private memory. We don't have enough context
374  // here, and legalization can handle it.
375  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
376  return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
377  ChainSizeInBytes <= ST->getMaxPrivateElementSize();
378  }
379  return true;
380 }
381 
382 bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
383  Align Alignment,
384  unsigned AddrSpace) const {
385  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
386 }
387 
388 bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
389  Align Alignment,
390  unsigned AddrSpace) const {
391  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
392 }
393 
394 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
395 // iteration. Should we report a larger size and let it legalize?
396 //
397 // FIXME: Should we use narrower types for local/region, or account for when
398 // unaligned access is legal?
399 //
400 // FIXME: This could use fine tuning and microbenchmarks.
401 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
402  LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
403  unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
404  Optional<uint32_t> AtomicElementSize) const {
405 
406  if (AtomicElementSize)
407  return Type::getIntNTy(Context, *AtomicElementSize * 8);
408 
409  unsigned MinAlign = std::min(SrcAlign, DestAlign);
410 
411  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
412  // hardware into byte accesses. If you assume all alignments are equally
413  // probable, it's more efficient on average to use short accesses for this
414  // case.
415  if (MinAlign == 2)
416  return Type::getInt16Ty(Context);
417 
418  // Not all subtargets have 128-bit DS instructions, and we currently don't
419  // form them by default.
420  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
421  SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
422  DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
423  DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
424  return FixedVectorType::get(Type::getInt32Ty(Context), 2);
425  }
426 
427  // Global memory works best with 16-byte accesses. Private memory will also
428  // hit this, although they'll be decomposed.
429  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
430 }
431 
432 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
433  SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
434  unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
435  unsigned SrcAlign, unsigned DestAlign,
436  Optional<uint32_t> AtomicCpySize) const {
437  assert(RemainingBytes < 16);
438 
439  if (AtomicCpySize)
440  BaseT::getMemcpyLoopResidualLoweringType(
441  OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
442  DestAlign, AtomicCpySize);
443 
444  unsigned MinAlign = std::min(SrcAlign, DestAlign);
445 
446  if (MinAlign != 2) {
447  Type *I64Ty = Type::getInt64Ty(Context);
448  while (RemainingBytes >= 8) {
449  OpsOut.push_back(I64Ty);
450  RemainingBytes -= 8;
451  }
452 
453  Type *I32Ty = Type::getInt32Ty(Context);
454  while (RemainingBytes >= 4) {
455  OpsOut.push_back(I32Ty);
456  RemainingBytes -= 4;
457  }
458  }
459 
460  Type *I16Ty = Type::getInt16Ty(Context);
461  while (RemainingBytes >= 2) {
462  OpsOut.push_back(I16Ty);
463  RemainingBytes -= 2;
464  }
465 
466  Type *I8Ty = Type::getInt8Ty(Context);
467  while (RemainingBytes) {
468  OpsOut.push_back(I8Ty);
469  --RemainingBytes;
470  }
471 }
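As a concrete reading of the residual lowering above: with RemainingBytes = 15 and both alignments at least 4, OpsOut receives i64, i32, i16, i8 (8 + 4 + 2 + 1 bytes); when MinAlign == 2 the 64-bit and 32-bit steps are skipped, so the same residue is covered by seven i16 pieces plus one i8.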
472 
473 unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
474  // Disable unrolling if the loop is not vectorized.
475  // TODO: Enable this again.
476  if (VF == 1)
477  return 1;
478 
479  return 8;
480 }
481 
482 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
483  MemIntrinsicInfo &Info) const {
484  switch (Inst->getIntrinsicID()) {
485  case Intrinsic::amdgcn_atomic_inc:
486  case Intrinsic::amdgcn_atomic_dec:
487  case Intrinsic::amdgcn_ds_ordered_add:
488  case Intrinsic::amdgcn_ds_ordered_swap:
489  case Intrinsic::amdgcn_ds_fadd:
490  case Intrinsic::amdgcn_ds_fmin:
491  case Intrinsic::amdgcn_ds_fmax: {
492  auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
493  auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
494  if (!Ordering || !Volatile)
495  return false; // Invalid.
496 
497  unsigned OrderingVal = Ordering->getZExtValue();
498  if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
499  return false;
500 
501  Info.PtrVal = Inst->getArgOperand(0);
502  Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
503  Info.ReadMem = true;
504  Info.WriteMem = true;
505  Info.IsVolatile = !Volatile->isZero();
506  return true;
507  }
508  default:
509  return false;
510  }
511 }
512 
513 InstructionCost GCNTTIImpl::getArithmeticInstrCost(
514  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
515  TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
516  TTI::OperandValueProperties Opd1PropInfo,
517  TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
518  const Instruction *CxtI) {
519 
520  // Legalize the type.
521  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
522  int ISD = TLI->InstructionOpcodeToISD(Opcode);
523 
524  // Because we don't have any legal vector operations, but the legal types, we
525  // need to account for split vectors.
526  unsigned NElts = LT.second.isVector() ?
527  LT.second.getVectorNumElements() : 1;
528 
529  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
530 
531  switch (ISD) {
532  case ISD::SHL:
533  case ISD::SRL:
534  case ISD::SRA:
535  if (SLT == MVT::i64)
536  return get64BitInstrCost(CostKind) * LT.first * NElts;
537 
538  if (ST->has16BitInsts() && SLT == MVT::i16)
539  NElts = (NElts + 1) / 2;
540 
541  // i32
542  return getFullRateInstrCost() * LT.first * NElts;
543  case ISD::ADD:
544  case ISD::SUB:
545  case ISD::AND:
546  case ISD::OR:
547  case ISD::XOR:
548  if (SLT == MVT::i64) {
549  // and, or and xor are typically split into 2 VALU instructions.
550  return 2 * getFullRateInstrCost() * LT.first * NElts;
551  }
552 
553  if (ST->has16BitInsts() && SLT == MVT::i16)
554  NElts = (NElts + 1) / 2;
555 
556  return LT.first * NElts * getFullRateInstrCost();
557  case ISD::MUL: {
558  const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
559  if (SLT == MVT::i64) {
560  const int FullRateCost = getFullRateInstrCost();
561  return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
562  }
563 
564  if (ST->has16BitInsts() && SLT == MVT::i16)
565  NElts = (NElts + 1) / 2;
566 
567  // i32
568  return QuarterRateCost * NElts * LT.first;
569  }
570  case ISD::FMUL:
571  // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
572  // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
573  // fused operation.
574  if (CxtI && CxtI->hasOneUse())
575  if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
576  const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
577  if (OPC == ISD::FADD || OPC == ISD::FSUB) {
578  if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
579  return TargetTransformInfo::TCC_Free;
580  if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
581  return TargetTransformInfo::TCC_Free;
582 
583  // Estimate all types may be fused with contract/unsafe flags
584  const TargetOptions &Options = TLI->getTargetMachine().Options;
585  if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
586  Options.UnsafeFPMath ||
587  (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
588  return TargetTransformInfo::TCC_Free;
589  }
590  }
591  LLVM_FALLTHROUGH;
592  case ISD::FADD:
593  case ISD::FSUB:
594  if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
595  NElts = (NElts + 1) / 2;
596  if (SLT == MVT::f64)
597  return LT.first * NElts * get64BitInstrCost(CostKind);
598 
599  if (ST->has16BitInsts() && SLT == MVT::f16)
600  NElts = (NElts + 1) / 2;
601 
602  if (SLT == MVT::f32 || SLT == MVT::f16)
603  return LT.first * NElts * getFullRateInstrCost();
604  break;
605  case ISD::FDIV:
606  case ISD::FREM:
607  // FIXME: frem should be handled separately. The fdiv in it is most of it,
608  // but the current lowering is also not entirely correct.
609  if (SLT == MVT::f64) {
610  int Cost = 7 * get64BitInstrCost(CostKind) +
611  getQuarterRateInstrCost(CostKind) +
612  3 * getHalfRateInstrCost(CostKind);
613  // Add cost of workaround.
614  if (!ST->hasUsableDivScaleConditionOutput())
615  Cost += 3 * getFullRateInstrCost();
616 
617  return LT.first * Cost * NElts;
618  }
619 
620  if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
621  // TODO: This is more complicated, unsafe flags etc.
622  if ((SLT == MVT::f32 && !HasFP32Denormals) ||
623  (SLT == MVT::f16 && ST->has16BitInsts())) {
624  return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
625  }
626  }
627 
628  if (SLT == MVT::f16 && ST->has16BitInsts()) {
629  // 2 x v_cvt_f32_f16
630  // f32 rcp
631  // f32 fmul
632  // v_cvt_f16_f32
633  // f16 div_fixup
634  int Cost =
635  4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
636  return LT.first * Cost * NElts;
637  }
638 
639  if (SLT == MVT::f32 || SLT == MVT::f16) {
640  // 4 more v_cvt_* insts without f16 insts support
641  int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
642  1 * getQuarterRateInstrCost(CostKind);
643 
644  if (!HasFP32Denormals) {
645  // FP mode switches.
646  Cost += 2 * getFullRateInstrCost();
647  }
648 
649  return LT.first * NElts * Cost;
650  }
651  break;
652  case ISD::FNEG:
653  // Use the backend's estimation. If fneg is not free, each element will cost
654  // one additional instruction.
655  return TLI->isFNegFree(SLT) ? 0 : NElts;
656  default:
657  break;
658  }
659 
660  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
661  Opd1PropInfo, Opd2PropInfo, Args, CxtI);
662 }
663 
664 // Return true if there's a potential benefit from using v2f16/v2i16
665 // instructions for an intrinsic, even if it requires nontrivial legalization.
666 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
667  switch (ID) {
668  case Intrinsic::fma: // TODO: fmuladd
669  // There's a small benefit to using vector ops in the legalized code.
670  case Intrinsic::round:
671  case Intrinsic::uadd_sat:
672  case Intrinsic::usub_sat:
673  case Intrinsic::sadd_sat:
674  case Intrinsic::ssub_sat:
675  return true;
676  default:
677  return false;
678  }
679 }
680 
681 InstructionCost
682 GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
683  TTI::TargetCostKind CostKind) {
684  if (ICA.getID() == Intrinsic::fabs)
685  return 0;
686 
687  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
688  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
689 
690  Type *RetTy = ICA.getReturnType();
691 
692  // Legalize the type.
693  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
694 
695  unsigned NElts = LT.second.isVector() ?
696  LT.second.getVectorNumElements() : 1;
697 
698  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
699 
700  if (SLT == MVT::f64)
701  return LT.first * NElts * get64BitInstrCost(CostKind);
702 
703  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
704  (ST->hasPackedFP32Ops() && SLT == MVT::f32))
705  NElts = (NElts + 1) / 2;
706 
707  // TODO: Get more refined intrinsic costs?
708  unsigned InstRate = getQuarterRateInstrCost(CostKind);
709 
710  switch (ICA.getID()) {
711  case Intrinsic::fma:
712  InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
713  : getQuarterRateInstrCost(CostKind);
714  break;
715  case Intrinsic::uadd_sat:
716  case Intrinsic::usub_sat:
717  case Intrinsic::sadd_sat:
718  case Intrinsic::ssub_sat:
719  static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
720  if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
721  NElts = 1;
722  break;
723  }
724 
725  return LT.first * NElts * InstRate;
726 }
727 
728 InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
729  TTI::TargetCostKind CostKind,
730  const Instruction *I) {
731  assert((I == nullptr || I->getOpcode() == Opcode) &&
732  "Opcode should reflect passed instruction.");
733  const bool SCost =
734  (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
735  const int CBrCost = SCost ? 5 : 7;
736  switch (Opcode) {
737  case Instruction::Br: {
738  // Branch instruction takes about 4 slots on gfx900.
739  auto BI = dyn_cast_or_null<BranchInst>(I);
740  if (BI && BI->isUnconditional())
741  return SCost ? 1 : 4;
742  // Suppose a conditional branch takes an additional 3 exec manipulation
743  // instructions on average.
744  return CBrCost;
745  }
746  case Instruction::Switch: {
747  auto SI = dyn_cast_or_null<SwitchInst>(I);
748  // Each case (including default) takes 1 cmp + 1 cbr instruction on
749  // average.
750  return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
751  }
752  case Instruction::Ret:
753  return SCost ? 1 : 10;
754  }
755  return BaseT::getCFInstrCost(Opcode, CostKind, I);
756 }
757 
758 InstructionCost
759 GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
760  Optional<FastMathFlags> FMF,
761  TTI::TargetCostKind CostKind) {
762  if (TTI::requiresOrderedReduction(FMF))
763  return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
764 
765  EVT OrigTy = TLI->getValueType(DL, Ty);
766 
767  // Computes cost on targets that have packed math instructions (which
768  // support 16-bit types only).
769  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
770  return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
771 
772  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
773  return LT.first * getFullRateInstrCost();
774 }
775 
776 InstructionCost
777 GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
778  bool IsUnsigned,
779  TTI::TargetCostKind CostKind) {
780  EVT OrigTy = TLI->getValueType(DL, Ty);
781 
782  // Computes cost on targets that have packed math instructions (which
783  // support 16-bit types only).
784  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
785  return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
786 
787  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
788  return LT.first * getHalfRateInstrCost(CostKind);
789 }
790 
791 InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
792  unsigned Index) {
793  switch (Opcode) {
794  case Instruction::ExtractElement:
795  case Instruction::InsertElement: {
796  unsigned EltSize
797  = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
798  if (EltSize < 32) {
799  if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
800  return 0;
801  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
802  }
803 
804  // Extracts are just reads of a subregister, so are free. Inserts are
805  // considered free because we don't want to have any cost for scalarizing
806  // operations, and we don't have to copy into a different register class.
807 
808  // Dynamic indexing isn't free and is best avoided.
809  return Index == ~0u ? 2 : 0;
810  }
811  default:
812  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
813  }
814 }
815 
816 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
817 /// this is analyzing the collective result of all output registers. Otherwise,
818 /// this is only querying a specific result index if this returns multiple
819 /// registers in a struct.
820 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
821  const CallInst *CI, ArrayRef<unsigned> Indices) const {
822  // TODO: Handle complex extract indices
823  if (Indices.size() > 1)
824  return true;
825 
826  const DataLayout &DL = CI->getModule()->getDataLayout();
827  const SIRegisterInfo *TRI = ST->getRegisterInfo();
828  TargetLowering::AsmOperandInfoVector TargetConstraints =
829  TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
830 
831  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
832 
833  int OutputIdx = 0;
834  for (auto &TC : TargetConstraints) {
835  if (TC.Type != InlineAsm::isOutput)
836  continue;
837 
838  // Skip outputs we don't care about.
839  if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
840  continue;
841 
842  TLI->ComputeConstraintToUse(TC, SDValue());
843 
844  const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
845  TRI, TC.ConstraintCode, TC.ConstraintVT).second;
846 
847  // For AGPR constraints null is returned on subtargets without AGPRs, so
848  // assume divergent for null.
849  if (!RC || !TRI->isSGPRClass(RC))
850  return true;
851  }
852 
853  return false;
854 }
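In practice this means an inline asm output constrained to a scalar register class (for example an "=s" constraint on AMDGPU) is treated as uniform, while "=v" (VGPR) or "=a" (AGPR) outputs, or any constraint the subtarget cannot map to an SGPR class, are reported as divergent.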
855 
856 /// \returns true if the new GPU divergence analysis is enabled.
857 bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
858  return !UseLegacyDA;
859 }
860 
861 /// \returns true if the result of the value could potentially be
862 /// different across workitems in a wavefront.
864  if (const Argument *A = dyn_cast<Argument>(V))
865  return !AMDGPU::isArgPassedInSGPR(A);
866 
867  // Loads from the private and flat address spaces are divergent, because
868  // threads can execute the load instruction with the same inputs and get
869  // different results.
870  //
871  // All other loads are not divergent, because if threads issue loads with the
872  // same arguments, they will always get the same result.
873  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
874  return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
875  Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
876 
877  // Atomics are divergent because they are executed sequentially: when an
878  // atomic operation refers to the same address in each thread, then each
879  // thread after the first sees the value written by the previous thread as
880  // original value.
881  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
882  return true;
883 
884  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
885  return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
886 
887  // Assume all function calls are a source of divergence.
888  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
889  if (CI->isInlineAsm())
890  return isInlineAsmSourceOfDivergence(CI);
891  return true;
892  }
893 
894  // Assume all function calls are a source of divergence.
895  if (isa<InvokeInst>(V))
896  return true;
897 
898  return false;
899 }
900 
901 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
902  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
903  switch (Intrinsic->getIntrinsicID()) {
904  default:
905  return false;
906  case Intrinsic::amdgcn_readfirstlane:
907  case Intrinsic::amdgcn_readlane:
908  case Intrinsic::amdgcn_icmp:
909  case Intrinsic::amdgcn_fcmp:
910  case Intrinsic::amdgcn_ballot:
911  case Intrinsic::amdgcn_if_break:
912  return true;
913  }
914  }
915 
916  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
917  if (CI->isInlineAsm())
918  return !isInlineAsmSourceOfDivergence(CI);
919  return false;
920  }
921 
922  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
923  if (!ExtValue)
924  return false;
925 
926  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
927  if (!CI)
928  return false;
929 
930  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
931  switch (Intrinsic->getIntrinsicID()) {
932  default:
933  return false;
934  case Intrinsic::amdgcn_if:
935  case Intrinsic::amdgcn_else: {
936  ArrayRef<unsigned> Indices = ExtValue->getIndices();
937  return Indices.size() == 1 && Indices[0] == 1;
938  }
939  }
940  }
941 
942  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
943  // divergent for the overall struct return. We need to override it in the
944  // case we're extracting an SGPR component here.
945  if (CI->isInlineAsm())
946  return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
947 
948  return false;
949 }
950 
951 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
952  Intrinsic::ID IID) const {
953  switch (IID) {
954  case Intrinsic::amdgcn_atomic_inc:
955  case Intrinsic::amdgcn_atomic_dec:
956  case Intrinsic::amdgcn_ds_fadd:
957  case Intrinsic::amdgcn_ds_fmin:
958  case Intrinsic::amdgcn_ds_fmax:
959  case Intrinsic::amdgcn_is_shared:
960  case Intrinsic::amdgcn_is_private:
961  OpIndexes.push_back(0);
962  return true;
963  default:
964  return false;
965  }
966 }
967 
968 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
969  Value *OldV,
970  Value *NewV) const {
971  auto IntrID = II->getIntrinsicID();
972  switch (IntrID) {
973  case Intrinsic::amdgcn_atomic_inc:
974  case Intrinsic::amdgcn_atomic_dec:
975  case Intrinsic::amdgcn_ds_fadd:
976  case Intrinsic::amdgcn_ds_fmin:
977  case Intrinsic::amdgcn_ds_fmax: {
978  const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
979  if (!IsVolatile->isZero())
980  return nullptr;
981  Module *M = II->getParent()->getParent()->getParent();
982  Type *DestTy = II->getType();
983  Type *SrcTy = NewV->getType();
984  Function *NewDecl =
985  Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
986  II->setArgOperand(0, NewV);
987  II->setCalledFunction(NewDecl);
988  return II;
989  }
990  case Intrinsic::amdgcn_is_shared:
991  case Intrinsic::amdgcn_is_private: {
992  unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
993  AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
994  unsigned NewAS = NewV->getType()->getPointerAddressSpace();
995  LLVMContext &Ctx = NewV->getType()->getContext();
996  ConstantInt *NewVal = (TrueAS == NewAS) ?
997  ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
998  return NewVal;
999  }
1000  case Intrinsic::ptrmask: {
1001  unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1002  unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1003  Value *MaskOp = II->getArgOperand(1);
1004  Type *MaskTy = MaskOp->getType();
1005 
1006  bool DoTruncate = false;
1007 
1008  const GCNTargetMachine &TM =
1009  static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1010  if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1011  // All valid 64-bit to 32-bit casts work by chopping off the high
1012  // bits. Any masking only clearing the low bits will also apply in the new
1013  // address space.
1014  if (DL.getPointerSizeInBits(OldAS) != 64 ||
1015  DL.getPointerSizeInBits(NewAS) != 32)
1016  return nullptr;
1017 
1018  // TODO: Do we need to thread more context in here?
1019  KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
1020  if (Known.countMinLeadingOnes() < 32)
1021  return nullptr;
1022 
1023  DoTruncate = true;
1024  }
1025 
1026  IRBuilder<> B(II);
1027  if (DoTruncate) {
1028  MaskTy = B.getInt32Ty();
1029  MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1030  }
1031 
1032  return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1033  {NewV, MaskOp});
1034  }
1035  default:
1036  return nullptr;
1037  }
1038 }
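For the ptrmask case above, the known-bits check is what makes the 64-bit to 32-bit rewrite safe: the call is only rewritten when the mask has at least 32 leading ones (for example 0xFFFFFFFFFFFFF000), i.e. it only clears low bits, so truncating it to i32 preserves the same clearing in the smaller address space.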
1039 
1040 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1041  VectorType *VT, ArrayRef<int> Mask,
1042  int Index, VectorType *SubTp,
1043  ArrayRef<const Value *> Args) {
1044  Kind = improveShuffleKindFromMask(Kind, Mask);
1045  if (ST->hasVOP3PInsts()) {
1046  if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1047  DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1048  // With op_sel VOP3P instructions freely can access the low half or high
1049  // half of a register, so any swizzle is free.
1050 
1051  switch (Kind) {
1052  case TTI::SK_Broadcast:
1053  case TTI::SK_Reverse:
1054  case TTI::SK_PermuteSingleSrc:
1055  return 0;
1056  default:
1057  break;
1058  }
1059  }
1060  }
1061 
1062  return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp);
1063 }
1064 
1065 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1066  const Function *Callee) const {
1067  const TargetMachine &TM = getTLI()->getTargetMachine();
1068  const GCNSubtarget *CallerST
1069  = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1070  const GCNSubtarget *CalleeST
1071  = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1072 
1073  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1074  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1075 
1076  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1077  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1078  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1079  return false;
1080 
1081  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1082  // no way to support merge for backend defined attributes.
1083  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
1084  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
1085  if (!CallerMode.isInlineCompatible(CalleeMode))
1086  return false;
1087 
1088  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1089  Callee->hasFnAttribute(Attribute::InlineHint))
1090  return true;
1091 
1092  // Hack to make compile times reasonable.
1093  if (InlineMaxBB) {
1094  // Single BB does not increase total BB amount.
1095  if (Callee->size() == 1)
1096  return true;
1097  size_t BBSize = Caller->size() + Callee->size() - 1;
1098  return BBSize <= InlineMaxBB;
1099  }
1100 
1101  return true;
1102 }
1103 
1104 unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1105  // If we have a pointer to private array passed into a function
1106  // it will not be optimized out, leaving scratch usage.
1107  // Increase the inline threshold to allow inlining in this case.
1108  uint64_t AllocaSize = 0;
1109  SmallPtrSet<const AllocaInst *, 8> AIVisited;
1110  for (Value *PtrArg : CB->args()) {
1111  PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1112  if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
1113  Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
1114  continue;
1115 
1116  PtrArg = getUnderlyingObject(PtrArg);
1117  if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
1118  if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1119  continue;
1120  AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1121  // If the amount of stack memory is excessive we will not be able
1122  // to get rid of the scratch anyway, bail out.
1123  if (AllocaSize > ArgAllocaCutoff) {
1124  AllocaSize = 0;
1125  break;
1126  }
1127  }
1128  }
1129  if (AllocaSize)
1130  return ArgAllocaCost;
1131  return 0;
1132 }
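Net effect of adjustInliningThreshold above: a call site that passes pointers into static allocas totalling no more than ArgAllocaCutoff bytes gets its inlining bonus of ArgAllocaCost (4000 with the defaults above), giving SROA a chance to remove the scratch accesses once the callee is inlined; larger or dynamic allocas leave the threshold unchanged.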
1133 
1134 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1135  TTI::UnrollingPreferences &UP,
1136  OptimizationRemarkEmitter *ORE) {
1137  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1138 }
1139 
1140 void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1141  TTI::PeelingPreferences &PP) {
1142  CommonTTI.getPeelingPreferences(L, SE, PP);
1143 }
1144 
1145 int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1146  return ST->hasFullRate64Ops()
1147  ? getFullRateInstrCost()
1148  : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1149  : getQuarterRateInstrCost(CostKind);
1150 }
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:240
UseLegacyDA
static cl::opt< bool > UseLegacyDA("amdgpu-use-legacy-divergence-analysis", cl::desc("Enable legacy divergence analysis for AMDGPU"), cl::init(false), cl::Hidden)
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::GCNTTIImpl::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:777
llvm::TargetTransformInfo::UnrollingPreferences::BEInsns
unsigned BEInsns
Definition: TargetTransformInfo.h:480
llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
llvm::Type::isSized
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:264
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:37
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:487
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:210
llvm::Loop::isLoopInvariant
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:455
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:65
llvm::GCNTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: AMDGPUTargetTransformInfo.cpp:1134
llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1621
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:667
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1410
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:471
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:104
llvm::TargetOptions
Definition: TargetOptions.h:124
llvm::Function
Definition: Function.h:60
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
llvm::GCNTTIImpl::getMemcpyLoopResidualLoweringType
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, Optional< uint32_t > AtomicCpySize) const
Definition: AMDGPUTargetTransformInfo.cpp:432
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:122
llvm::AMDGPUTargetLowering::isFNegFree
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
Definition: AMDGPUISelLowering.cpp:761
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:53
llvm::CallBase::setCalledFunction
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1435
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:673
llvm::BasicTTIImplBase< GCNTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1085
llvm::PatternMatch::m_FPOne
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:826
llvm::GCNTTIImpl::isSourceOfDivergence
bool isSourceOfDivergence(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:863
llvm::CallBase::isInlineAsm
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition: InstrTypes.h:1464
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:729
llvm::IRBuilder<>
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:149
llvm::PointerType::getAddressSpace
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:682
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition: TargetTransformInfo.h:919
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:776
ValueTracking.h
llvm::AtomicOrdering::SequentiallyConsistent
@ SequentiallyConsistent
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:213
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
llvm::GCNTTIImpl::isLegalToVectorizeMemChain
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:369
llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:36
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:483
dependsOnLocalPhi
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
Definition: AMDGPUTargetTransformInfo.cpp:79
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::GCNTTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: AMDGPUTargetTransformInfo.cpp:473
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:537
llvm::BasicTTIImplBase< GCNTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:858
llvm::SITargetLowering::getRegForInlineAsmConstraint
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
Definition: SIISelLowering.cpp:11896
llvm::Optional< uint32_t >
llvm::AMDGPU::SIModeRegisterDefaults
Definition: AMDGPUBaseInfo.h:982
llvm::GCNTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: AMDGPUTargetTransformInfo.cpp:1065
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
llvm::GCNTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(unsigned RCID) const
Definition: AMDGPUTargetTransformInfo.cpp:297
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
llvm::VectorType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:422
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
llvm::TargetLowering::ComputeConstraintToUse
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
Definition: TargetLowering.cpp:5459
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:119
llvm::AMDGPU::SIModeRegisterDefaults::isInlineCompatible
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Definition: AMDGPUBaseInfo.h:1067
llvm::GCNTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:759
llvm::GCNSubtarget::hasPackedFP32Ops
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:854
llvm::AMDGPU::isIntrinsicSourceOfDivergence
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
Definition: AMDGPUBaseInfo.cpp:2302
llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition: TargetTransformInfo.h:882
llvm::Type::getInt8Ty
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:237
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1620
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:239
llvm::GCNSubtarget::getRegisterInfo
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:226
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:159
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:872
F
#define F(x, y, z)
Definition: MD5.cpp:55
KnownBits.h
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:366
llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(Optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1207
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
llvm::MVT::SimpleValueType
SimpleValueType
Definition: MachineValueType.h:33
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
Context
LLVMContext & Context
Definition: NVVMIntrRange.cpp:66
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::AllocaInst::isStaticAlloca
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Definition: Instructions.cpp:1419
llvm::LoopBase::getSubLoops
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
Definition: LoopInfo.h:143
llvm::MinAlign
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition: MathExtras.h:699
llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:28
llvm::GCNSubtarget::hasFastFMAF32
bool hasFastFMAF32() const
Definition: GCNSubtarget.h:305
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::AllocaInst::getAllocatedType
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:114
llvm::BasicTTIImplBase< GCNTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1138
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:871
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:34
llvm::GCNTTIImpl::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:388
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:56
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:56
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2218
llvm::GCNTTIImpl::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const
Definition: AMDGPUTargetTransformInfo.cpp:321
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:45
llvm::ISD::SRA
@ SRA
Definition: ISDOpcodes.h:692
llvm::TargetTransformInfo::UnrollingPreferences::MaxIterationsCountToAnalyze
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
Definition: TargetTransformInfo.h:509
llvm::GCNSubtarget::getMaxPrivateElementSize
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:278
llvm::GCNTTIImpl::getLoadVectorFactor
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:333
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
AMDGPU
Definition: AMDGPUReplaceLDSUseWithPointer.cpp:114
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:363
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::TargetTransformInfoImplBase::getMemcpyLoopResidualLoweringType
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, Optional< uint32_t > AtomicCpySize) const
Definition: TargetTransformInfoImpl.h:713
Options
const char LLVMTargetMachineRef LLVMPassBuilderOptionsRef Options
Definition: PassBuilderBindings.cpp:48
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
Definition: ValueTracking.cpp:4362
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:30
llvm::GCNTTIImpl::isAlwaysUniform
bool isAlwaysUniform(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:901
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:666
llvm::AMDGPUSubtarget::has16BitInsts
bool has16BitInsts() const
Definition: AMDGPUSubtarget.h:146
PatternMatch.h
llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition: TargetTransformInfo.h:919
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::AMDGPUTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:262
llvm::LinearPolySize< TypeSize >::getFixed
static TypeSize getFixed(ScalarTy MinVal)
Definition: TypeSize.h:283
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:91
llvm::lltok::Kind
Kind
Definition: LLToken.h:18
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:117
LoopInfo.h
InlineMaxBB
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
AMDGPUTargetTransformInfo.h
llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:873
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::AtomicOrdering
AtomicOrdering
Atomic ordering for LLVM's memory model.
Definition: AtomicOrdering.h:56
llvm::cl::opt
Definition: CommandLine.h:1392
llvm::GCNTTIImpl::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
Definition: AMDGPUTargetTransformInfo.cpp:482
AMDGPUMCTargetDesc.h
llvm::KnownBits::countMinLeadingOnes
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition: KnownBits.h:241
uint64_t
llvm::AMDGPUTTIImpl::AMDGPUTTIImpl
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:98
llvm::Instruction::hasAllowContract
bool hasAllowContract() const
Determine whether the allow-contract flag is set.
Definition: Instruction.cpp:279
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:577
const
aarch64 promote const
Definition: AArch64PromoteConstant.cpp:232
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::GCNTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=None)
Definition: AMDGPUTargetTransformInfo.cpp:1040
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::TargetLowering::ParseConstraints
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
Definition: TargetLowering.cpp:5111
llvm::AMDGPU::getIntegerAttribute
int getIntegerAttribute(const Function &F, StringRef Name, int Default)
Definition: AMDGPUBaseInfo.cpp:921
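A minimal sketch (not from this file) of reading an integer-valued function attribute with a fallback default; the attribute name "my-tuning-attr" and the include path are illustrative assumptions.

#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// Illustrative helper: returns the parsed value of the hypothetical
// "my-tuning-attr" attribute, or 16 when the attribute is absent.
static int getTuningValue(const Function &F) {
  return AMDGPU::getIntegerAttribute(F, "my-tuning-attr", /*Default=*/16);
}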
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:430
llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:362
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:898
llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:929
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
llvm::AMDGPUSubtarget::hasMadMacF32Insts
bool hasMadMacF32Insts() const
Definition: AMDGPUSubtarget.h:156
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:632
llvm::BasicTTIImplBase< AMDGPUTTIImpl >
intrinsicHasPackedVectorBenefit
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
Definition: AMDGPUTargetTransformInfo.cpp:666
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:222
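A minimal sketch (not from this file) combining computeKnownBits with KnownBits::countMinLeadingOnes to bound the leading one bits of an integer value; the helper is hypothetical.

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

// Illustrative helper: minimum number of leading one bits provable for V.
static unsigned minLeadingOnes(const Value *V, const DataLayout &DL) {
  KnownBits Known(V->getType()->getScalarSizeInBits());
  computeKnownBits(V, Known, DL);
  return Known.countMinLeadingOnes();
}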
llvm::TargetMachine::Options
TargetOptions Options
Definition: TargetMachine.h:118
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
llvm::LoopBase::getLoopDepth
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:96
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::GCNSubtarget::hasUnalignedScratchAccess
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:517
Mode
SI Whole Quad Mode
Definition: SIWholeQuadMode.cpp:262
UnrollMaxBlockToAnalyze
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
llvm::MVT
Machine Value Type.
Definition: MachineValueType.h:31
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
llvm::MDNode
Metadata node.
Definition: Metadata.h:937
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:890
llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:364
UnrollThresholdPrivate
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
llvm::GCNTargetMachine
Definition: AMDGPUTargetMachine.h:73
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:261
llvm::AMDGPUSubtarget::hasVOP3PInsts
bool hasVOP3PInsts() const
Definition: AMDGPUSubtarget.h:168
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1614
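A minimal sketch (not from this file) of llvm::any_of over an instruction's operand range, asking whether any operand is a PHI node; the helper is hypothetical.

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Illustrative helper: true if at least one operand of I is a PHINode.
static bool hasPHIOperand(const Instruction &I) {
  return any_of(I.operand_values(),
                [](const Value *V) { return isa<PHINode>(V); });
}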
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:47
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:137
llvm::AMDGPUTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: AMDGPUTargetTransformInfo.cpp:104
llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:359
llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:352
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:214
llvm::GCNTTIImpl::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:382
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::GCNTTIImpl::isInlineAsmSourceOfDivergence
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
Definition: AMDGPUTargetTransformInfo.cpp:820
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:868
UnrollThresholdIf
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1728
llvm::AMDGPU::isGraphics
bool isGraphics(CallingConv::ID cc)
Definition: AMDGPUBaseInfo.cpp:1605
llvm::findOptionMDForLoop
MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition: LoopInfo.cpp:1042
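A minimal sketch (not from this file) of querying loop metadata by name; "llvm.loop.unroll.disable" is a standard loop-metadata option, and the helper is hypothetical.

#include "llvm/Analysis/LoopInfo.h"

using namespace llvm;

// Illustrative helper: true if the loop carries llvm.loop.unroll.disable.
static bool hasUnrollDisable(const Loop *L) {
  return findOptionMDForLoop(L, "llvm.loop.unroll.disable") != nullptr;
}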
llvm::GCNTTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:309
llvm::ConstantInt::getSExtValue
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:148
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:584
llvm::AMDGPU::HSAMD::Kernel::Arg::Key::IsVolatile
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
Definition: AMDGPUMetadata.h:199
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:280
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:176
llvm::ISD::FMUL
@ FMUL
Definition: ISDOpcodes.h:392
llvm::CallBase::setArgOperand
void setArgOperand(unsigned i, Value *v)
Definition: InstrTypes.h:1346
UnrollThresholdLocal
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
llvm::ConstantInt::getFalse
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:874
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:668
Callee
amdgpu Simplify well known AMD library false FunctionCallee Callee
Definition: AMDGPULibCalls.cpp:186
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:347
llvm::GCNTTIImpl::getMemcpyLoopLoweringType
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, Optional< uint32_t > AtomicElementSize) const
Definition: AMDGPUTargetTransformInfo.cpp:401
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:165
ArgAllocaCost
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
llvm::GCNTTIImpl::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
Definition: AMDGPUTargetTransformInfo.cpp:325
llvm::GCNTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:728
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:240
llvm::AMDGPU::isArgPassedInSGPR
bool isArgPassedInSGPR(const Argument *A)
Definition: AMDGPUBaseInfo.cpp:2112
llvm::ConstantInt::getTrue
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:867
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::getIntNTy
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:243
llvm::SITargetLowering::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Definition: SIISelLowering.cpp:12740
llvm::AMDGPUAS::BUFFER_FAT_POINTER
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
Definition: AMDGPU.h:368
llvm::GCNSubtarget::hasFullRate64Ops
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:313
llvm::GCNTTIImpl::collectFlatAddressOperands
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: AMDGPUTargetTransformInfo.cpp:951
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:344
llvm::GCNTTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:354
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:360
llvm::ExtractValueInst
This instruction extracts a struct member or array element value from an aggregate value.
Definition: Instructions.h:2411
llvm::GCNTTIImpl::getStoreVectorFactor
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:344
llvm::TypeSize
Definition: TypeSize.h:421
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::LinearPolySize< TypeSize >::getScalable
static TypeSize getScalable(ScalarTy MinVal)
Definition: TypeSize.h:286
llvm::GCNTTIImpl::adjustInliningThreshold
unsigned adjustInliningThreshold(const CallBase *CB) const
Definition: AMDGPUTargetTransformInfo.cpp:1104
llvm::GCNTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:513
llvm::SDValue
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
Definition: SelectionDAGNodes.h:137
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition: TargetTransformInfo.h:919
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:46
llvm::GCNTTIImpl::rewriteIntrinsicWithAddressSpace
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: AMDGPUTargetTransformInfo.cpp:968
llvm::InlineAsm::isOutput
@ isOutput
Definition: InlineAsm.h:95
llvm::FPOpFusion::Fast
@ Fast
Definition: TargetOptions.h:37
llvm::BasicTTIImplBase< GCNTTIImpl >::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
Definition: BasicTTIImpl.h:2228
llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
llvm::ISD::FSUB
@ FSUB
Definition: ISDOpcodes.h:391
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:147
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:691
llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition: TargetTransformInfo.h:919
llvm::ISD::FREM
@ FREM
Definition: ISDOpcodes.h:394
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:54
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1341
llvm::BasicTTIImplBase< GCNTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1379
llvm::TargetTransformInfo::UnrollingPreferences::Threshold
unsigned Threshold
The cost threshold for the unrolled loop.
Definition: TargetTransformInfo.h:438
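A minimal sketch (not from this file) of how a target hook can raise UnrollingPreferences::Threshold; the bonus logic is illustrative only, not the AMDGPU heuristic.

#include "llvm/Analysis/TargetTransformInfo.h"
#include <algorithm>

using namespace llvm;

// Illustrative helper: never lower an already-raised threshold, only bump it
// up to Bonus.
static void raiseUnrollThreshold(TargetTransformInfo::UnrollingPreferences &UP,
                                 unsigned Bonus) {
  UP.Threshold = std::max(UP.Threshold, Bonus);
}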
llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPU.h:358
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
llvm::ISD::SRL
@ SRL
Definition: ISDOpcodes.h:693
llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:164
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
llvm::GCNTTIImpl::GCNTTIImpl
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:287
llvm::PHINode
Definition: Instructions.h:2664
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
llvm::MemIntrinsicInfo
Information about a load/store intrinsic defined by the target.
Definition: TargetTransformInfo.h:69
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1174
llvm::Type::getInt16Ty
static IntegerType * getInt16Ty(LLVMContext &C)
Definition: Type.cpp:238
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::DataLayout::getPointerSizeInBits
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:412
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:45
llvm::LoopBase::isLoopExiting
bool isLoopExiting(const BlockT *BB) const
True if the terminator in the block can branch to another block that is outside of the current loop.
Definition: LoopInfo.h:225
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1474
llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:904
BB
Definition: README.txt:39
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:172
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:394
llvm::GCNTTIImpl::useGPUDivergenceAnalysis
bool useGPUDivergenceAnalysis() const
Definition: AMDGPUTargetTransformInfo.cpp:857
llvm::AllocaInst
an instruction to allocate memory on the stack
Definition: Instructions.h:58
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:405
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1459
ArgAllocaCutoff
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3099
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:89
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:55
UnrollRuntimeLocal
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
llvm::ExtractValueInst::getIndices
ArrayRef< unsigned > getIndices() const
Definition: Instructions.h:2476
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::GCNTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:682
AMDGPUTargetMachine.h
llvm::CallBase::args
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1332
llvm::GCNTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index)
Definition: AMDGPUTargetTransformInfo.cpp:791
llvm::TargetLowering::AsmOperandInfoVector
std::vector< AsmOperandInfo > AsmOperandInfoVector
Definition: TargetLowering.h:4453
llvm::GCNSubtarget::hasUsableDivScaleConditionOutput
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:411
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
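A minimal sketch (not from this file) using the pair returned by insert() to count distinct basic blocks, the usual visited-set idiom; the helper is hypothetical.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/BasicBlock.h"

using namespace llvm;

// Illustrative helper: counts unique blocks; insert(...).second is false for
// elements already present in the set.
static unsigned countUniqueBlocks(ArrayRef<const BasicBlock *> Blocks) {
  SmallPtrSet<const BasicBlock *, 8> Seen;
  unsigned Unique = 0;
  for (const BasicBlock *BB : Blocks)
    if (Seen.insert(BB).second)
      ++Unique;
  return Unique;
}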
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37
llvm::BasicTTIImplBase< GCNTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=None)
Definition: BasicTTIImpl.h:891
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:506
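A minimal sketch (not from this file) of the usual size query for a static alloca via DataLayout::getTypeAllocSize; the helper is hypothetical.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <cstdint>

using namespace llvm;

// Illustrative helper: size in bytes of one allocated object of the alloca's
// type, including alignment padding between successive objects.
static uint64_t staticAllocaBytes(const AllocaInst &AI, const DataLayout &DL) {
  return DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize();
}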
llvm::ISD::FDIV
@ FDIV
Definition: ISDOpcodes.h:393
llvm::GCNTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:1140