AMDGPUTargetTransformInfo.cpp
//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(200), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(true), cl::Hidden);

static cl::opt<bool> UseLegacyDA(
    "amdgpu-use-legacy-divergence-analysis",
    cl::desc("Enable legacy divergence analysis for AMDGPU"),
    cl::init(false), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));

static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
            return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // A conditional branch in a loop back edge needs 3 additional exec
  // manipulations on average.
  UP.BEInsns += 3;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold.
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
      }
    }
  }

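  // As an illustration (hand-written IR, the value 100 is arbitrary): a loop
  // whose latch branch carries metadata along these lines
  //
  //   br i1 %exitcond, label %exit, label %body, !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}
  //
  // would have UP.Threshold and UP.PartialThreshold set to 100 by the block
  // above, with ThresholdPrivate and ThresholdLocal clamped to the same value.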
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
          return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate
      // the if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable; most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
              return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from an inner loop then increase the max
    // trip count to analyze for a better cost estimation in unroll.
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}

void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be that ECC is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};

GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
      MaxVGPRs(ST->getMaxNumVGPRs(
          std::max(ST->getWavesPerEU(F).first,
                   ST->getWavesPerEUForWorkGroup(
                       ST->getFlatWorkGroupSizes(F).second)))) {
  AMDGPU::SIModeRegisterDefaults Mode(F);
  HasFP32Denormals = Mode.allFP32Denormals();
  HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
}

unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return MaxVGPRs;
}

unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  const TargetRegisterClass *RC = TRI->getRegClass(RCID);
  unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
  return getHardwareNumberOfRegisters(false) / NumVGPRs;
}

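// For example, on a subtarget where MaxVGPRs is 256 (an assumed value, for
// illustration only), getNumberOfRegisters(bool) reports 256 >> 3 = 32
// registers to the vectorizer, while getNumberOfRegisters(RCID) for a 64-bit
// register class yields 256 / 2 = 128.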
TypeSize
GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }
  llvm_unreachable("Unsupported register kind");
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
    return 32 * 4 / ElemWidth;
  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
       : 1;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  // Common to flat, global, local and region. Assume for unknown addrspace.
  return 128;
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
// FIXME: Should we use narrower types for local/region, or account for when
// unaligned access is legal?
//
// FIXME: This could use fine tuning and microbenchmarks.
Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                                            unsigned SrcAddrSpace,
                                            unsigned DestAddrSpace,
                                            unsigned SrcAlign,
                                            unsigned DestAlign) const {
  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
  // hardware into byte accesses. If you assume all alignments are equally
  // probable, it's more efficient on average to use short accesses for this
  // case.
  if (MinAlign == 2)
    return Type::getInt16Ty(Context);

  // Not all subtargets have 128-bit DS instructions, and we currently don't
  // form them by default.
  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
  }

  // Global memory works best with 16-byte accesses. Private memory will also
  // hit this, although they'll be decomposed.
  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
}

void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    unsigned SrcAlign, unsigned DestAlign) const {
  assert(RemainingBytes < 16);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  if (MinAlign != 2) {
    Type *I64Ty = Type::getInt64Ty(Context);
    while (RemainingBytes >= 8) {
      OpsOut.push_back(I64Ty);
      RemainingBytes -= 8;
    }

    Type *I32Ty = Type::getInt32Ty(Context);
    while (RemainingBytes >= 4) {
      OpsOut.push_back(I32Ty);
      RemainingBytes -= 4;
    }
  }

  Type *I16Ty = Type::getInt16Ty(Context);
  while (RemainingBytes >= 2) {
    OpsOut.push_back(I16Ty);
    RemainingBytes -= 2;
  }

  Type *I8Ty = Type::getInt8Ty(Context);
  while (RemainingBytes) {
    OpsOut.push_back(I8Ty);
    --RemainingBytes;
  }
}

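// As a worked reading of the residual lowering above: a 15-byte tail with
// MinAlign != 2 is emitted as i64 + i32 + i16 + i8, whereas with MinAlign == 2
// it becomes seven i16 accesses followed by a single i8.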
unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isZero();
    return true;
  }
  default:
    return false;
  }
}

InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, only the legal types,
  // we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c), assuming the fadd|fsub will get the estimated cost for the
    // whole fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    LLVM_FALLTHROUGH;
  case ISD::FADD:
  case ISD::FSUB:
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free each element will cost
    // one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}

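// One way to read the FDIV case above (an interpretation, not taken from the
// original comments): "fdiv float 1.0, %x" with FP32 denormals disabled is
// costed as one quarter-rate instruction per element (presumably the rcp fast
// path), while a general f32 divide is 10 full-rate plus one quarter-rate
// instruction, with two more full-rate instructions added for FP mode switches
// when denormals are disabled.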
// Return true if there's a potential benefit from using v2f16/v2i16
// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::fma: // TODO: fmuladd
  // There's a small benefit to using vector ops in the legalized code.
  case Intrinsic::round:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    return true;
  default:
    return false;
  }
}

InstructionCost
GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  if (ICA.getID() == Intrinsic::fabs)
    return 0;

  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();
  EVT OrigTy = TLI->getValueType(DL, RetTy);
  if (!OrigTy.isSimple()) {
    if (CostKind != TTI::TCK_CodeSize)
      return BaseT::getIntrinsicInstrCost(ICA, CostKind);

    // TODO: Combine these two logic paths.
    if (ICA.isTypeBasedOnly())
      return getTypeBasedIntrinsicInstrCost(ICA, CostKind);

    unsigned RetVF =
        (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
                             : 1);
    const IntrinsicInst *I = ICA.getInst();
    const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
    FastMathFlags FMF = ICA.getFlags();
    // Assume that we need to scalarize this intrinsic.

    // Compute the scalarization overhead based on Args for a vector
    // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
    // CostModel will pass a vector RetTy and VF is 1.
    InstructionCost ScalarizationCost = InstructionCost::getInvalid();
    if (RetVF > 1) {
      ScalarizationCost = 0;
      if (!RetTy->isVoidTy())
        ScalarizationCost +=
            getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
      ScalarizationCost +=
          getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
    }

    IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
                                  ScalarizationCost);
    return getIntrinsicInstrCost(Attrs, CostKind);
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost(CostKind);

  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost(CostKind);

  switch (ICA.getID()) {
  case Intrinsic::fma:
    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                   : getQuarterRateInstrCost(CostKind);
    break;
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      NElts = 1;
    break;
  }

  return LT.first * NElts * InstRate;
}

InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");
  const bool SCost =
      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
  const int CBrCost = SCost ? 5 : 7;
  switch (Opcode) {
  case Instruction::Br: {
    // Branch instruction takes about 4 slots on gfx900.
    auto BI = dyn_cast_or_null<BranchInst>(I);
    if (BI && BI->isUnconditional())
      return SCost ? 1 : 4;
    // Assume a conditional branch takes 3 additional exec manipulation
    // instructions on average.
    return CBrCost;
  }
  case Instruction::Switch: {
    auto SI = dyn_cast_or_null<SwitchInst>(I);
    // Each case (including default) takes 1 cmp + 1 cbr instructions on
    // average.
    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
  }
  case Instruction::Ret:
    return SCost ? 1 : 10;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost
GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                       Optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Compute the cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}

InstructionCost
GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                   bool IsUnsigned,
                                   TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Compute the cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost(CostKind);
}

InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getModule()->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(TC, SDValue());

    Register AssignedReg;
    const TargetRegisterClass *RC;
    std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
        TRI, TC.ConstraintCode, TC.ConstraintVT);
    if (AssignedReg) {
      // FIXME: This is a workaround for getRegForInlineAsmConstraint
      // returning VS_32
      RC = TRI->getPhysRegClass(AssignedReg);
    }

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  return false;
}
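// For instance, an inline asm output constrained to an SGPR ("=s") is treated
// as uniform here, while a VGPR output ("=v"), or an AGPR output ("=a") on a
// subtarget without AGPRs, is reported as divergent.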

/// \returns true if the new GPU divergence analysis is enabled.
bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
  return !UseLegacyDA;
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !AMDGPU::isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return isInlineAsmSourceOfDivergence(CI);
    return true;
  }

  // Assume all function calls are a source of divergence.
  if (isa<InvokeInst>(V))
    return true;

  return false;
}

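// For instance, a kernel argument (passed in SGPRs) is uniform, while the
// result of a workitem-id intrinsic, an atomic, or a load through a flat or
// private pointer is reported as divergent by this hook.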
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
    case Intrinsic::amdgcn_ballot:
    case Intrinsic::amdgcn_if_break:
      return true;
    }
  }

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());

  return false;
}

bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
    if (!IsVolatile->isZero())
      return nullptr;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl =
        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
        AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
        ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    return NewVal;
  }
  case Intrinsic::ptrmask: {
    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    Value *MaskOp = II->getArgOperand(1);
    Type *MaskTy = MaskOp->getType();

    bool DoTruncate = false;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking only clearing the low bits will also apply in the new
      // address space.
      if (DL.getPointerSizeInBits(OldAS) != 64 ||
          DL.getPointerSizeInBits(NewAS) != 32)
        return nullptr;

      // TODO: Do we need to thread more context in here?
      KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
      if (Known.countMinLeadingOnes() < 32)
        return nullptr;

      DoTruncate = true;
    }

    IRBuilder<> B(II);
    if (DoTruncate) {
      MaskTy = B.getInt32Ty();
      MaskOp = B.CreateTrunc(MaskOp, MaskTy);
    }

    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                             {NewV, MaskOp});
  }
  default:
    return nullptr;
  }
}
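// For example, when InferAddressSpaces rewrites the flat pointer operand of
// llvm.amdgcn.is.shared to an addrspace(3) (LDS) pointer, the call is folded
// to true above; rewriting it to any other address space folds it to false.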

InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *VT, ArrayRef<int> Mask,
                                           int Index, VectorType *SubTp) {
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (ST->hasVOP3PInsts()) {
    if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low or high half
      // of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp);
}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
  if (!CallerMode.isInlineCompatible(CalleeMode))
    return false;

  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
      Callee->hasFnAttribute(Attribute::InlineHint))
    return true;

  // Hack to make compile times reasonable.
  if (InlineMaxBB) {
    // Single BB does not increase total BB amount.
    if (Callee->size() == 1)
      return true;
    size_t BBSize = Caller->size() + Callee->size() - 1;
    return BBSize <= InlineMaxBB;
  }

  return true;
}

unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  // If a pointer to a private array is passed into a function
  // it will not be optimized out, leaving scratch usage.
  // Increase the inline threshold to allow inlining in this case.
  uint64_t AllocaSize = 0;
  SmallPtrSet<const AllocaInst *, 8> AIVisited;
  for (Value *PtrArg : CB->args()) {
    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
    if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
                Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
      continue;

    PtrArg = getUnderlyingObject(PtrArg);
    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
        continue;
      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
      // If the amount of stack memory is excessive we will not be able
      // to get rid of the scratch anyway, bail out.
      if (AllocaSize > ArgAllocaCutoff) {
        AllocaSize = 0;
        break;
      }
    }
  }
  if (AllocaSize)
    return ArgAllocaCost;
  return 0;
}

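// Sketch of the intended effect (assuming the inliner adds this return value
// to its threshold): a call passing a pointer into a 64-byte static private
// alloca gets the ArgAllocaCost bonus, while a call whose summed alloca size
// exceeds ArgAllocaCutoff gets no bonus at all.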
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}

void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}

int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
  return ST->hasFullRate64Ops()
             ? getFullRateInstrCost()
             : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
                                      : getQuarterRateInstrCost(CostKind);
}
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:240
UseLegacyDA
static cl::opt< bool > UseLegacyDA("amdgpu-use-legacy-divergence-analysis", cl::desc("Enable legacy divergence analysis for AMDGPU"), cl::init(false), cl::Hidden)
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::GCNTTIImpl::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:814
llvm::TargetTransformInfo::UnrollingPreferences::BEInsns
unsigned BEInsns
Definition: TargetTransformInfo.h:482
llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:29
Attrs
Function Attrs
Definition: README_ALTIVEC.txt:215
llvm::Type::isSized
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:263
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:39
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:489
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:212
llvm::Loop::isLoopInvariant
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:64
llvm::BasicTTIImplBase< GCNTTIImpl >::getOperandsScalarizationOverhead
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:714
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:457
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AllocatorList.h:23
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:363
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:66
llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPU.h:358
llvm::GCNTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: AMDGPUTargetTransformInfo.cpp:1177
llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1614
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:633
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1384
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:473
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
llvm::TargetOptions
Definition: TargetOptions.h:124
llvm::Function
Definition: Function.h:62
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:122
llvm::AMDGPUTargetLowering::isFNegFree
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
Definition: AMDGPUISelLowering.cpp:892
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
llvm::CallBase::setCalledFunction
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1434
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:664
llvm::BasicTTIImplBase< GCNTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1061
llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:359
llvm::PatternMatch::m_FPOne
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:823
llvm::GCNTTIImpl::isSourceOfDivergence
bool isSourceOfDivergence(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:907
llvm::CallBase::isInlineAsm
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition: InstrTypes.h:1463
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:734
llvm::IRBuilder<>
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:151
llvm::PointerType::getAddressSpace
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:687
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:460
llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition: TargetTransformInfo.h:911
llvm::IntrinsicCostAttributes::getInst
const IntrinsicInst * getInst() const
Definition: TargetTransformInfo.h:150
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:757
ValueTracking.h
llvm::AtomicOrdering::SequentiallyConsistent
@ SequentiallyConsistent
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:215
llvm::GCNTTIImpl::getMemcpyLoopLoweringType
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign) const
Definition: AMDGPUTargetTransformInfo.cpp:413
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
llvm::GCNTTIImpl::isLegalToVectorizeMemChain
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:381
llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:36
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:485
dependsOnLocalPhi
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
Definition: AMDGPUTargetTransformInfo.cpp:79
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::GCNTTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: AMDGPUTargetTransformInfo.cpp:476
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:539
llvm::BasicTTIImplBase< GCNTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:839
llvm::SITargetLowering::getRegForInlineAsmConstraint
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
Definition: SIISelLowering.cpp:11669
llvm::Optional
Definition: APInt.h:33
llvm::AMDGPU::SIModeRegisterDefaults
Definition: AMDGPUBaseInfo.h:915
llvm::GCNTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: AMDGPUTargetTransformInfo.cpp:1108
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
llvm::VectorType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:422
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
llvm::TargetLowering::ComputeConstraintToUse
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
Definition: TargetLowering.cpp:5066
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:116
llvm::AMDGPU::SIModeRegisterDefaults::isInlineCompatible
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Definition: AMDGPUBaseInfo.h:1000
llvm::GCNTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:796
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:165
llvm::AMDGPU::IsaInfo::getMaxNumVGPRs
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU)
Definition: AMDGPUBaseInfo.cpp:747
llvm::GCNSubtarget::hasPackedFP32Ops
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:833
llvm::AMDGPU::isIntrinsicSourceOfDivergence
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
Definition: AMDGPUBaseInfo.cpp:2016
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition: TargetTransformInfo.h:874
llvm::Type::getInt8Ty
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:239
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1559
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:241
llvm::GCNSubtarget::getRegisterInfo
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:219
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:158
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:864
F
#define F(x, y, z)
Definition: MD5.cpp:56
KnownBits.h
llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(Optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1177
Context
ManagedStatic< detail::RecordContext > Context
Definition: Record.cpp:96
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::IntrinsicCostAttributes::getFlags
FastMathFlags getFlags() const
Definition: TargetTransformInfo.h:152
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:130
llvm::MVT::SimpleValueType
SimpleValueType
Definition: MachineValueType.h:33
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::IntrinsicCostAttributes::getArgTypes
const SmallVectorImpl< Type * > & getArgTypes() const
Definition: TargetTransformInfo.h:155
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::AllocaInst::isStaticAlloca
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Definition: Instructions.cpp:1397
llvm::LoopBase::getSubLoops
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
Definition: LoopInfo.h:143
llvm::MinAlign
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition: MathExtras.h:672
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:366
llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:30
llvm::GCNSubtarget::hasFastFMAF32
bool hasFastFMAF32() const
Definition: GCNSubtarget.h:298
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::AllocaInst::getAllocatedType
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:113
llvm::BasicTTIImplBase< GCNTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1114
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:863
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
llvm::GCNTTIImpl::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:400
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:56
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2145
llvm::GCNTTIImpl::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const
Definition: AMDGPUTargetTransformInfo.cpp:333
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:46
llvm::ISD::SRA
@ SRA
Definition: ISDOpcodes.h:658
llvm::GCNTTIImpl::getMemcpyLoopResidualLoweringType
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign) const
Definition: AMDGPUTargetTransformInfo.cpp:441
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:226
llvm::TargetTransformInfo::UnrollingPreferences::MaxIterationsCountToAnalyze
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
Definition: TargetTransformInfo.h:511
llvm::GCNSubtarget::getMaxPrivateElementSize
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:271
llvm::GCNTTIImpl::getLoadVectorFactor
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:345
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:191
Options
const char LLVMTargetMachineRef LLVMPassBuilderOptionsRef Options
Definition: PassBuilderBindings.cpp:48
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
Definition: ValueTracking.cpp:4397
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:30
llvm::GCNTTIImpl::isAlwaysUniform
bool isAlwaysUniform(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:945
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:632
llvm::AMDGPUSubtarget::has16BitInsts
bool has16BitInsts() const
Definition: AMDGPUSubtarget.h:144
PatternMatch.h
llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition: TargetTransformInfo.h:911
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:686
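Illustrative only (not code from this file); builds the IR type <4 x i16> from a context:
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
// Hypothetical helper: construct the fixed-width vector type <4 x i16>.
static llvm::FixedVectorType *getV4I16Ty(llvm::LLVMContext &Ctx) {
  return llvm::FixedVectorType::get(llvm::Type::getInt16Ty(Ctx), 4);
}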
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
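A small illustration of constructing and inspecting an alignment; the helper name and the 4-byte requirement are assumptions, not values taken from this file:
#include "llvm/Support/Alignment.h"
// Hypothetical helper: check that an access is at least dword aligned.
// Align::value() returns the alignment in bytes (always a power of two).
static bool isAtLeastDWordAligned(llvm::Align Alignment) {
  return Alignment.value() >= 4;
}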
llvm::AMDGPUTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:262
llvm::LinearPolySize< TypeSize >::getFixed
static TypeSize getFixed(ScalarTy MinVal)
Definition: TypeSize.h:283
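Illustrative only: a fixed (non-scalable) size contrasted with the scalable form documented further below; the 128-bit value is a placeholder:
#include "llvm/Support/TypeSize.h"
// Hypothetical helpers: fixed vs. scalable sizes of 128 (minimum) bits.
static llvm::TypeSize fixed128() { return llvm::TypeSize::getFixed(128); }
static llvm::TypeSize scalable128() { return llvm::TypeSize::getScalable(128); }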
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:91
llvm::lltok::Kind
Kind
Definition: LLToken.h:18
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:119
LoopInfo.h
InlineMaxBB
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
AMDGPUTargetTransformInfo.h
llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:377
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:865
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::AtomicOrdering
AtomicOrdering
Atomic ordering for LLVM's memory model.
Definition: AtomicOrdering.h:56
llvm::cl::opt
Definition: CommandLine.h:1432
llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:364
llvm::GCNTTIImpl::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
Definition: AMDGPUTargetTransformInfo.cpp:485
AMDGPUMCTargetDesc.h
llvm::TargetRegisterInfo::getRegClass
const TargetRegisterClass * getRegClass(unsigned i) const
Returns the register class associated with the enumeration value.
Definition: TargetRegisterInfo.h:739
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
llvm::KnownBits::countMinLeadingOnes
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition: KnownBits.h:241
uint64_t
llvm::AMDGPUTTIImpl::AMDGPUTTIImpl
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:98
llvm::Instruction::hasAllowContract
bool hasAllowContract() const
Determine whether the allow-contract flag is set.
Definition: Instruction.cpp:276
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:578
const
aarch64 promote const
Definition: AArch64PromoteConstant.cpp:232
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::TargetLowering::ParseConstraints
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
Definition: TargetLowering.cpp:4715
llvm::AMDGPU::getIntegerAttribute
int getIntegerAttribute(const Function &F, StringRef Name, int Default)
Definition: AMDGPUBaseInfo.cpp:854
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:432
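As a generic sketch (not the heuristic implemented in this file), a target's getUnrollingPreferences override mutates fields of this struct; the function name and values below are placeholders:
#include "llvm/Analysis/TargetTransformInfo.h"
// Hypothetical tuning: raise the unroll cost threshold and cap how many
// iterations the full-unroll cost model may simulate.  Values are
// illustrative, not the ones this target uses.
static void tuneUnrolling(llvm::TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Threshold = 300;
  UP.MaxIterationsCountToAnalyze = 32;
}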
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:890
llvm::GetElementPtrInst
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:928
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:441
llvm::AMDGPUSubtarget::hasMadMacF32Insts
bool hasMadMacF32Insts() const
Definition: AMDGPUSubtarget.h:152
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:632
llvm::BasicTTIImplBase< AMDGPUTTIImpl >
intrinsicHasPackedVectorBenefit
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
Definition: AMDGPUTargetTransformInfo.cpp:669
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:224
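Illustrative use of known-bits analysis, shown with the overload that returns a KnownBits by value rather than the out-parameter form above; the helper name is hypothetical:
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"
// Hypothetical helper: how many leading one bits V is guaranteed to have.
static unsigned minLeadingOnes(const llvm::Value *V, const llvm::DataLayout &DL) {
  llvm::KnownBits Known = llvm::computeKnownBits(V, DL);
  return Known.countMinLeadingOnes();
}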
llvm::TargetMachine::Options
TargetOptions Options
Definition: TargetMachine.h:121
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:80
llvm::LoopBase::getLoopDepth
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:96
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::GCNSubtarget::hasUnalignedScratchAccess
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:510
Mode
SI Whole Quad Mode
Definition: SIWholeQuadMode.cpp:262
llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:138
UnrollMaxBlockToAnalyze
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
llvm::MVT
Machine Value Type.
Definition: MachineValueType.h:31
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::MDNode
Metadata node.
Definition: Metadata.h:906
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:882
llvm::GCNTTIImpl::getHardwareNumberOfRegisters
unsigned getHardwareNumberOfRegisters(bool Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:301
UnrollThresholdPrivate
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
llvm::GCNTargetMachine
Definition: AMDGPUTargetMachine.h:76
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:263
llvm::AMDGPUSubtarget::hasVOP3PInsts
bool hasVOP3PInsts() const
Definition: AMDGPUSubtarget.h:164
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1607
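Illustrative only; the helper name blockHasLoad is hypothetical:
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
// Hypothetical helper: does the block contain any load instruction?
static bool blockHasLoad(const llvm::BasicBlock &BB) {
  return llvm::any_of(
      BB, [](const llvm::Instruction &I) { return llvm::isa<llvm::LoadInst>(I); });
}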
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:47
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:179
llvm::AMDGPUTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: AMDGPUTargetTransformInfo.cpp:104
llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:353
llvm::BasicTTIImplBase< GCNTTIImpl >::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:678
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:216
llvm::GCNTTIImpl::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:394
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:134
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:360
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::GCNTTIImpl::isInlineAsmSourceOfDivergence
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
Definition: AMDGPUTargetTransformInfo.cpp:857
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:870
UnrollThresholdIf
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1767
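A generic sketch of a common cost-model idiom: translate the IR opcode to an ISD opcode before switching on it. The helper is hypothetical:
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
// Hypothetical helper: does this IR opcode lower to a floating-point add?
static bool lowersToFAdd(const llvm::TargetLoweringBase &TLI, unsigned Opcode) {
  return TLI.InstructionOpcodeToISD(Opcode) == llvm::ISD::FADD;
}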
llvm::AMDGPU::isGraphics
bool isGraphics(CallingConv::ID cc)
Definition: AMDGPUBaseInfo.cpp:1375
llvm::findOptionMDForLoop
MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition: LoopInfo.cpp:1046
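Illustrative query of a named loop hint; the helper name is hypothetical, and the metadata string is the standard unroll-disable hint:
#include "llvm/Analysis/LoopInfo.h"
// Hypothetical helper: true if the loop carries "llvm.loop.unroll.disable".
static bool hasUnrollDisableHint(const llvm::Loop *L) {
  return llvm::findOptionMDForLoop(L, "llvm.loop.unroll.disable") != nullptr;
}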
llvm::GCNTTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:321
llvm::ConstantInt::getSExtValue
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:148
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:565
llvm::AMDGPU::HSAMD::Kernel::Arg::Key::IsVolatile
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
Definition: AMDGPUMetadata.h:194
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:286
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:127
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:175
llvm::ISD::FMUL
@ FMUL
Definition: ISDOpcodes.h:379
llvm::CallBase::setArgOperand
void setArgOperand(unsigned i, Value *v)
Definition: InstrTypes.h:1348
UnrollThresholdLocal
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:362
llvm::ConstantInt::getFalse
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:880
llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:634
Callee

Definition: AMDGPULibCalls.cpp:206
llvm::AMDGPUAS::BUFFER_FAT_POINTER
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
Definition: AMDGPU.h:368
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:339
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:165
ArgAllocaCost
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
llvm::GCNTTIImpl::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
Definition: AMDGPUTargetTransformInfo.cpp:337
llvm::GCNTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:765
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:242
llvm::GCNTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(bool Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:307
llvm::AMDGPU::isArgPassedInSGPR
bool isArgPassedInSGPR(const Argument *A)
Definition: AMDGPUBaseInfo.cpp:1827
llvm::ConstantInt::getTrue
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:873
std
Definition: BitVector.h:838
llvm::KnownBits
Definition: KnownBits.h:23
llvm::SITargetLowering::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Definition: SIISelLowering.cpp:12472
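A generic sketch of how cost hooks use type legalization: split the type into legal registers and scale an assumed per-register cost. The helper name and PerRegCost parameter are placeholders, not values from this file:
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/InstructionCost.h"
// Hypothetical helper: charge PerRegCost for every register Ty legalizes to.
static llvm::InstructionCost
perRegisterCost(const llvm::TargetLoweringBase &TLI, const llvm::DataLayout &DL,
                llvm::Type *Ty, unsigned PerRegCost) {
  std::pair<llvm::InstructionCost, llvm::MVT> LT =
      TLI.getTypeLegalizationCost(DL, Ty);
  return LT.first * PerRegCost;
}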
llvm::TargetRegisterInfo::getRegSizeInBits
unsigned getRegSizeInBits(const TargetRegisterClass &RC) const
Return the size in bits of a register from class RC.
Definition: TargetRegisterInfo.h:276
llvm::GCNSubtarget::hasFullRate64Ops
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:306
llvm::GCNTTIImpl::collectFlatAddressOperands
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: AMDGPUTargetTransformInfo.cpp:995
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:325
llvm::GCNTTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:366
llvm::ExtractValueInst
This instruction extracts a struct member or array element value from an aggregate value.
Definition: Instructions.h:2395
llvm::GCNTTIImpl::getStoreVectorFactor
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:356
llvm::TypeSize
Definition: TypeSize.h:416
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::LinearPolySize< TypeSize >::getScalable
static TypeSize getScalable(ScalarTy MinVal)
Definition: TypeSize.h:286
llvm::GCNTTIImpl::adjustInliningThreshold
unsigned adjustInliningThreshold(const CallBase *CB) const
Definition: AMDGPUTargetTransformInfo.cpp:1147
llvm::GCNTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:516
llvm::SDValue
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
Definition: SelectionDAGNodes.h:137
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition: TargetTransformInfo.h:911
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::InstructionCost::getInvalid
static InstructionCost getInvalid(CostType Val=0)
Definition: InstructionCost.h:73
llvm::GCNTTIImpl::rewriteIntrinsicWithAddressSpace
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: AMDGPUTargetTransformInfo.cpp:1012
llvm::InlineAsm::isOutput
@ isOutput
Definition: InlineAsm.h:95
llvm::FPOpFusion::Fast
@ Fast
Definition: TargetOptions.h:37
llvm::BasicTTIImplBase< GCNTTIImpl >::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
Definition: BasicTTIImpl.h:2155
llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::ISD::FSUB
@ FSUB
Definition: ISDOpcodes.h:378
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:149
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:657
llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition: TargetTransformInfo.h:911
llvm::ISD::FREM
@ FREM
Definition: ISDOpcodes.h:381
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:54
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1343
llvm::BasicTTIImplBase< GCNTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: BasicTTIImpl.h:872
llvm::BasicTTIImplBase< GCNTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1355
llvm::TargetTransformInfo::UnrollingPreferences::Threshold
unsigned Threshold
The cost threshold for the unrolled loop.
Definition: TargetTransformInfo.h:440
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::ISD::SRL
@ SRL
Definition: ISDOpcodes.h:659
llvm::AMDGPU::IsaInfo::getWavesPerEUForWorkGroup
unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize)
Definition: AMDGPUBaseInfo.cpp:572
llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:163
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
llvm::GCNTTIImpl::GCNTTIImpl
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:287
llvm::PHINode
Definition: Instructions.h:2648
Threshold
static cl::opt< unsigned > Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden)
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:43
llvm::MemIntrinsicInfo
Information about a load/store intrinsic defined by the target.
Definition: TargetTransformInfo.h:71
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1176
llvm::Type::getInt16Ty
static IntegerType * getInt16Ty(LLVMContext &C)
Definition: Type.cpp:240
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:401
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::DataLayout::getPointerSizeInBits
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:409
llvm::AMDGPU::HSAMD::Kernel::CodeProps::Key::NumVGPRs
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
Definition: AMDGPUMetadata.h:255
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:45
llvm::LoopBase::isLoopExiting
bool isLoopExiting(const BlockT *BB) const
True if the terminator in the block can branch to another block that is outside of the current loop.
Definition: LoopInfo.h:225
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1469
llvm::BasicTTIImplBase< GCNTTIImpl >::getTypeBasedIntrinsicInstrCost
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
Definition: BasicTTIImpl.h:1536
llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:866
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:172
llvm::GCNTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: AMDGPUTargetTransformInfo.cpp:1084
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::GCNTTIImpl::useGPUDivergenceAnalysis
bool useGPUDivergenceAnalysis() const
Definition: AMDGPUTargetTransformInfo.cpp:901
llvm::AllocaInst
An instruction to allocate memory on the stack.
Definition: Instructions.h:62
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:412
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1426
ArgAllocaCutoff
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
llvm::IntrinsicCostAttributes::isTypeBasedOnly
bool isTypeBasedOnly() const
Definition: TargetTransformInfo.h:157
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3083
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:89
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:55
UnrollRuntimeLocal
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
llvm::ExtractValueInst::getIndices
ArrayRef< unsigned > getIndices() const
Definition: Instructions.h:2460
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::GCNTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:685
llvm::IntrinsicCostAttributes::getArgs
const SmallVectorImpl< const Value * > & getArgs() const
Definition: TargetTransformInfo.h:154
AMDGPUTargetMachine.h
llvm::CallBase::args
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1334
llvm::GCNTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index)
Definition: AMDGPUTargetTransformInfo.cpp:828
llvm::TargetLowering::AsmOperandInfoVector
std::vector< AsmOperandInfo > AsmOperandInfoVector
Definition: TargetLowering.h:4279
llvm::GCNSubtarget::hasUsableDivScaleConditionOutput
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:404
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:503
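Illustrative: measure the in-memory footprint of a stack allocation against an assumed byte budget (the helper name and budget parameter are hypothetical):
#include <cstdint>
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
// Hypothetical helper: true if the alloca's allocated type, including
// alignment padding, fits within BudgetBytes.
static bool allocaFitsBudget(const llvm::AllocaInst &AI,
                             const llvm::DataLayout &DL, uint64_t BudgetBytes) {
  return DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize() <= BudgetBytes;
}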
llvm::ISD::FDIV
@ FDIV
Definition: ISDOpcodes.h:380
llvm::GCNTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:1183