LLVM  16.0.0git
AMDGPUTargetTransformInfo.cpp
1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUTargetMachine.h"
20 #include "llvm/Analysis/LoopInfo.h"
21 #include "llvm/Analysis/ValueTracking.h"
22 #include "llvm/IR/IRBuilder.h"
23 #include "llvm/IR/IntrinsicsAMDGPU.h"
24 #include "llvm/IR/PatternMatch.h"
25 #include "llvm/Support/KnownBits.h"
26 #include <optional>
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "AMDGPUtti"
31 
32 static cl::opt<unsigned> UnrollThresholdPrivate(
33  "amdgpu-unroll-threshold-private",
34  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
35  cl::init(2700), cl::Hidden);
36 
37 static cl::opt<unsigned> UnrollThresholdLocal(
38  "amdgpu-unroll-threshold-local",
39  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
40  cl::init(1000), cl::Hidden);
41 
42 static cl::opt<unsigned> UnrollThresholdIf(
43  "amdgpu-unroll-threshold-if",
44  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
45  cl::init(200), cl::Hidden);
46 
47 static cl::opt<bool> UnrollRuntimeLocal(
48  "amdgpu-unroll-runtime-local",
49  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
50  cl::init(true), cl::Hidden);
51 
52 static cl::opt<bool> UseLegacyDA(
53  "amdgpu-use-legacy-divergence-analysis",
54  cl::desc("Enable legacy divergence analysis for AMDGPU"),
55  cl::init(false), cl::Hidden);
56 
57 static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
58  "amdgpu-unroll-max-block-to-analyze",
59  cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
60  cl::init(32), cl::Hidden);
61 
62 static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
63  cl::Hidden, cl::init(4000),
64  cl::desc("Cost of alloca argument"));
65 
66 // If the amount of scratch memory to eliminate exceeds our ability to allocate
67 // it into registers, we gain nothing by aggressively inlining functions for
68 // that heuristic.
69 static cl::opt<unsigned>
70  ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
71  cl::init(256),
72  cl::desc("Maximum alloca size to use for inline cost"));
73 
74 // Inliner constraint to achieve reasonable compilation time.
75 static cl::opt<size_t> InlineMaxBB(
76  "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
77  cl::desc("Maximum number of BBs allowed in a function after inlining"
78  " (compile time constraint)"));
79 
80 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
81  unsigned Depth = 0) {
82  const Instruction *I = dyn_cast<Instruction>(Cond);
83  if (!I)
84  return false;
85 
86  for (const Value *V : I->operand_values()) {
87  if (!L->contains(I))
88  continue;
89  if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
90  if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
91  return SubLoop->contains(PHI); }))
92  return true;
93  } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
94  return true;
95  }
96  return false;
97 }
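// For instance, a loop body of the form "if (phi_value > 0) { ... }", where the
// condition feeds (possibly through intermediate instructions, up to the depth
// limit of 10 above) from a PHI of this loop rather than of a subloop,
// satisfies this predicate; getUnrollingPreferences() below uses it to award a
// per-"if" unroll bonus, since unrolling can fold away both the if-region and
// the PHI.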
98 
99 AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
100  : BaseT(TM, F.getParent()->getDataLayout()),
101  TargetTriple(TM->getTargetTriple()),
102  ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
103  TLI(ST->getTargetLowering()) {}
104 
105 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
106  TTI::UnrollingPreferences &UP,
107  OptimizationRemarkEmitter *ORE) {
108  const Function &F = *L->getHeader()->getParent();
109  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
110  UP.MaxCount = std::numeric_limits<unsigned>::max();
111  UP.Partial = true;
112 
113  // A conditional branch in a loop back edge needs about 3 additional exec
114  // manipulations on average.
115  UP.BEInsns += 3;
116 
117  // TODO: Do we want runtime unrolling?
118 
119  // Maximum alloca size that can fit into registers. Reserve 16 registers.
120  const unsigned MaxAlloca = (256 - 16) * 4;
121  unsigned ThresholdPrivate = UnrollThresholdPrivate;
122  unsigned ThresholdLocal = UnrollThresholdLocal;
123 
124  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
125  // provided threshold value as the default for Threshold
126  if (MDNode *LoopUnrollThreshold =
127  findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
128  if (LoopUnrollThreshold->getNumOperands() == 2) {
129  ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
130  LoopUnrollThreshold->getOperand(1));
131  if (MetaThresholdValue) {
132  // We will also use the supplied value for PartialThreshold for now.
133  // We may introduce additional metadata if it becomes necessary in the
134  // future.
135  UP.Threshold = MetaThresholdValue->getSExtValue();
136  UP.PartialThreshold = UP.Threshold;
137  ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
138  ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
139  }
140  }
141  }
142 
143  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
144  for (const BasicBlock *BB : L->getBlocks()) {
145  const DataLayout &DL = BB->getModule()->getDataLayout();
146  unsigned LocalGEPsSeen = 0;
147 
148  if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
149  return SubLoop->contains(BB); }))
150  continue; // Block belongs to an inner loop.
151 
152  for (const Instruction &I : *BB) {
153  // Unroll a loop which contains an "if" statement whose condition is
154  // defined by a PHI belonging to the loop. This may help to eliminate the
155  // if region and potentially even the PHI itself, saving on both divergence
156  // and the registers used for the PHI.
157  // Add a small bonus for each such "if" statement.
158  if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
159  if (UP.Threshold < MaxBoost && Br->isConditional()) {
160  BasicBlock *Succ0 = Br->getSuccessor(0);
161  BasicBlock *Succ1 = Br->getSuccessor(1);
162  if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
163  (L->contains(Succ1) && L->isLoopExiting(Succ1)))
164  continue;
165  if (dependsOnLocalPhi(L, Br->getCondition())) {
166  UP.Threshold += UnrollThresholdIf;
167  LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
168  << " for loop:\n"
169  << *L << " due to " << *Br << '\n');
170  if (UP.Threshold >= MaxBoost)
171  return;
172  }
173  }
174  continue;
175  }
176 
177  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
178  if (!GEP)
179  continue;
180 
181  unsigned AS = GEP->getAddressSpace();
182  unsigned Threshold = 0;
183  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
184  Threshold = ThresholdPrivate;
185  else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
186  Threshold = ThresholdLocal;
187  else
188  continue;
189 
190  if (UP.Threshold >= Threshold)
191  continue;
192 
193  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
194  const Value *Ptr = GEP->getPointerOperand();
195  const AllocaInst *Alloca =
196  dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
197  if (!Alloca || !Alloca->isStaticAlloca())
198  continue;
199  Type *Ty = Alloca->getAllocatedType();
200  unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
201  if (AllocaSize > MaxAlloca)
202  continue;
203  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
204  AS == AMDGPUAS::REGION_ADDRESS) {
205  LocalGEPsSeen++;
206  // Inhibit unrolling for local memory if we have seen addressing that is not
207  // based on a variable; most likely we will be unable to combine it.
208  // Do not unroll deeply nested inner loops for local memory, to give an outer
209  // loop a chance to be unrolled for a more important reason.
210  if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
211  (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
212  !isa<Argument>(GEP->getPointerOperand())))
213  continue;
214  LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
215  << *L << " due to LDS use.\n");
216  UP.Runtime = UnrollRuntimeLocal;
217  }
218 
219  // Check if GEP depends on a value defined by this loop itself.
220  bool HasLoopDef = false;
221  for (const Value *Op : GEP->operands()) {
222  const Instruction *Inst = dyn_cast<Instruction>(Op);
223  if (!Inst || L->isLoopInvariant(Op))
224  continue;
225 
226  if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
227  return SubLoop->contains(Inst); }))
228  continue;
229  HasLoopDef = true;
230  break;
231  }
232  if (!HasLoopDef)
233  continue;
234 
235  // We want to do whatever we can to limit the number of alloca
236  // instructions that make it through to the code generator. allocas
237  // require us to use indirect addressing, which is slow and prone to
238  // compiler bugs. If this loop does an address calculation on an
239  // alloca ptr, then we want to use a higher than normal loop unroll
240  // threshold. This will give SROA a better chance to eliminate these
241  // allocas.
242  //
243  // We also want to have more unrolling for local memory to let ds
244  // instructions with different offsets combine.
245  //
246  // Don't use the maximum allowed value here as it will make some
247  // programs way too big.
248  UP.Threshold = Threshold;
249  LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
250  << " for loop:\n"
251  << *L << " due to " << *GEP << '\n');
252  if (UP.Threshold >= MaxBoost)
253  return;
254  }
255 
256  // If we got a GEP in a small BB from an inner loop then increase the max
257  // trip count to analyze, for a better unrolling cost estimate.
258  if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
259  UP.MaxIterationsCountToAnalyze = 32;
260  }
261 }
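// Worked example for the heuristic above: a single-block innermost loop whose
// GEP indexes a 64-byte static alloca (private address space) with a value
// computed in the loop gets its threshold raised to UnrollThresholdPrivate
// (2700 by default), while a loop indexing LDS through a GlobalVariable or
// Argument pointer gets UnrollThresholdLocal (1000) and, with UnrollRuntimeLocal
// left at its default, runtime unrolling enabled. Any
// "amdgpu.loop.unroll.threshold" metadata on the loop caps both values.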
262 
263 void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
264  TTI::PeelingPreferences &PP) {
265  BaseT::getPeelingPreferences(L, SE, PP);
266 }
267 
268 const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
269  // Codegen control options which don't matter.
270  AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
271  AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
272  AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
273  AMDGPU::FeatureUnalignedAccessMode,
274 
275  AMDGPU::FeatureAutoWaitcntBeforeBarrier,
276 
277  // Property of the kernel/environment which can't actually differ.
278  AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
279  AMDGPU::FeatureTrapHandler,
280 
281  // The default assumption needs to be ecc is enabled, but no directly
282  // exposed operations depend on it, so it can be safely inlined.
283  AMDGPU::FeatureSRAMECC,
284 
285  // Perf-tuning features
286  AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
287 
288 GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
289  : BaseT(TM, F.getParent()->getDataLayout()),
290  ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
291  TLI(ST->getTargetLowering()), CommonTTI(TM, F),
292  IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
293  AMDGPU::SIModeRegisterDefaults Mode(F);
294  HasFP32Denormals = Mode.allFP32Denormals();
295  HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
296 }
297 
298 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
299  // NB: RCID is not really an RCID. In fact it is 0 or 1 for scalar or vector
300  // registers. See getRegisterClassForType for the implementation.
301  // In this case vector registers are not vectors in terms of
302  // VGPRs, but registers which can hold multiple values.
303 
304  // This is really the number of registers to fill when vectorizing /
305  // interleaving loops, so we lie to avoid trying to use all registers.
306  return 4;
307 }
308 
309 TypeSize
310 GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
311  switch (K) {
312  case TargetTransformInfo::RGK_Scalar:
313  return TypeSize::getFixed(32);
314  case TargetTransformInfo::RGK_FixedWidthVector:
315  return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
316  case TargetTransformInfo::RGK_ScalableVector:
317  return TypeSize::getScalable(0);
318  }
319  llvm_unreachable("Unsupported register kind");
320 }
321 
322 unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
323  return 32;
324 }
325 
326 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
327  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
328  return 32 * 4 / ElemWidth;
329  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
330  : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
331  : 1;
332 }
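// Illustrative values: loads and stores of 8-bit elements report 32 * 4 / 8 = 16
// and 32-bit elements report 4. For arithmetic, 16-bit element types get a VF
// of 2 when the subtarget has 16-bit instructions, f32 gets 2 only with packed
// FP32 ops (e.g. gfx90a), and everything else stays scalar.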
333 
334 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
335  unsigned ChainSizeInBytes,
336  VectorType *VecTy) const {
337  unsigned VecRegBitWidth = VF * LoadSize;
338  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
339  // TODO: Support element-size less than 32bit?
340  return 128 / LoadSize;
341 
342  return VF;
343 }
344 
345 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
346  unsigned ChainSizeInBytes,
347  VectorType *VecTy) const {
348  unsigned VecRegBitWidth = VF * StoreSize;
349  if (VecRegBitWidth > 128)
350  return 128 / StoreSize;
351 
352  return VF;
353 }
354 
355 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
356  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
357  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
358  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
359  AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
360  return 512;
361  }
362 
363  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
364  return 8 * ST->getMaxPrivateElementSize();
365 
366  // Common to flat, global, local and region. Assume for unknown addrspace.
367  return 128;
368 }
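// In other words: global, constant and buffer-fat-pointer accesses may be
// vectorized up to 512 bits (a 16-dword load), private (scratch) accesses up to
// 8x the subtarget's max private element size (e.g. 128 bits when that size is
// 16 bytes), and everything else - flat, local, region or unknown - up to 128 bits.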
369 
370 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
371  Align Alignment,
372  unsigned AddrSpace) const {
373  // We allow vectorization of flat stores, even though we may need to decompose
374  // them later if they may access private memory. We don't have enough context
375  // here, and legalization can handle it.
376  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
377  return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
378  ChainSizeInBytes <= ST->getMaxPrivateElementSize();
379  }
380  return true;
381 }
382 
383 bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
384  Align Alignment,
385  unsigned AddrSpace) const {
386  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
387 }
388 
389 bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
390  Align Alignment,
391  unsigned AddrSpace) const {
392  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
393 }
394 
395 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
396 // iteration. Should we report a larger size and let it legalize?
397 //
398 // FIXME: Should we use narrower types for local/region, or account for when
399 // unaligned access is legal?
400 //
401 // FIXME: This could use fine tuning and microbenchmarks.
402 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
403  LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
404  unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
405  std::optional<uint32_t> AtomicElementSize) const {
406 
407  if (AtomicElementSize)
408  return Type::getIntNTy(Context, *AtomicElementSize * 8);
409 
410  unsigned MinAlign = std::min(SrcAlign, DestAlign);
411 
412  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
413  // hardware into byte accesses. If you assume all alignments are equally
414  // probable, it's more efficient on average to use short accesses for this
415  // case.
416  if (MinAlign == 2)
417  return Type::getInt16Ty(Context);
418 
419  // Not all subtargets have 128-bit DS instructions, and we currently don't
420  // form them by default.
421  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
422  SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
423  DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
424  DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
425  return FixedVectorType::get(Type::getInt32Ty(Context), 2);
426  }
427 
428  // Global memory works best with 16-byte accesses. Private memory will also
429  // hit this, although they'll be decomposed.
430  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
431 }
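// Summary of the selection above: an explicit atomic element size wins; a
// 2-byte-aligned copy is lowered with i16 accesses; copies touching LDS or GDS
// use <2 x i32> (64-bit ds accesses); all remaining cases, e.g. an aligned
// global-to-global memcpy, use <4 x i32> for 16-byte-wide accesses.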
432 
433 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
434  SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
435  unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
436  unsigned SrcAlign, unsigned DestAlign,
437  std::optional<uint32_t> AtomicCpySize) const {
438  assert(RemainingBytes < 16);
439 
440  if (AtomicCpySize)
441  return BaseT::getMemcpyLoopResidualLoweringType(
442  OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
443  DestAlign, AtomicCpySize);
444 
445  unsigned MinAlign = std::min(SrcAlign, DestAlign);
446 
447  if (MinAlign != 2) {
448  Type *I64Ty = Type::getInt64Ty(Context);
449  while (RemainingBytes >= 8) {
450  OpsOut.push_back(I64Ty);
451  RemainingBytes -= 8;
452  }
453 
454  Type *I32Ty = Type::getInt32Ty(Context);
455  while (RemainingBytes >= 4) {
456  OpsOut.push_back(I32Ty);
457  RemainingBytes -= 4;
458  }
459  }
460 
461  Type *I16Ty = Type::getInt16Ty(Context);
462  while (RemainingBytes >= 2) {
463  OpsOut.push_back(I16Ty);
464  RemainingBytes -= 2;
465  }
466 
467  Type *I8Ty = Type::getInt8Ty(Context);
468  while (RemainingBytes) {
469  OpsOut.push_back(I8Ty);
470  --RemainingBytes;
471  }
472 }
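// For example, with 4-byte alignment and 7 residual bytes this emits
// i32 + i16 + i8; with 2-byte alignment the i64/i32 steps are skipped and the
// same 7 bytes become i16 + i16 + i16 + i8.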
473 
474 unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
475  // Disable unrolling if the loop is not vectorized.
476  // TODO: Enable this again.
477  if (VF == 1)
478  return 1;
479 
480  return 8;
481 }
482 
483 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
484  MemIntrinsicInfo &Info) const {
485  switch (Inst->getIntrinsicID()) {
486  case Intrinsic::amdgcn_atomic_inc:
487  case Intrinsic::amdgcn_atomic_dec:
488  case Intrinsic::amdgcn_ds_ordered_add:
489  case Intrinsic::amdgcn_ds_ordered_swap:
490  case Intrinsic::amdgcn_ds_fadd:
491  case Intrinsic::amdgcn_ds_fmin:
492  case Intrinsic::amdgcn_ds_fmax: {
493  auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
494  auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
495  if (!Ordering || !Volatile)
496  return false; // Invalid.
497 
498  unsigned OrderingVal = Ordering->getZExtValue();
499  if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
500  return false;
501 
502  Info.PtrVal = Inst->getArgOperand(0);
503  Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
504  Info.ReadMem = true;
505  Info.WriteMem = true;
506  Info.IsVolatile = !Volatile->isZero();
507  return true;
508  }
509  default:
510  return false;
511  }
512 }
513 
514 InstructionCost GCNTTIImpl::getArithmeticInstrCost(
515  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
516  TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
517  ArrayRef<const Value *> Args,
518  const Instruction *CxtI) {
519 
520  // Legalize the type.
521  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
522  int ISD = TLI->InstructionOpcodeToISD(Opcode);
523 
524  // Because we don't have any legal vector operations, but the legal types, we
525  // need to account for split vectors.
526  unsigned NElts = LT.second.isVector() ?
527  LT.second.getVectorNumElements() : 1;
528 
529  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
530 
531  switch (ISD) {
532  case ISD::SHL:
533  case ISD::SRL:
534  case ISD::SRA:
535  if (SLT == MVT::i64)
536  return get64BitInstrCost(CostKind) * LT.first * NElts;
537 
538  if (ST->has16BitInsts() && SLT == MVT::i16)
539  NElts = (NElts + 1) / 2;
540 
541  // i32
542  return getFullRateInstrCost() * LT.first * NElts;
543  case ISD::ADD:
544  case ISD::SUB:
545  case ISD::AND:
546  case ISD::OR:
547  case ISD::XOR:
548  if (SLT == MVT::i64) {
549  // and, or and xor are typically split into 2 VALU instructions.
550  return 2 * getFullRateInstrCost() * LT.first * NElts;
551  }
552 
553  if (ST->has16BitInsts() && SLT == MVT::i16)
554  NElts = (NElts + 1) / 2;
555 
556  return LT.first * NElts * getFullRateInstrCost();
557  case ISD::MUL: {
558  const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
559  if (SLT == MVT::i64) {
560  const int FullRateCost = getFullRateInstrCost();
561  return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
562  }
563 
564  if (ST->has16BitInsts() && SLT == MVT::i16)
565  NElts = (NElts + 1) / 2;
566 
567  // i32
568  return QuarterRateCost * NElts * LT.first;
569  }
570  case ISD::FMUL:
571  // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
572  // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
573  // fused operation.
574  if (CxtI && CxtI->hasOneUse())
575  if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
576  const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
577  if (OPC == ISD::FADD || OPC == ISD::FSUB) {
578  if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
579  return TargetTransformInfo::TCC_Free;
580  if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
581  return TargetTransformInfo::TCC_Free;
582 
583  // Estimate all types may be fused with contract/unsafe flags
584  const TargetOptions &Options = TLI->getTargetMachine().Options;
585  if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
586  Options.UnsafeFPMath ||
587  (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
588  return TargetTransformInfo::TCC_Free;
589  }
590  }
591  [[fallthrough]];
592  case ISD::FADD:
593  case ISD::FSUB:
594  if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
595  NElts = (NElts + 1) / 2;
596  if (SLT == MVT::f64)
597  return LT.first * NElts * get64BitInstrCost(CostKind);
598 
599  if (ST->has16BitInsts() && SLT == MVT::f16)
600  NElts = (NElts + 1) / 2;
601 
602  if (SLT == MVT::f32 || SLT == MVT::f16)
603  return LT.first * NElts * getFullRateInstrCost();
604  break;
605  case ISD::FDIV:
606  case ISD::FREM:
607  // FIXME: frem should be handled separately. The fdiv in it is most of it,
608  // but the current lowering is also not entirely correct.
609  if (SLT == MVT::f64) {
610  int Cost = 7 * get64BitInstrCost(CostKind) +
611  getQuarterRateInstrCost(CostKind) +
612  3 * getHalfRateInstrCost(CostKind);
613  // Add cost of workaround.
614  if (!ST->hasUsableDivScaleConditionOutput())
615  Cost += 3 * getFullRateInstrCost();
616 
617  return LT.first * Cost * NElts;
618  }
619 
620  if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
621  // TODO: This is more complicated, unsafe flags etc.
622  if ((SLT == MVT::f32 && !HasFP32Denormals) ||
623  (SLT == MVT::f16 && ST->has16BitInsts())) {
624  return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
625  }
626  }
627 
628  if (SLT == MVT::f16 && ST->has16BitInsts()) {
629  // 2 x v_cvt_f32_f16
630  // f32 rcp
631  // f32 fmul
632  // v_cvt_f16_f32
633  // f16 div_fixup
634  int Cost =
635  4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
636  return LT.first * Cost * NElts;
637  }
638 
639  if (SLT == MVT::f32 || SLT == MVT::f16) {
640  // 4 more v_cvt_* insts without f16 insts support
641  int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
642  1 * getQuarterRateInstrCost(CostKind);
643 
644  if (!HasFP32Denormals) {
645  // FP mode switches.
646  Cost += 2 * getFullRateInstrCost();
647  }
648 
649  return LT.first * NElts * Cost;
650  }
651  break;
652  case ISD::FNEG:
653  // Use the backend's estimation. If fneg is not free, each element will cost
654  // one additional instruction.
655  return TLI->isFNegFree(SLT) ? 0 : NElts;
656  default:
657  break;
658  }
659 
660  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
661  Args, CxtI);
662 }
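// A few concrete costs implied by the table above: a <2 x i16> add on a
// subtarget with 16-bit instructions legalizes to one packed operation, so
// NElts collapses to 1 and the cost is a single full-rate instruction; a scalar
// i64 mul is charged 4 quarter-rate plus 4 full-rate instructions; and an f32
// fmul whose only use is a contractable fadd/fsub can be reported as free
// (TCC_Free) because the pair is expected to fuse into a mad/fma.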
663 
664 // Return true if there's a potential benefit from using v2f16/v2i16
665 // instructions for an intrinsic, even if it requires nontrivial legalization.
666 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
667  switch (ID) {
668  case Intrinsic::fma: // TODO: fmuladd
669  // There's a small benefit to using vector ops in the legalized code.
670  case Intrinsic::round:
671  case Intrinsic::uadd_sat:
672  case Intrinsic::usub_sat:
673  case Intrinsic::sadd_sat:
674  case Intrinsic::ssub_sat:
675  return true;
676  default:
677  return false;
678  }
679 }
680 
681 InstructionCost
682 GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
683  TTI::TargetCostKind CostKind) {
684  if (ICA.getID() == Intrinsic::fabs)
685  return 0;
686 
687  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
688  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
689 
690  Type *RetTy = ICA.getReturnType();
691 
692  // Legalize the type.
693  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
694 
695  unsigned NElts = LT.second.isVector() ?
696  LT.second.getVectorNumElements() : 1;
697 
698  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
699 
700  if (SLT == MVT::f64)
701  return LT.first * NElts * get64BitInstrCost(CostKind);
702 
703  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
704  (ST->hasPackedFP32Ops() && SLT == MVT::f32))
705  NElts = (NElts + 1) / 2;
706 
707  // TODO: Get more refined intrinsic costs?
708  unsigned InstRate = getQuarterRateInstrCost(CostKind);
709 
710  switch (ICA.getID()) {
711  case Intrinsic::fma:
712  InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
713  : getQuarterRateInstrCost(CostKind);
714  break;
715  case Intrinsic::uadd_sat:
716  case Intrinsic::usub_sat:
717  case Intrinsic::sadd_sat:
718  case Intrinsic::ssub_sat:
719  static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
720  if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
721  NElts = 1;
722  break;
723  }
724 
725  return LT.first * NElts * InstRate;
726 }
727 
728 InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
729  TTI::TargetCostKind CostKind,
730  const Instruction *I) {
731  assert((I == nullptr || I->getOpcode() == Opcode) &&
732  "Opcode should reflect passed instruction.");
733  const bool SCost =
734  (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
735  const int CBrCost = SCost ? 5 : 7;
736  switch (Opcode) {
737  case Instruction::Br: {
738  // Branch instruction takes about 4 slots on gfx900.
739  auto BI = dyn_cast_or_null<BranchInst>(I);
740  if (BI && BI->isUnconditional())
741  return SCost ? 1 : 4;
742  // Suppose a conditional branch takes an additional 3 exec-manipulation
743  // instructions on average.
744  return CBrCost;
745  }
746  case Instruction::Switch: {
747  auto SI = dyn_cast_or_null<SwitchInst>(I);
748  // Each case (including the default) takes 1 cmp + 1 cbr instruction on
749  // average.
750  return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
751  }
752  case Instruction::Ret:
753  return SCost ? 1 : 10;
754  }
755  return BaseT::getCFInstrCost(Opcode, CostKind, I);
756 }
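// With the default (latency/throughput oriented) cost kinds this prices an
// unconditional branch at 4, a conditional branch at 7, a return at 10 and a
// switch with N cases at (N + 1) * 8; for the size-oriented cost kinds
// (TCK_CodeSize, TCK_SizeAndLatency) the values drop to 1, 5, 1 and (N + 1) * 6.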
757 
758 InstructionCost
759 GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
760  std::optional<FastMathFlags> FMF,
761  TTI::TargetCostKind CostKind) {
762  if (TTI::requiresOrderedReduction(FMF))
763  return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
764 
765  EVT OrigTy = TLI->getValueType(DL, Ty);
766 
767  // Computes the cost on targets that have packed math instructions (which
768  // support 16-bit types only).
769  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
770  return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
771 
772  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
773  return LT.first * getFullRateInstrCost();
774 }
775 
776 InstructionCost
777 GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
778  bool IsUnsigned,
779  TTI::TargetCostKind CostKind) {
780  EVT OrigTy = TLI->getValueType(DL, Ty);
781 
782  // Computes the cost on targets that have packed math instructions (which
783  // support 16-bit types only).
784  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
785  return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
786 
787  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
788  return LT.first * getHalfRateInstrCost(CostKind);
789 }
790 
791 InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
792  unsigned Index) {
793  switch (Opcode) {
794  case Instruction::ExtractElement:
795  case Instruction::InsertElement: {
796  unsigned EltSize
797  = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
798  if (EltSize < 32) {
799  if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
800  return 0;
801  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
802  }
803 
804  // Extracts are just reads of a subregister, so are free. Inserts are
805  // considered free because we don't want to have any cost for scalarizing
806  // operations, and we don't have to copy into a different register class.
807 
808  // Dynamic indexing isn't free and is best avoided.
809  return Index == ~0u ? 2 : 0;
810  }
811  default:
812  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
813  }
814 }
815 
816 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
817 /// this is analyzing the collective result of all output registers. Otherwise,
818 /// this is only querying a specific result index if this returns multiple
819 /// registers in a struct.
820 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
821  const CallInst *CI, ArrayRef<unsigned> Indices) const {
822  // TODO: Handle complex extract indices
823  if (Indices.size() > 1)
824  return true;
825 
826  const DataLayout &DL = CI->getModule()->getDataLayout();
827  const SIRegisterInfo *TRI = ST->getRegisterInfo();
828  TargetLowering::AsmOperandInfoVector TargetConstraints =
829  TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
830 
831  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
832 
833  int OutputIdx = 0;
834  for (auto &TC : TargetConstraints) {
835  if (TC.Type != InlineAsm::isOutput)
836  continue;
837 
838  // Skip outputs we don't care about.
839  if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
840  continue;
841 
842  TLI->ComputeConstraintToUse(TC, SDValue());
843 
844  const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
845  TRI, TC.ConstraintCode, TC.ConstraintVT).second;
846 
847  // For AGPR constraints null is returned on subtargets without AGPRs, so
848  // assume divergent for null.
849  if (!RC || !TRI->isSGPRClass(RC))
850  return true;
851  }
852 
853  return false;
854 }
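// In practice this means an inline asm result constrained to an SGPR (e.g. an
// "=s" output) is treated as uniform, while "=v" (VGPR) and "=a" (AGPR)
// outputs, or any constraint the subtarget cannot map to a register class, are
// treated as divergent.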
855 
856 /// \returns true if the new GPU divergence analysis is enabled.
857 bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
858  return !UseLegacyDA;
859 }
860 
861 bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
862  const IntrinsicInst *ReadReg) const {
863  Metadata *MD =
864  cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
865  StringRef RegName =
866  cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
867 
868  // Special case registers that look like VCC.
869  MVT VT = MVT::getVT(ReadReg->getType());
870  if (VT == MVT::i1)
871  return true;
872 
873  // Special case scalar registers that start with 'v'.
874  if (RegName.startswith("vcc") || RegName.empty())
875  return false;
876 
877  // VGPR or AGPR is divergent. There aren't any specially named vector
878  // registers.
879  return RegName[0] == 'v' || RegName[0] == 'a';
880 }
881 
882 /// \returns true if the result of the value could potentially be
883 /// different across workitems in a wavefront.
884 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
885  if (const Argument *A = dyn_cast<Argument>(V))
886  return !AMDGPU::isArgPassedInSGPR(A);
887 
888  // Loads from the private and flat address spaces are divergent, because
889  // threads can execute the load instruction with the same inputs and get
890  // different results.
891  //
892  // All other loads are not divergent, because if threads issue loads with the
893  // same arguments, they will always get the same result.
894  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
895  return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
896  Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
897 
898  // Atomics are divergent because they are executed sequentially: when an
899  // atomic operation refers to the same address in each thread, then each
900  // thread after the first sees the value written by the previous thread as
901  // the original value.
902  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
903  return true;
904 
905  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
906  if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
907  return isReadRegisterSourceOfDivergence(Intrinsic);
908 
909  return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
910  }
911 
912  // Assume all function calls are a source of divergence.
913  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
914  if (CI->isInlineAsm())
915  return isInlineAsmSourceOfDivergence(CI);
916  return true;
917  }
918 
919  // Assume all function calls are a source of divergence.
920  if (isa<InvokeInst>(V))
921  return true;
922 
923  return false;
924 }
925 
926 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
927  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
928  switch (Intrinsic->getIntrinsicID()) {
929  default:
930  return false;
931  case Intrinsic::amdgcn_readfirstlane:
932  case Intrinsic::amdgcn_readlane:
933  case Intrinsic::amdgcn_icmp:
934  case Intrinsic::amdgcn_fcmp:
935  case Intrinsic::amdgcn_ballot:
936  case Intrinsic::amdgcn_if_break:
937  return true;
938  }
939  }
940 
941  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
942  if (CI->isInlineAsm())
943  return !isInlineAsmSourceOfDivergence(CI);
944  return false;
945  }
946 
947  // In most cases TID / wavefrontsize is uniform.
948  //
949  // However, if a kernel has uneven dimensions we can have a value of
950  // workitem-id-x divided by the wavefrontsize non-uniform. For example
951  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
952  // packed into the same wave, which gives 1 and 0 after the division by 64
953  // respectively.
954  //
955  // FIXME: limit it to 1D kernels only, although it should be possible
956  // to perform this optimization if the size of the X dimension is a power
957  // of 2; we just do not currently have the infrastructure to query it.
958  using namespace llvm::PatternMatch;
959  uint64_t C;
960  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
961  m_ConstantInt(C))) ||
962  match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
963  m_ConstantInt(C)))) {
964  const Function *F = cast<Instruction>(V)->getFunction();
965  return C >= ST->getWavefrontSizeLog2() &&
966  ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
967  }
968 
969  Value *Mask;
970  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
971  m_Value(Mask)))) {
972  const Function *F = cast<Instruction>(V)->getFunction();
973  const DataLayout &DL = F->getParent()->getDataLayout();
974  return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
975  ST->getWavefrontSizeLog2() &&
976  ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
977  }
978 
979  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
980  if (!ExtValue)
981  return false;
982 
983  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
984  if (!CI)
985  return false;
986 
987  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
988  switch (Intrinsic->getIntrinsicID()) {
989  default:
990  return false;
991  case Intrinsic::amdgcn_if:
992  case Intrinsic::amdgcn_else: {
993  ArrayRef<unsigned> Indices = ExtValue->getIndices();
994  return Indices.size() == 1 && Indices[0] == 1;
995  }
996  }
997  }
998 
999  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1000  // divergent for the overall struct return. We need to override it in the
1001  // case we're extracting an SGPR component here.
1002  if (CI->isInlineAsm())
1003  return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1004 
1005  return false;
1006 }
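// Example of the pattern matches above: on a wave64 subtarget,
// "lshr i32 %id, 6" where %id is llvm.amdgcn.workitem.id.x (or an 'and' with a
// mask clearing at least the low 6 bits) is reported uniform, provided the
// kernel's Y and Z workgroup dimensions are known to be 1, matching the
// division-by-wavefront-size case described in the comment above.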
1007 
1008 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1009  Intrinsic::ID IID) const {
1010  switch (IID) {
1011  case Intrinsic::amdgcn_atomic_inc:
1012  case Intrinsic::amdgcn_atomic_dec:
1013  case Intrinsic::amdgcn_ds_fadd:
1014  case Intrinsic::amdgcn_ds_fmin:
1015  case Intrinsic::amdgcn_ds_fmax:
1016  case Intrinsic::amdgcn_is_shared:
1017  case Intrinsic::amdgcn_is_private:
1018  case Intrinsic::amdgcn_flat_atomic_fadd:
1019  case Intrinsic::amdgcn_flat_atomic_fmax:
1020  case Intrinsic::amdgcn_flat_atomic_fmin:
1021  OpIndexes.push_back(0);
1022  return true;
1023  default:
1024  return false;
1025  }
1026 }
1027 
1028 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1029  Value *OldV,
1030  Value *NewV) const {
1031  auto IntrID = II->getIntrinsicID();
1032  switch (IntrID) {
1033  case Intrinsic::amdgcn_atomic_inc:
1034  case Intrinsic::amdgcn_atomic_dec:
1035  case Intrinsic::amdgcn_ds_fadd:
1036  case Intrinsic::amdgcn_ds_fmin:
1037  case Intrinsic::amdgcn_ds_fmax: {
1038  const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
1039  if (!IsVolatile->isZero())
1040  return nullptr;
1041  Module *M = II->getParent()->getParent()->getParent();
1042  Type *DestTy = II->getType();
1043  Type *SrcTy = NewV->getType();
1044  Function *NewDecl =
1045  Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
1046  II->setArgOperand(0, NewV);
1047  II->setCalledFunction(NewDecl);
1048  return II;
1049  }
1050  case Intrinsic::amdgcn_is_shared:
1051  case Intrinsic::amdgcn_is_private: {
1052  unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1053  AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1054  unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1055  LLVMContext &Ctx = NewV->getType()->getContext();
1056  ConstantInt *NewVal = (TrueAS == NewAS) ?
1057  ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1058  return NewVal;
1059  }
1060  case Intrinsic::ptrmask: {
1061  unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1062  unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1063  Value *MaskOp = II->getArgOperand(1);
1064  Type *MaskTy = MaskOp->getType();
1065 
1066  bool DoTruncate = false;
1067 
1068  const GCNTargetMachine &TM =
1069  static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1070  if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1071  // All valid 64-bit to 32-bit casts work by chopping off the high
1072  // bits. Any masking only clearing the low bits will also apply in the new
1073  // address space.
1074  if (DL.getPointerSizeInBits(OldAS) != 64 ||
1075  DL.getPointerSizeInBits(NewAS) != 32)
1076  return nullptr;
1077 
1078  // TODO: Do we need to thread more context in here?
1079  KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
1080  if (Known.countMinLeadingOnes() < 32)
1081  return nullptr;
1082 
1083  DoTruncate = true;
1084  }
1085 
1086  IRBuilder<> B(II);
1087  if (DoTruncate) {
1088  MaskTy = B.getInt32Ty();
1089  MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1090  }
1091 
1092  return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1093  {NewV, MaskOp});
1094  }
1095  case Intrinsic::amdgcn_flat_atomic_fadd:
1096  case Intrinsic::amdgcn_flat_atomic_fmax:
1097  case Intrinsic::amdgcn_flat_atomic_fmin: {
1098  Module *M = II->getParent()->getParent()->getParent();
1099  Type *DestTy = II->getType();
1100  Type *SrcTy = NewV->getType();
1101  Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
1102  {DestTy, SrcTy, DestTy});
1103  II->setArgOperand(0, NewV);
1104  II->setCalledFunction(NewDecl);
1105  return II;
1106  }
1107  default:
1108  return nullptr;
1109  }
1110 }
1111 
1112 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1113  VectorType *VT, ArrayRef<int> Mask,
1114  TTI::TargetCostKind CostKind,
1115  int Index, VectorType *SubTp,
1116  ArrayRef<const Value *> Args) {
1117  Kind = improveShuffleKindFromMask(Kind, Mask);
1118  if (ST->hasVOP3PInsts()) {
1119  if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1120  DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1121  // With op_sel VOP3P instructions freely can access the low half or high
1122  // half of a register, so any swizzle is free.
1123 
1124  switch (Kind) {
1125  case TTI::SK_Broadcast:
1126  case TTI::SK_Reverse:
1127  case TTI::SK_PermuteSingleSrc:
1128  return 0;
1129  default:
1130  break;
1131  }
1132  }
1133  }
1134 
1135  return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1136 }
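// Consequence: on subtargets with VOP3P, broadcasting, reversing or arbitrarily
// permuting the two halves of a <2 x i16>/<2 x half> register is considered
// free, since op_sel lets a VALU operand read either half directly; larger or
// wider-element shuffles fall back to the generic cost model.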
1137 
1138 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1139  const Function *Callee) const {
1140  const TargetMachine &TM = getTLI()->getTargetMachine();
1141  const GCNSubtarget *CallerST
1142  = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1143  const GCNSubtarget *CalleeST
1144  = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1145 
1146  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1147  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1148 
1149  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1150  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1151  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1152  return false;
1153 
1154  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1155  // no way to support merge for backend defined attributes.
1156  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
1157  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
1158  if (!CallerMode.isInlineCompatible(CalleeMode))
1159  return false;
1160 
1161  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1162  Callee->hasFnAttribute(Attribute::InlineHint))
1163  return true;
1164 
1165  // Hack to make compile times reasonable.
1166  if (InlineMaxBB) {
1167  // Single BB does not increase total BB amount.
1168  if (Callee->size() == 1)
1169  return true;
1170  size_t BBSize = Caller->size() + Callee->size() - 1;
1171  return BBSize <= InlineMaxBB;
1172  }
1173 
1174  return true;
1175 }
1176 
1177 unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1178  // If a pointer to a private array is passed into a function,
1179  // it will not be optimized out, leaving scratch usage.
1180  // Increase the inline threshold to allow inlining in this case.
1181  uint64_t AllocaSize = 0;
1182  SmallPtrSet<const AllocaInst *, 8> AIVisited;
1183  for (Value *PtrArg : CB->args()) {
1184  PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1185  if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
1186  Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
1187  continue;
1188 
1189  PtrArg = getUnderlyingObject(PtrArg);
1190  if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
1191  if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1192  continue;
1193  AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1194  // If the amount of stack memory is excessive we will not be able
1195  // to get rid of the scratch anyway, bail out.
1196  if (AllocaSize > ArgAllocaCutoff) {
1197  AllocaSize = 0;
1198  break;
1199  }
1200  }
1201  }
1202  if (AllocaSize)
1203  return ArgAllocaCost;
1204  return 0;
1205 }
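// Example: a call site passing a pointer into a 64-byte static alloca in the
// private (or flat) address space gets the ArgAllocaCost bonus (4000 by
// default) so SROA can remove the scratch after inlining; once the combined
// static allocas exceed ArgAllocaCutoff (256 bytes by default) no bonus is
// given, since the scratch could not be promoted to registers anyway.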
1206 
1207 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1208  TTI::UnrollingPreferences &UP,
1209  OptimizationRemarkEmitter *ORE) {
1210  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1211 }
1212 
1213 void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1214  TTI::PeelingPreferences &PP) {
1215  CommonTTI.getPeelingPreferences(L, SE, PP);
1216 }
1217 
1218 int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1219  return ST->hasFullRate64Ops()
1220  ? getFullRateInstrCost()
1221  : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1222  : getQuarterRateInstrCost(CostKind);
1223 }
1224 
1225 std::pair<InstructionCost, MVT>
1226 GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1227  std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1228  auto Size = DL.getTypeSizeInBits(Ty);
1229  // The maximum load or store can handle 8 dwords for the scalar ALU and 4
1230  // for the vector ALU. Let's assume anything above 8 dwords is expensive
1231  // even if legal.
1232  if (Size <= 256)
1233  return Cost;
1234 
1235  Cost.first += (Size + 255) / 256;
1236  return Cost;
1237 }
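// For example, a 2048-bit vector type is charged an extra (2048 + 255) / 256 = 8
// units on top of the base legalization cost, reflecting that such a wide value
// has to be split across many 8-dword (256-bit) memory operations.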
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:240
UseLegacyDA
static cl::opt< bool > UseLegacyDA("amdgpu-use-legacy-divergence-analysis", cl::desc("Enable legacy divergence analysis for AMDGPU"), cl::init(false), cl::Hidden)
llvm::InstructionCost
Definition: InstructionCost.h:30
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::GCNTTIImpl::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:777
llvm::TargetTransformInfo::UnrollingPreferences::BEInsns
unsigned BEInsns
Definition: TargetTransformInfo.h:467
llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
llvm::TargetTransformInfoImplBase::getMemcpyLoopResidualLoweringType
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicCpySize) const
Definition: TargetTransformInfoImpl.h:751
llvm::Type::isSized
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:283
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:38
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:474
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:218
llvm::Loop::isLoopInvariant
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:442
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:69
llvm::GCNTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: AMDGPUTargetTransformInfo.cpp:1207
llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1748
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:667
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
PHI
Rewrite undef for PHI
Definition: AMDGPURewriteUndefForPHI.cpp:101
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:458
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:104
llvm::TargetOptions
Definition: TargetOptions.h:124
llvm::Function
Definition: Function.h:60
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:139
llvm::AMDGPUTargetLowering::isFNegFree
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
Definition: AMDGPUISelLowering.cpp:817
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
llvm::PatternMatch::m_LShr
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1117
llvm::CallBase::setCalledFunction
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1436
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:673
llvm::BasicTTIImplBase< GCNTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1132
llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1270
llvm::PatternMatch::m_FPOne
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:818
llvm::GCNTTIImpl::isSourceOfDivergence
bool isSourceOfDivergence(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:884
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:373
llvm::BasicTTIImplBase< GCNTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: BasicTTIImpl.h:939
llvm::CallBase::isInlineAsm
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition: InstrTypes.h:1465
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:729
llvm::IRBuilder<>
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:153
llvm::PointerType::getAddressSpace
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:682
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition: TargetTransformInfo.h:967
ValueTracking.h
llvm::AtomicOrdering::SequentiallyConsistent
@ SequentiallyConsistent
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:221
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:140
llvm::GCNTTIImpl::isLegalToVectorizeMemChain
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:370
llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:36
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:470
dependsOnLocalPhi
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
Definition: AMDGPUTargetTransformInfo.cpp:80
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::GCNTTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: AMDGPUTargetTransformInfo.cpp:474
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:529
llvm::BasicTTIImplBase< GCNTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:903
llvm::SITargetLowering::getRegForInlineAsmConstraint
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
Definition: SIISelLowering.cpp:12230
llvm::AMDGPU::SIModeRegisterDefaults
Definition: AMDGPUBaseInfo.h:1276
llvm::GCNTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: AMDGPUTargetTransformInfo.cpp:1138
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
llvm::PatternMatch::m_AShr
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1123
llvm::GCNTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(unsigned RCID) const
Definition: AMDGPUTargetTransformInfo.cpp:298
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
llvm::VectorType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:422
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
llvm::TargetLowering::ComputeConstraintToUse
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
Definition: TargetLowering.cpp:5665
llvm::max
Expected< ExpressionValue > max(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:337
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:119
llvm::AMDGPU::SIModeRegisterDefaults::isInlineCompatible
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Definition: AMDGPUBaseInfo.h:1361
llvm::PatternMatch::m_c_And
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
Definition: PatternMatch.h:2251
llvm::TargetTransformInfo::OperandValueInfo
Definition: TargetTransformInfo.h:926
llvm::GCNSubtarget::hasPackedFP32Ops
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:880
llvm::AMDGPU::isIntrinsicSourceOfDivergence
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
Definition: AMDGPUBaseInfo.cpp:2568
llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition: TargetTransformInfo.h:900
llvm::Type::getInt8Ty
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:237
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1628
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:239
llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:372
llvm::GCNSubtarget::getRegisterInfo
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:228
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:159
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2285
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:890
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPU.h:371
KnownBits.h
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
llvm::MVT::SimpleValueType
SimpleValueType
Definition: MachineValueType.h:33
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
Context
LLVMContext & Context
Definition: NVVMIntrRange.cpp:66
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::AllocaInst::isStaticAlloca
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Definition: Instructions.cpp:1508
llvm::LoopBase::getSubLoops
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
Definition: LoopInfo.h:160
llvm::MVT::i1
@ i1
Definition: MachineValueType.h:43
llvm::BasicTTIImplBase< GCNTTIImpl >::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:789
llvm::MinAlign
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition: MathExtras.h:600
llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:29
llvm::GCNSubtarget::hasFastFMAF32
bool hasFastFMAF32() const
Definition: GCNSubtarget.h:317
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::AllocaInst::getAllocatedType
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:115
llvm::BasicTTIImplBase< GCNTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1187
InlinePriorityMode::Cost
@ Cost
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:889
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:34
llvm::GCNTTIImpl::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:389
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:58
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:56
llvm::Instruction::hasAllowContract
bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
Definition: Instruction.cpp:316
llvm::GCNTTIImpl::getMemcpyLoopLoweringType
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicElementSize) const
Definition: AMDGPUTargetTransformInfo.cpp:402
llvm::GCNTTIImpl::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const
Definition: AMDGPUTargetTransformInfo.cpp:322
SI
@ SI
Definition: SIInstrInfo.cpp:7966
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:46
llvm::AMDGPUAS::BUFFER_FAT_POINTER
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
Definition: AMDGPU.h:381
llvm::ISD::SRA
@ SRA
Definition: ISDOpcodes.h:692
llvm::TargetTransformInfo::UnrollingPreferences::MaxIterationsCountToAnalyze
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
Definition: TargetTransformInfo.h:496
llvm::GCNSubtarget::getMaxPrivateElementSize
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:290
llvm::GCNTTIImpl::getLoadVectorFactor
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:334
llvm::dwarf::Index
Index
Definition: Dwarf.h:472
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:825
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:188
AMDGPU
Definition: AMDGPUReplaceLDSUseWithPointer.cpp:114
llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:147
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
Options
const char LLVMTargetMachineRef LLVMPassBuilderOptionsRef Options
Definition: PassBuilderBindings.cpp:48
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
Definition: ValueTracking.cpp:4499
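A minimal sketch of the underlying-object walk (the helper pointsIntoAlloca is hypothetical): strip GEP adjustments and casts from a pointer and test whether it originates from an alloca.
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Instructions.h"
  static bool pointsIntoAlloca(const llvm::Value *Ptr) {
    const llvm::Value *Base = llvm::getUnderlyingObject(Ptr);  // default MaxLookup
    return llvm::isa<llvm::AllocaInst>(Base);
  }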
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:30
llvm::GCNTTIImpl::isAlwaysUniform
bool isAlwaysUniform(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:926
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:666
llvm::AMDGPUSubtarget::has16BitInsts
bool has16BitInsts() const
Definition: AMDGPUSubtarget.h:146
PatternMatch.h
llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition: TargetTransformInfo.h:967
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
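A minimal sketch of building a fixed vector type (the function name makeV2I16 and the choice of <2 x i16> are illustrative), assuming an LLVMContext is available:
  #include "llvm/IR/DerivedTypes.h"
  static llvm::VectorType *makeV2I16(llvm::LLVMContext &Ctx) {
    // <2 x i16>, the packed form that 16-bit cost queries reason about.
    return llvm::FixedVectorType::get(llvm::Type::getInt16Ty(Ctx), /*NumElts=*/2);
  }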
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
llvm::AMDGPUTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:263
llvm::LinearPolySize< TypeSize >::getFixed
static TypeSize getFixed(ScalarTy MinVal)
Definition: TypeSize.h:283
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:100
llvm::CallingConv::ID
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:120
LoopInfo.h
InlineMaxBB
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
AMDGPUTargetTransformInfo.h
llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:891
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::AtomicOrdering
AtomicOrdering
Atomic ordering for LLVM's memory model.
Definition: AtomicOrdering.h:56
llvm::cl::opt
Definition: CommandLine.h:1412
llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:375
llvm::GCNTTIImpl::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
Definition: AMDGPUTargetTransformInfo.cpp:483
AMDGPUMCTargetDesc.h
llvm::KnownBits::countMinLeadingOnes
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition: KnownBits.h:248
uint64_t
llvm::AMDGPUTTIImpl::AMDGPUTTIImpl
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:99
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
const
aarch64 promote const
Definition: AArch64PromoteConstant.cpp:232
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
llvm::TargetLowering::ParseConstraints
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
Definition: TargetLowering.cpp:5316
llvm::AMDGPU::getIntegerAttribute
int getIntegerAttribute(const Function &F, StringRef Name, int Default)
Definition: AMDGPUBaseInfo.cpp:1139
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:417
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::GetElementPtrInst
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:929
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:447
llvm::AMDGPUSubtarget::hasMadMacF32Insts
bool hasMadMacF32Insts() const
Definition: AMDGPUSubtarget.h:156
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:632
llvm::BasicTTIImplBase< AMDGPUTTIImpl >
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
intrinsicHasPackedVectorBenefit
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
Definition: AMDGPUTargetTransformInfo.cpp:666
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:197
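A hedged sketch of a KnownBits query (the helper signBitKnownOne is hypothetical; it uses the value-returning overload of computeKnownBits rather than the out-parameter form shown above):
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/Support/KnownBits.h"
  static bool signBitKnownOne(const llvm::Value *V, const llvm::DataLayout &DL) {
    llvm::KnownBits Known = llvm::computeKnownBits(V, DL);
    return Known.countMinLeadingOnes() > 0;  // at least the sign bit is known to be one
  }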
llvm::TargetMachine::Options
TargetOptions Options
Definition: TargetMachine.h:118
IRBuilder.h
assert
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
llvm::LoopBase::getLoopDepth
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:97
Ptr
@ Ptr
Definition: TargetLibraryInfo.cpp:60
llvm::GCNSubtarget::hasUnalignedScratchAccess
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:535
Mode
SI Whole Quad Mode
Definition: SIWholeQuadMode.cpp:264
UnrollMaxBlockToAnalyze
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
llvm::MVT
Machine Value Type.
Definition: MachineValueType.h:31
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:66
llvm::MDNode
Metadata node.
Definition: Metadata.h:944
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
UnrollThresholdPrivate
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
llvm::GCNTargetMachine
Definition: AMDGPUTargetMachine.h:74
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:244
llvm::AMDGPUSubtarget::hasVOP3PInsts
bool hasVOP3PInsts() const
Definition: AMDGPUSubtarget.h:168
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1741
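A short sketch of the range-based wrapper (the helper blockContainsCall is hypothetical): any_of over a basic block's instructions, with no explicit begin/end iterators.
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instructions.h"
  static bool blockContainsCall(const llvm::BasicBlock &BB) {
    return llvm::any_of(BB, [](const llvm::Instruction &I) {
      return llvm::isa<llvm::CallInst>(I);
    });
  }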
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:49
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:138
llvm::AMDGPUTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: AMDGPUTargetTransformInfo.cpp:105
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:352
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:222
llvm::GCNTTIImpl::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:383
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::GCNTTIImpl::isInlineAsmSourceOfDivergence
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
Definition: AMDGPUTargetTransformInfo.cpp:820
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:805
llvm::GCNTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: AMDGPUTargetTransformInfo.cpp:1112
UnrollThresholdIf
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1785
llvm::AMDGPU::isGraphics
bool isGraphics(CallingConv::ID cc)
Definition: AMDGPUBaseInfo.cpp:1823
llvm::findOptionMDForLoop
MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition: LoopInfo.cpp:1043
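A small sketch of querying named loop metadata (the string "example.loop.option" is a placeholder, not an option this pass defines):
  #include "llvm/Analysis/LoopInfo.h"
  static bool hasLoopOption(const llvm::Loop *L) {
    return llvm::findOptionMDForLoop(L, "example.loop.option") != nullptr;
  }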
llvm::GCNTTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:310
llvm::ConstantInt::getSExtValue
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:148
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:597
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
llvm::ISD::FMUL
@ FMUL
Definition: ISDOpcodes.h:392
llvm::CallBase::setArgOperand
void setArgOperand(unsigned i, Value *v)
Definition: InstrTypes.h:1347
UnrollThresholdLocal
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1481
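A hedged sketch of materializing and calling an intrinsic declaration (the helper emitWorkitemIdX is hypothetical and the intrinsic is chosen only as an example), assuming an IRBuilder with a valid insertion point:
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/IntrinsicsAMDGPU.h"
  #include "llvm/IR/Module.h"
  static llvm::CallInst *emitWorkitemIdX(llvm::Module &M, llvm::IRBuilder<> &B) {
    llvm::Function *Decl =
        llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::amdgcn_workitem_id_x);
    return B.CreateCall(Decl);  // reuses an existing declaration if one is present
  }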
llvm::GCNTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:759
llvm::ConstantInt::getFalse
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:834
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:668
llvm::GCNTTIImpl::isReadRegisterSourceOfDivergence
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
Definition: AMDGPUTargetTransformInfo.cpp:861
Callee
Definition: AMDGPULibCalls.cpp:187
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:349
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:182
ArgAllocaCost
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
llvm::GCNTTIImpl::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
Definition: AMDGPUTargetTransformInfo.cpp:326
llvm::GCNTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:728
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:240
llvm::AMDGPU::isArgPassedInSGPR
bool isArgPassedInSGPR(const Argument *A)
Definition: AMDGPUBaseInfo.cpp:2377
llvm::ConstantInt::getTrue
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:827
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::getIntNTy
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:243
llvm::GCNSubtarget::hasFullRate64Ops
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:325
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:376
llvm::GCNTTIImpl::collectFlatAddressOperands
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: AMDGPUTargetTransformInfo.cpp:1008
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:348
llvm::GCNTTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:355
llvm::ExtractValueInst
This instruction extracts a struct member or array element value from an aggregate value.
Definition: Instructions.h:2444
llvm::GCNTTIImpl::getStoreVectorFactor
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:345
llvm::TypeSize
Definition: TypeSize.h:435
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:105
llvm::LinearPolySize< TypeSize >::getScalable
static TypeSize getScalable(ScalarTy MinVal)
Definition: TypeSize.h:286
llvm::GCNTTIImpl::adjustInliningThreshold
unsigned adjustInliningThreshold(const CallBase *CB) const
Definition: AMDGPUTargetTransformInfo.cpp:1177
llvm::SDValue
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
Definition: SelectionDAGNodes.h:145
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition: TargetTransformInfo.h:967
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
llvm::GCNTTIImpl::rewriteIntrinsicWithAddressSpace
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: AMDGPUTargetTransformInfo.cpp:1028
llvm::GCNTTIImpl::getMemcpyLoopResidualLoweringType
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicCpySize) const
Definition: AMDGPUTargetTransformInfo.cpp:433
llvm::InlineAsm::isOutput
@ isOutput
Definition: InlineAsm.h:96
llvm::FPOpFusion::Fast
@ Fast
Definition: TargetOptions.h:37
llvm::BasicTTIImplBase< GCNTTIImpl >::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
Definition: BasicTTIImpl.h:2295
llvm::GCNTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:514
llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
llvm::ISD::FSUB
@ FSUB
Definition: ISDOpcodes.h:391
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:151
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:691
llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition: TargetTransformInfo.h:967
llvm::ISD::FREM
@ FREM
Definition: ISDOpcodes.h:394
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:56
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1342
llvm::BasicTTIImplBase< GCNTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1435
llvm::TargetTransformInfo::UnrollingPreferences::Threshold
unsigned Threshold
The cost threshold for the unrolled loop.
Definition: TargetTransformInfo.h:425
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
llvm::ISD::SRL
@ SRL
Definition: ISDOpcodes.h:693
llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:164
llvm::MVT::getVT
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:563
llvm::GCNTTIImpl::GCNTTIImpl
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:288
llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:377
llvm::PHINode
Definition: Instructions.h:2697
llvm::PatternMatch
Definition: PatternMatch.h:47
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
RegName
#define RegName(no)
llvm::MemIntrinsicInfo
Information about a load/store intrinsic defined by the target.
Definition: TargetTransformInfo.h:72
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1175
llvm::Type::getInt16Ty
static IntegerType * getInt16Ty(LLVMContext &C)
Definition: Type.cpp:238
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:399
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::DataLayout::getPointerSizeInBits
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits. FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:412
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:47
llvm::LoopBase::isLoopExiting
bool isLoopExiting(const BlockT *BB) const
True if the terminator in the block can branch to another block that is outside the current loop.
Definition: LoopInfo.h:242
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1474
llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:911
BB
Definition: README.txt:39
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:171
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:394
llvm::GCNTTIImpl::useGPUDivergenceAnalysis
bool useGPUDivergenceAnalysis() const
Definition: AMDGPUTargetTransformInfo.cpp:857
llvm::AllocaInst
An instruction to allocate memory on the stack.
Definition: Instructions.h:59
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:413
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1497
ArgAllocaCutoff
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3132
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:98
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:57
UnrollRuntimeLocal
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
llvm::ExtractValueInst::getIndices
ArrayRef< unsigned > getIndices() const
Definition: Instructions.h:2509
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::GCNTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:682
AMDGPUTargetMachine.h
llvm::CallBase::args
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1333
llvm::GCNTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index)
Definition: AMDGPUTargetTransformInfo.cpp:791
llvm::TargetLowering::AsmOperandInfoVector
std::vector< AsmOperandInfo > AsmOperandInfoVector
Definition: TargetLowering.h:4627
llvm::GCNSubtarget::hasUsableDivScaleConditionOutput
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:423
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
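A minimal sketch of the insert().second idiom for a visited set (the helper markVisited is hypothetical):
  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/IR/Value.h"
  static bool markVisited(const llvm::Value *V,
                          llvm::SmallPtrSetImpl<const llvm::Value *> &Seen) {
    return Seen.insert(V).second;  // false if V was already in the set
  }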
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:379
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:39
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:506
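A small sketch of sizing an alloca with the DataLayout (the helper allocaBytes is hypothetical; it assumes a fixed-size allocated type, since a scalable TypeSize would not convert cleanly to an integer):
  #include <cstdint>
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Instructions.h"
  static uint64_t allocaBytes(const llvm::AllocaInst &AI, const llvm::DataLayout &DL) {
    return DL.getTypeAllocSize(AI.getAllocatedType());
  }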
llvm::ISD::FDIV
@ FDIV
Definition: ISDOpcodes.h:393
llvm::GCNTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:1213