//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
  "amdgpu-unroll-threshold-if",
  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
  cl::init(150), cl::Hidden);

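// Like any cl::opt, these thresholds can be adjusted per invocation for
// experimentation, e.g. (illustrative command lines, values are arbitrary):
//
//   opt -amdgpu-unroll-threshold-private=4000 ...
//   clang ... -mllvm -amdgpu-unroll-threshold-local=500
//
// The defaults above are tuning heuristics, not hard guarantees about how
// much unrolling actually happens.
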
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
             return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}
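
// For example, a conditional branch like the following depends (transitively)
// on a PHI defined in the loop itself, so getUnrollingPreferences below
// rewards it with an extra UnrollThresholdIf bonus:
//
//   loop:
//     %i = phi i32 [ 0, %entry ], [ %i.next, %latch ]
//     %odd = and i32 %i, 1
//     %cc = icmp ne i32 %odd, 0
//     br i1 %cc, label %then, label %else
//
// Fully unrolling such a loop can fold the branch away entirely, removing
// both the PHI and the divergent control flow.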

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  UP.Threshold = 300; // Twice the default.
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
           return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate
      // the if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing not
        // based on a variable; most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
               return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }
  }
}
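
// Illustrative kernel for the private-memory heuristic above: a loop indexing
// a static alloca with its induction variable. With the boosted threshold the
// loop is more likely to be unrolled fully, every GEP index becomes a
// constant, and SROA can then replace the array with registers:
//
//   kernel void f(global float *out) {
//     float tmp[8];                  // private (scratch) array
//     for (int i = 0; i < 8; ++i)    // GEP index depends on the loop PHI
//       tmp[i] = i * 2.0f;
//     ...
//   }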

unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return 256;
}

unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}
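
// Worked example for the two factors above, assuming LoadSize/StoreSize are
// per-element sizes in bits (as the VecRegBitWidth computation suggests): a
// chain of 16 x i16 loads gives 16 * 16 = 256 > 128 with a sub-32-bit
// element, so the factor is clamped to 128 / 16 = 8 elements, while a chain
// of 4 x i32 loads (exactly 128 bits) is left at VF = 4.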

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
      AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 128;

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  llvm_unreachable("unhandled address space");
}
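
// For example, on a subtarget whose max private element size is 4 bytes,
// private (scratch) accesses report 8 * 4 = 32 bits, so the load/store
// vectorizer will not build scratch chains wider than a single dword, while
// global and constant chains may be vectorized up to 512 bits.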

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            unsigned Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isNullValue();
    return true;
  }
  default:
    return false;
  }
}
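
// Sketch of a call this recognizes, using the operand layout read above
// (operand 0 = pointer, 2 = ordering, 4 = volatile flag); the mangled
// intrinsic name and value names are only illustrative:
//
//   %old = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(
//              i32 addrspace(3)* %lds.ptr, i32 %bound,
//              i32 0 /* ordering */, i32 0 /* scope */, i1 false /* volatile */)
//
// Constant ordering and volatile operands let TTI describe the call as a
// read/write memory intrinsic with the corresponding AtomicOrdering.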

int GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, only legal vector
  // types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!ST->hasFP32Denormals()) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}
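
// Worked example for the cost model above: a <4 x float> fmul legalizes to
// MVT::v4f32 with LT.first = 1 and NElts = 4, giving a cost of
// 4 * getFullRateInstrCost(). An i64 multiply is priced as
// (4 * QuarterRateCost + 4 * FullRateCost) * LT.first * NElts, roughly
// reflecting its expansion into 32-bit mul/mul_hi plus add-with-carry
// operations.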

unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                           bool IsPairwise) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}

int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
                                       bool IsPairwise,
                                       bool IsUnsigned) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost();
}

int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}
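
// For example, "extractelement <2 x i32> %v, i32 1" is just a read of the
// second 32-bit subregister and is costed as free, whereas an extract with a
// non-constant index (passed here as Index == ~0u) is reported with cost 2 to
// discourage dynamic indexing.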

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
    // Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
           F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
    return false;
  }
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}
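
// Typical consequences of the rules above: llvm.amdgcn.workitem.id.x is
// treated as divergent (each lane gets a different id), while
// llvm.amdgcn.workgroup.id.x is uniform across a wavefront; a kernel pointer
// argument is itself uniform (passed in SGPRs), but a load through a flat
// pointer derived from it is still reported as divergent.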

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
      return true;
    }
  }
  return false;
}

bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
  IntrinsicInst *II, Value *OldV, Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
    if (!IsVolatile->isZero())
      return false;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl =
        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return true;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    II->replaceAllUsesWith(NewVal);
    II->eraseFromParent();
    return true;
  }
  default:
    return false;
  }
}
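
// For example, when InferAddressSpaces proves that the flat pointer operand
// of a call like "call i1 @llvm.amdgcn.is.shared(i8* %flat)" really comes
// from an addrspacecast of an addrspace(3) (LDS) pointer, this hook folds the
// call to "true"; if the pointer turns out to be global instead, the call
// folds to "false".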

unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
  if (ST->hasVOP3PInsts()) {
    VectorType *VT = cast<VectorType>(Tp);
    if (VT->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low half or
      // high half of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
    TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
    TM.getSubtargetImpl(*Callee)->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
  return CallerMode.isInlineCompatible(CalleeMode);
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

unsigned GCNTTIImpl::getUserCost(const User *U,
                                 ArrayRef<const Value *> Operands) {
  // Estimate extractelement elimination
  if (const ExtractElementInst *EE = dyn_cast<ExtractElementInst>(U)) {
    ConstantInt *CI = dyn_cast<ConstantInt>(EE->getOperand(1));
    unsigned Idx = -1;
    if (CI)
      Idx = CI->getZExtValue();
    return getVectorInstrCost(EE->getOpcode(), EE->getOperand(0)->getType(),
                              Idx);
  }

  // Estimate insertelement elimination
  if (const InsertElementInst *IE = dyn_cast<InsertElementInst>(U)) {
    ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    unsigned Idx = -1;
    if (CI)
      Idx = CI->getZExtValue();
    return getVectorInstrCost(IE->getOpcode(), IE->getType(), Idx);
  }

  // Estimate different intrinsics, e.g. llvm.fabs
  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
    SmallVector<Value *, 4> Args(II->arg_operands());
    FastMathFlags FMF;
    if (auto *FPMO = dyn_cast<FPMathOperator>(II))
      FMF = FPMO->getFastMathFlags();
    return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
                                 FMF);
  }
  return BaseT::getUserCost(U, Operands);
}

unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}

unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
    return 128;
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 64;
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 32;

  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
       AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
       (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
        AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                    unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}