//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
    "enable-arm-maskedldst", cl::Hidden, cl::init(true),
    cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
    "disable-arm-loloops", cl::Hidden, cl::init(false),
    cl::desc("Disable the generation of low-overhead loops"));

static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
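/// For example (illustrative IR, assuming a sufficiently aligned pointer %p),
///   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* %p, i32 4)
/// becomes a plain aligned load once the pointer is bitcast:
///   %v = load <4 x i32>, <4 x i32>* %p.cast, align 4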
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
                           ? MemAlign
                           : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match
  // exactly.
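  // For example, a callee built for a subtarget with MVE must not be inlined
  // into a caller without it, as the caller could not encode the callee's
  // instructions.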
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}

TTI::AddressingModeKind
ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
                                       ScalarEvolution *SE) const {
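  // MVE has post-incremented load/store forms (e.g. "vldrw.u32 q0, [r0], #16"),
  // so post-indexed addressing is the most profitable choice there.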
  if (ST->hasMVEIntegerOps())
    return TTI::AMK_PostIndexed;

  if (L->getHeader()->getParent()->hasOptSize())
    return TTI::AMK_None;

  if (ST->isMClass() && ST->isThumb2() &&
      L->getNumBlocks() == 1)
    return TTI::AMK_PreIndexed;

  return TTI::AMK_None;
}

Optional<Instruction *>
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  using namespace PatternMatch;
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::arm_neon_vld1: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    unsigned AlignArg = II.arg_size() - 1;
    Value *AlignArgOp = II.getArgOperand(AlignArg);
    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
    if (Align && *Align < MemAlign) {
      return IC.replaceOperand(
          II, AlignArg,
          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
                           false));
    }
    break;
  }

  case Intrinsic::arm_mve_pred_i2v: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                       PatternMatch::m_Value(ArgArg))) &&
        II.getType() == ArgArg->getType()) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    Constant *XorMask;
    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                             PatternMatch::m_Value(ArgArg)),
                         PatternMatch::m_Constant(XorMask))) &&
        II.getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnes()) {
          auto TrueVector = IC.Builder.CreateVectorSplat(
              cast<FixedVectorType>(II.getType())->getNumElements(),
              IC.Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
    KnownBits ScalarKnown(32);
    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
                                ScalarKnown, 0)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_pred_v2i: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
                       PatternMatch::m_Value(ArgArg)))) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    if (!II.getMetadata(LLVMContext::MD_range)) {
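      // An MVE predicate packs into the low 16 bits of the i32 result, so the
      // value is known to lie in [0, 0x10000).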
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      Metadata *M[] = {
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
      II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    unsigned CarryOp =
        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

    KnownBits CarryKnown(32);
    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
                                CarryKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vmldava: {
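    // If the accumulator operand (operand 3) is zero and the result's only
    // user is an add, fold the add into the accumulator:
    // "add (vmldava x, y, 0), z" becomes "vmldava x, y, z".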
    Instruction *I = cast<Instruction>(&II);
    if (I->hasOneUse()) {
      auto *User = cast<Instruction>(*I->user_begin());
      Value *OpZ;
      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
          match(I->getOperand(3), m_Zero())) {
        Value *OpX = I->getOperand(4);
        Value *OpY = I->getOperand(5);
        Type *OpTy = OpX->getType();

        IC.Builder.SetInsertPoint(User);
        Value *V =
            IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
                                       {I->getOperand(0), I->getOperand(1),
                                        I->getOperand(2), OpZ, OpX, OpY});

        IC.replaceInstUsesWith(*User, V);
        return IC.eraseInstFromFunction(*User);
      }
    }
    return None;
  }
  }
  return None;
}

Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {

  // Compute the demanded bits for a narrowing MVE intrinsic. TopOpc is the
  // index of the operand that selects the Top/Bottom form, which can differ
  // between intrinsics.
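  // For example, for a v8i16 vqmovnt (IsTop == 1) only the even lanes of the
  // pass-through operand are demanded, since the instruction overwrites the
  // odd lanes.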
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();

    // Only the odd or even lanes of operand 0 are demanded, depending on
    // whether this is a top or a bottom instruction.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));
    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
    // The other lanes will be defined from the inserted elements.
    UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
                                                 : APInt::getHighBitsSet(2, 1));
    return None;
  };

  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::arm_mve_vcvt_narrow:
    SimplifyNarrowInstrTopBottom(2);
    break;
  case Intrinsic::arm_mve_vqmovn:
    SimplifyNarrowInstrTopBottom(4);
    break;
  case Intrinsic::arm_mve_vshrn:
    SimplifyNarrowInstrTopBottom(7);
    break;
  }

  return None;
}

InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1: any i8 immediate costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constantpool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero for them and 1 otherwise.
InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match to an SSAT instruction. Returns the instruction being
// saturated, or null if no saturation pattern was found.
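// For example, with Imm == -128 the pair
//   %lo = smax(%x, -128); %hi = smin(%lo, 127)
// clamps %x to the signed 8-bit range and can be selected as SSAT #8.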
static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  Value *LHS, *RHS;
  ConstantInt *C;
  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;

  if (InstSPF == SPF_SMAX &&
      PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
      C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {

    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    if (isSSatMin(Inst->getOperand(1)))
      return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
    if (Inst->hasNUses(2) &&
        (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
      return Inst->getOperand(1);
  }
  return nullptr;
}

// Look for an FP saturation pattern, where the instruction can be simplified
// to a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  if (Imm.getBitWidth() != 64 ||
      Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
    return false;
  Value *FP = isSSATMinMaxPattern(Inst, Imm);
  if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
    FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
  if (!FP)
    return false;
  return isa<FPToSIInst>(FP);
}

InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  // Leave any gep offsets for CodeGenPrepare, which will do a better job at
  // splitting any large offsets.
  if (Opcode == Instruction::GetElementPtr && Idx != 0)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1<<12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1<<8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN.
  if (Opcode == Instruction::Xor && Imm.isAllOnes())
    return 0;

  // Ensure negative constants of min(max()) or max(min()) patterns that match
  // to SSAT instructions don't get hoisted.
  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
      Ty->getIntegerBitWidth() <= 32) {
    if (isSSATMinMaxPattern(Inst, Imm) ||
        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
      return 0;
  }

  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
    return 0;

  // We can convert <= -1 to < 0, which is generally quite cheap.
  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) {
    ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
      return std::min(getIntImmCost(Imm, Ty, CostKind),
                      getIntImmCost(Imm + 1, Ty, CostKind));
  }

  return getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind == TTI::TCK_RecipThroughput &&
      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
    return 0;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  auto IsLegalFPType = [this](EVT VT) {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
           (EltVT == MVT::f64 && ST->hasFP64()) ||
           (EltVT == MVT::f16 && ST->hasFullFP16());
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // Extending masked loads and truncating masked stores are expensive because
  // we currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
  if ((ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
        Opcode == Instruction::SExt)) ||
      (ST->hasMVEFloatOps() &&
       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
      return 2 * DstTy.getVectorNumElements() *
             ST->getMVEVectorCostFactor(CostKind);

  // The extend of other kinds of load is free.
  if (CCH == TTI::CastContextHint::Normal ||
      CCH == TTI::CastContextHint::Masked) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        // The following extend from a legal type to an illegal type, so need
        // to split the load. This introduces an extra load operation, but the
        // extend is still "free".
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1},
        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }


    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
        // FPExtends are similar but also require the VCVT instructions.
        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }


    // The truncate of a store is free. This is the mirror of extends above.
    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }


    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }
  }

  // NEON vector operations that can extend their inputs.
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
      // vaddl
      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::ADD, MVT::v8i16, MVT::v8i8,  0 },
      // vsubl
      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SUB, MVT::v8i16, MVT::v8i8,  0 },
      // vmull
      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::MUL, MVT::v8i16, MVT::v8i8,  0 },
      // vshll
      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SHL, MVT::v8i16, MVT::v8i8,  0 },
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

623 
624  // Single to/from double precision conversions.
625  if (Src->isVectorTy() && ST->hasNEON() &&
626  ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
627  DstTy.getScalarType() == MVT::f32) ||
628  (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
629  DstTy.getScalarType() == MVT::f64))) {
630  static const CostTblEntry NEONFltDblTbl[] = {
631  // Vector fptrunc/fpext conversions.
634  {ISD::FP_EXTEND, MVT::v4f32, 4}};
635 
636  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
637  if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
638  return AdjustCost(LT.first * Entry->Cost);
639  }
640 
  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {

    // The number of vmovl instructions for the extension.

    // Operations that we legalize using splitting.

    // Vector float <-> i32 conversions.

    // Vector double <-> i32 conversions.

  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised so take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  }

  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As a general rule, fp converts that were not matched above are
    // scalarized and cost 1 vcvt for each lane, so long as the instruction is
    // available. If not it will become a series of function calls.
    const InstructionCost CallCost =
        getCallInstrCost(nullptr, Dst, {Src}, CostKind);
    int Lanes = 1;
    if (SrcTy.isFixedLengthVector())
      Lanes = SrcTy.getVectorNumElements();

    if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
      return Lanes;
    else
      return Lanes * CallCost;
  }

  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
      SrcTy.isFixedLengthVector()) {
    // Treat a truncate with a larger than legal source (128 bits for MVE) as
    // expensive, 2 instructions per lane.
    if ((SrcTy.getScalarType() == MVT::i8 ||
         SrcTy.getScalarType() == MVT::i16 ||
         SrcTy.getScalarType() == MVT::i32) &&
        SrcTy.getSizeInBits() > 128 &&
        SrcTy.getSizeInBits() > DstTy.getSizeInBits())
      return SrcTy.getVectorNumElements() * 2;
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i8,  MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i1,  MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor(CostKind)
                     : 1;
  return AdjustCost(
      BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}

InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               unsigned Index) {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross class copy, this likely leads to mixing
    // of NEON and VFP code and should therefore be penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max<InstructionCost>(
          BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
  }

  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
                                 Opcode == Instruction::ExtractElement)) {
    // Integer cross-lane moves are more expensive than float, which can
    // sometimes just be vmovs. Integers involve being passed through GPR
    // registers, causing more of a delay.
    std::pair<InstructionCost, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
    return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}

InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Thumb scalar code size cost for select.
  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
      ST->isThumb() && !ValTy->isVectorTy()) {
    // Assume expensive structs.
    if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
      return TTI::TCC_Expensive;

    // Select costs can vary because they:
    // - may require one or more conditional mov (including an IT),
    // - can't operate directly on immediates,
    // - require live flags, which we can't copy around easily.
    InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;

    // Possible IT instruction for Thumb2, or more for Thumb1.
    ++Cost;

    // i1 values may need rematerialising by using mov immediates and/or
    // flag setting instructions.
    if (ValTy->isIntegerTy(1))
      ++Cost;

    return Cost;
  }

  // If this is a vector min/max/abs, use the cost of that intrinsic directly
  // instead. Hopefully when min/max intrinsics are more prevalent this code
  // will not be needed.
  const Instruction *Sel = I;
  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
      Sel->hasOneUse())
    Sel = cast<Instruction>(Sel->user_back());
  if (Sel && ValTy->isVectorTy() &&
      (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
    const Value *LHS, *RHS;
    SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
    unsigned IID = 0;
    switch (SPF) {
    case SPF_ABS:
      IID = Intrinsic::abs;
      break;
    case SPF_SMIN:
      IID = Intrinsic::smin;
      break;
    case SPF_SMAX:
      IID = Intrinsic::smax;
      break;
    case SPF_UMIN:
      IID = Intrinsic::umin;
      break;
    case SPF_UMAX:
      IID = Intrinsic::umax;
      break;
    case SPF_FMINNUM:
      IID = Intrinsic::minnum;
      break;
    case SPF_FMAXNUM:
      IID = Intrinsic::maxnum;
      break;
    default:
      break;
    }
    if (IID) {
      // The ICmp is free, the select gets the cost of the min/max/etc.
      if (Sel != I)
        return 0;
      IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
      return getIntrinsicInstrCost(CostAttrs, CostKind);
    }
  }

  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);
    return LT.first;
  }

  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
      (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
    FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
    FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
    if (!VecCondTy)
      VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));

    // If we don't have mve.fp, any fp operations will need to be scalarized.
    if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalarization insert, one scalarization extract and the cost of
      // the fcmps.
      return BaseT::getScalarizationOverhead(VecValTy, false, true) +
             BaseT::getScalarizationOverhead(VecCondTy, true, false) +
             VecValTy->getNumElements() *
                 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
                                    VecCondTy->getScalarType(), VecPred,
                                    CostKind, I);
    }

    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);
    int BaseCost = ST->getMVEVectorCostFactor(CostKind);
    // There are two types - the input that specifies the type of the compare
    // and the output vXi1 type. Because we don't know how the output will be
    // split, we may need an expensive shuffle to get two in sync. This has
    // the effect of making larger than legal compares (v8i32 for example)
    // expensive.
    if (LT.second.getVectorNumElements() > 2) {
      if (LT.first > 1)
        return LT.first * BaseCost +
               BaseT::getScalarizationOverhead(VecCondTy, true, false);
      return BaseCost;
    }
  }

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  return BaseCost *
         BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
                                                      ScalarEvolution *SE,
                                                      const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (ST->hasNEON()) {
    if (Ty->isVectorTy() && SE &&
        !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
      return NumVectorInstToHideOverhead;

    // In many cases the address computation is not merged into the instruction
    // addressing mode.
    return 1;
  }
  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}

bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    // If a VCTP is part of a chain, it's already profitable and shouldn't be
    // optimized, else LSR may block tail-predication.
    switch (II->getIntrinsicID()) {
    case Intrinsic::arm_mve_vctp8:
    case Intrinsic::arm_mve_vctp16:
    case Intrinsic::arm_mve_vctp32:
    case Intrinsic::arm_mve_vctp64:
      return true;
    default:
      break;
    }
  }
  return false;
}

bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
    return false;

  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
    // Don't support v2i1 yet.
    if (VecTy->getNumElements() == 2)
      return false;

    // We don't support extending fp types.
    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
    if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
      return false;
  }

  unsigned EltWidth = DataTy->getScalarSizeInBits();
  return (EltWidth == 32 && Alignment >= 4) ||
         (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
}

bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
    return false;

  unsigned EltWidth = Ty->getScalarSizeInBits();
  return ((EltWidth == 32 && Alignment >= 4) ||
          (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
}

/// Given a memcpy/memset/memmove instruction, return the number of memory
/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if
/// a call is used.
int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
  MemOp MOp;
  unsigned DstAddrSpace = ~0u;
  unsigned SrcAddrSpace = ~0u;
  const Function *F = I->getParent()->getParent();

  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = *MC->getDestAlign();
    const Align SrcAlign = *MC->getSourceAlign();

    MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
                      /*IsVolatile*/ false);
    DstAddrSpace = MC->getDestAddressSpace();
    SrcAddrSpace = MC->getSourceAddressSpace();
  } else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = *MS->getDestAlign();

    MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
                     /*IsZeroMemset*/ false, /*IsVolatile*/ false);
    DstAddrSpace = MS->getDestAddressSpace();
  } else
    llvm_unreachable("Expected a memcpy/move or memset!");

  unsigned Limit, Factor = 2;
  switch (I->getIntrinsicID()) {
  case Intrinsic::memcpy:
    Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
    break;
  case Intrinsic::memmove:
    Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
    break;
  case Intrinsic::memset:
    Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
    Factor = 1;
    break;
  default:
    llvm_unreachable("Expected a memcpy/move or memset!");
  }

  // MemOps will be populated with a list of data types that need to be loaded
  // and stored. That's why we multiply the number of elements by 2 to get the
  // cost for this memcpy.
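  // For example (illustrative): a 16-byte, word-aligned memcpy might lower to
  // four i32 loads plus four i32 stores, i.e. MemOps.size() == 4 and a
  // returned cost of 8.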
  std::vector<EVT> MemOps;
  if (getTLI()->findOptimalMemOpLowering(
          MemOps, Limit, MOp, DstAddrSpace,
          SrcAddrSpace, F->getAttributes()))
    return MemOps.size() * Factor;

  // If we can't find an optimal memop lowering, return the default cost.
  return -1;
}

InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));

  // To model the cost of a library call, we assume 1 for the call, and
  // 3 for the argument setup.
  if (NumOps == -1)
    return 4;
  return NumOps;
}

InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *Tp, ArrayRef<int> Mask,
                                           int Index, VectorType *SubTp) {
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (ST->hasNEON()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry NEONDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f16, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry =
              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Reverse) {
      static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffle cost one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry =
              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Select) {
      static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.

          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},

          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},

          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                              ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
  }
  if (ST->hasMVEIntegerOps()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry MVEDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                              LT.second))
        return LT.first * Entry->Cost *
               ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
    }

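    // An MVE VREV reverses adjacent lanes within wider elements; e.g. the
    // v8i16 mask <1, 0, 3, 2, 5, 4, 7, 6> is a single VREV32.16.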
    if (!Mask.empty()) {
      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (Mask.size() <= LT.second.getVectorNumElements() &&
          (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
           isVREVMask(Mask, LT.second, 64)))
        return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
    }
  }

  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
                     ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
                     : 1;
  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}

InstructionCost ARMTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
    // Make operations on i1 relatively expensive as this often involves
    // combining predicates. AND and XOR should be easier to handle with IT
    // blocks.
    switch (ISDOpcode) {
    default:
      break;
    case ISD::AND:
    case ISD::XOR:
      return 2;
    case ISD::OR:
      return 3;
    }
  }

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->hasNEON()) {
    const unsigned FunctionCallDivCost = 20;
    const unsigned ReciprocalDivCost = 10;
    static const CostTblEntry CostTbl[] = {
      // Division.
      // These costs are somewhat random. Choose a cost of 20 to indicate that
      // vectorizing division (added function call) is going to be very
      // expensive.
      // Double registers types.
      { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
      { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
      { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
      { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
      { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
      // Quad register types.
      { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
      // Multiplication.
    };

    if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
      return LT.first * Entry->Cost;

    InstructionCost Cost = BaseT::getArithmeticInstrCost(
        Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);

    // This is somewhat of a hack. The problem that we are facing is that SROA
    // creates a sequence of shift, and, or instructions to construct values.
    // These sequences are recognized by the ISel and have zero-cost. Not so
    // for the vectorized code. Because we have support for v2i64 but not i64
    // those sequences look particularly beneficial to vectorize.
    // To work around this we increase the cost of v2i64 operations to make
    // them seem less beneficial.
    if (LT.second == MVT::v2i64 &&
        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
      Cost += 4;

    return Cost;
  }

  // If this operation is a shift on arm/thumb2, it might well be folded into
  // the following instruction, hence having a cost of 0.
  auto LooksLikeAFreeShift = [&]() {
    if (ST->isThumb1Only() || Ty->isVectorTy())
      return false;

    if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
      return false;
    if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
      return false;

    // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB.
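    // e.g. the "lsl #2" in "add r0, r1, r2, lsl #2" is performed by the ADD's
    // shifter operand rather than by a separate instruction.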
    switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::And:
    case Instruction::Xor:
    case Instruction::Or:
    case Instruction::ICmp:
      return true;
    default:
      return false;
    }
  };
  if (LooksLikeAFreeShift())
    return 0;

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  // The rest of this mostly follows BaseT::getArithmeticInstrCost, without
  // treating floats as more expensive than scalars or increasing the costs
  // for custom operations. The result is also multiplied by the
  // MVEVectorCostFactor where appropriate.
  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
    return LT.first * BaseCost;
1433 
1434  // Else this is expand, assume that we need to scalarize this op.
1435  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1436  unsigned Num = VTy->getNumElements();
1437  InstructionCost Cost =
1439  // Return the cost of multiple scalar invocation plus the cost of
1440  // inserting and extracting the values.
1441  SmallVector<Type *> Tys(Args.size(), Ty);
1442  return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1443  }
1444 
1445  return BaseCost;
1446 }
1447 
InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  // Type legalization can't handle structs.
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);

  if (ST->hasNEON() && Src->isVectorTy() &&
      (Alignment && *Alignment != Align(16)) &&
      cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
    // Unaligned loads/stores are extremely inefficient.
    // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
    return LT.first * 4;
  }

  // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
  // Same for stores.
  if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
      ((Opcode == Instruction::Load && I->hasOneUse() &&
        isa<FPExtInst>(*I->user_begin())) ||
       (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
    FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
    Type *DstTy =
        Opcode == Instruction::Load
            ? (*I->user_begin())->getType()
            : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
    if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
        DstTy->getScalarType()->isFloatTy())
      return ST->getMVEVectorCostFactor(CostKind);
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor(CostKind)
                     : 1;
  return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                           CostKind, I);
}

InstructionCost
ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                  unsigned AddressSpace,
                                  TTI::TargetCostKind CostKind) {
  if (ST->hasMVEIntegerOps()) {
    if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
      return ST->getMVEVectorCostFactor(CostKind);
    if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
      return ST->getMVEVectorCostFactor(CostKind);
  }
  if (!isa<FixedVectorType>(Src))
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);
  // Scalar cost, which is currently very high due to the inefficiency of the
  // generated code.
  return cast<FixedVectorType>(Src)->getNumElements() * 8;
}

InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  assert(isa<VectorType>(VecTy) && "Expect a vector type");

  // vldN/vstN doesn't support vector types of i64/f64 element.
  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;

  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
      !UseMaskForCond && !UseMaskForGaps) {
    unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
    auto *SubVecTy =
        FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one vldN/vstN instruction.
    int BaseCost =
        ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
      return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);

    // Some smaller than legal interleaved patterns are cheap as we can make
    // use of the vmovn or vrev patterns to interleave a standard load. This is
    // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
    // promoted differently). The cost of 2 here is then a load and vrev or
    // vmovn.
    if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
        VecTy->isIntOrIntVectorTy() &&
        DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
      return 2 * BaseCost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}

InstructionCost ARMTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  using namespace PatternMatch;
  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
  auto *VTy = cast<FixedVectorType>(DataTy);

  // TODO: Splitting, once we do that.

  unsigned NumElems = VTy->getNumElements();
  unsigned EltSize = VTy->getScalarSizeInBits();
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);

  // For now, it is assumed that for the MVE gather instructions the loads are
  // all effectively serialised. This means the cost is the scalar cost
  // multiplied by the number of elements being loaded. This is possibly very
  // conservative, but even so we still end up vectorising loops because the
  // cost per iteration for many loops is lower than for scalar loops.
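  // For example, a legal v4i32 gather is costed at roughly 4x the MVE vector
  // cost factor rather than as a single load.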
  InstructionCost VectorCost =
      NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
  // The scalarization cost should be a lot higher. We use the number of vector
  // elements plus the scalarization overhead.
  InstructionCost ScalarCost =
      NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
      BaseT::getScalarizationOverhead(VTy, false, true);

  if (EltSize < 8 || Alignment < EltSize / 8)
    return ScalarCost;

  unsigned ExtSize = EltSize;
  // Check whether there's a single user that asks for an extended type.
  if (I != nullptr) {
    // Depending on the caller of this function, a gather instruction will
    // either have opcode Instruction::Load or be a call to the masked_gather
    // intrinsic.
    if ((I->getOpcode() == Instruction::Load ||
         match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
        I->hasOneUse()) {
      const User *Us = *I->users().begin();
      if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
        // Only allow valid type combinations.
        unsigned TypeSize =
            cast<Instruction>(Us)->getType()->getScalarSizeInBits();
        if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
             (TypeSize == 16 && EltSize == 8)) &&
            TypeSize * NumElems == 128) {
          ExtSize = TypeSize;
        }
      }
    }
    // Check whether the input data needs to be truncated.
    TruncInst *T;
    if ((I->getOpcode() == Instruction::Store ||
         match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
        (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
      // Only allow valid type combinations.
      unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
      if (((EltSize == 16 && TypeSize == 32) ||
           (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
          TypeSize * NumElems == 128)
        ExtSize = TypeSize;
    }
  }

  if (ExtSize * NumElems != 128 || NumElems < 4)
    return ScalarCost;

  // Any (aligned) i32 gather will not need to be scalarised.
  if (ExtSize == 32)
    return VectorCost;
  // For smaller types, we need to ensure that the gep's inputs are correctly
  // extended from a small enough value. Other sizes (including i64) are
  // scalarized for now.
  if (ExtSize != 8 && ExtSize != 16)
    return ScalarCost;

  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
    Ptr = BC->getOperand(0);
  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
    if (GEP->getNumOperands() != 2)
      return ScalarCost;
    unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
    // Scale needs to be correct (which is only relevant for i16s).
    if (Scale != 1 && Scale * 8 != ExtSize)
      return ScalarCost;
    // And we need to zext (not sext) the indexes from a small enough type.
    if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
      if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
        return VectorCost;
    }
    return ScalarCost;
  }
  return ScalarCost;
}

InstructionCost
ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                       Optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);

  EVT ValVT = TLI->getValueType(DL, ValTy);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

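  // An integer add reduction of a legal MVE vector maps to a single VADDV
  // instruction, hence the flat cost below.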
  static const CostTblEntry CostTblAdd[]{
      {ISD::ADD, MVT::v16i8, 1},
      {ISD::ADD, MVT::v8i16, 1},
      {ISD::ADD, MVT::v4i32, 1},
  };
  if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
    return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;

  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

InstructionCost
ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
                                        Type *ResTy, VectorType *ValTy,
                                        TTI::TargetCostKind CostKind) {
  EVT ValVT = TLI->getValueType(DL, ValTy);
  EVT ResVT = TLI->getValueType(DL, ResTy);

  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);

    // The legal cases are:
    //   VADDV  u/s 8/16/32
    //   VMLAV  u/s 8/16/32
    //   VADDLV u/s 32
    //   VMLALV u/s 16/32
    // Codegen currently cannot always handle larger than legal vectors very
    // well, especially for predicated reductions where the mask needs to be
    // split, so restrict to 128bit or smaller input types.
    unsigned RevVTSize = ResVT.getSizeInBits();
    if (ValVT.getSizeInBits() <= 128 &&
        ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
         (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
         (LT.second == MVT::v4i32 && RevVTSize <= 64)))
      return ST->getMVEVectorCostFactor(CostKind) * LT.first;
  }

  return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
                                            CostKind);
}

1711  switch (ICA.getID()) {
1712  case Intrinsic::get_active_lane_mask:
1713  // Currently we make a somewhat optimistic assumption that
1714  // active_lane_mask's are always free. In reality it may be freely folded
1715  // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1716  // of add/icmp code. We may need to improve this in the future, but being
1717  // able to detect if it is free or not involves looking at a lot of other
1718  // code. We currently assume that the vectorizer inserted these, and knew
1719  // what it was doing in adding one.
1720  if (ST->hasMVEIntegerOps())
1721  return 0;
1722  break;
1723  case Intrinsic::sadd_sat:
1724  case Intrinsic::ssub_sat:
1725  case Intrinsic::uadd_sat:
1726  case Intrinsic::usub_sat: {
1727  if (!ST->hasMVEIntegerOps())
1728  break;
1729  Type *VT = ICA.getReturnType();
1730 
1731  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1732  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1733  LT.second == MVT::v16i8) {
1734  // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1735  // need to extend the type, as it uses shr(qadd(shl, shl)).
1736  unsigned Instrs =
1737  LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1738  return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1739  }
1740  break;
1741  }
1742  case Intrinsic::abs:
1743  case Intrinsic::smin:
1744  case Intrinsic::smax:
1745  case Intrinsic::umin:
1746  case Intrinsic::umax: {
1747  if (!ST->hasMVEIntegerOps())
1748  break;
1749  Type *VT = ICA.getReturnType();
1750 
1751  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1752  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1753  LT.second == MVT::v16i8)
1754  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1755  break;
1756  }
1757  case Intrinsic::minnum:
1758  case Intrinsic::maxnum: {
1759  if (!ST->hasMVEFloatOps())
1760  break;
1761  Type *VT = ICA.getReturnType();
1762  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1763  if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1764  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1765  break;
1766  }
1767  }
1768 
1769  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1770 }
1771 
1772 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1773  if (!F->isIntrinsic())
1774  return BaseT::isLoweredToCall(F);
1775 
1776  // Assume all Arm-specific intrinsics map to an instruction.
1777  if (F->getName().startswith("llvm.arm"))
1778  return false;
1779 
1780  switch (F->getIntrinsicID()) {
1781  default: break;
1782  case Intrinsic::powi:
1783  case Intrinsic::sin:
1784  case Intrinsic::cos:
1785  case Intrinsic::pow:
1786  case Intrinsic::log:
1787  case Intrinsic::log10:
1788  case Intrinsic::log2:
1789  case Intrinsic::exp:
1790  case Intrinsic::exp2:
1791  return true;
1792  case Intrinsic::sqrt:
1793  case Intrinsic::fabs:
1794  case Intrinsic::copysign:
1795  case Intrinsic::floor:
1796  case Intrinsic::ceil:
1797  case Intrinsic::trunc:
1798  case Intrinsic::rint:
1799  case Intrinsic::nearbyint:
1800  case Intrinsic::round:
1801  case Intrinsic::canonicalize:
1802  case Intrinsic::lround:
1803  case Intrinsic::llround:
1804  case Intrinsic::lrint:
1805  case Intrinsic::llrint:
1806  if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1807  return true;
1808  if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1809  return true;
1810  // Some operations can be handled by vector instructions; assume that
1811  // unsupported vectors will be expanded into supported scalar ones.
1812  // TODO: Handle scalar operations properly.
1813  return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1814  case Intrinsic::masked_store:
1815  case Intrinsic::masked_load:
1816  case Intrinsic::masked_gather:
1817  case Intrinsic::masked_scatter:
1818  return !ST->hasMVEIntegerOps();
1819  case Intrinsic::sadd_with_overflow:
1820  case Intrinsic::uadd_with_overflow:
1821  case Intrinsic::ssub_with_overflow:
1822  case Intrinsic::usub_with_overflow:
1823  case Intrinsic::sadd_sat:
1824  case Intrinsic::uadd_sat:
1825  case Intrinsic::ssub_sat:
1826  case Intrinsic::usub_sat:
1827  return false;
1828  }
1829 
1830  return BaseT::isLoweredToCall(F);
1831 }
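// Editorial example of the float-precision checks above: on a
// single-precision-only FPU (!ST->hasFP64()), llvm.sqrt.f64 is reported as
// lowered to a call, while llvm.sqrt.f32 can be selected to a VSQRT.F32
// instruction and instead falls through to the
// hasFPARMv8Base()/hasVFP2Base() test.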
1832 
1833 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1834  unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1835  EVT VT = TLI->getValueType(DL, I.getType(), true);
1836  if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1837  return true;
1838 
1839  // Check if an intrinsic will be lowered to a call and assume that any
1840  // other CallInst will generate a bl.
1841  if (auto *Call = dyn_cast<CallInst>(&I)) {
1842  if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1843  switch(II->getIntrinsicID()) {
1844  case Intrinsic::memcpy:
1845  case Intrinsic::memset:
1846  case Intrinsic::memmove:
1847  return getNumMemOps(II) == -1;
1848  default:
1849  if (const Function *F = Call->getCalledFunction())
1850  return isLoweredToCall(F);
1851  }
1852  }
1853  return true;
1854  }
1855 
1856  // FPv5 provides conversions between integer, double-precision,
1857  // single-precision, and half-precision formats.
1858  switch (I.getOpcode()) {
1859  default:
1860  break;
1861  case Instruction::FPToSI:
1862  case Instruction::FPToUI:
1863  case Instruction::SIToFP:
1864  case Instruction::UIToFP:
1865  case Instruction::FPTrunc:
1866  case Instruction::FPExt:
1867  return !ST->hasFPARMv8Base();
1868  }
1869 
1870  // FIXME: Unfortunately the approach of checking the Operation Action does
1871  // not catch all cases of Legalization that use library calls. Our
1872  // Legalization step categorizes some transformations into library calls as
1873  // Custom, Expand or even Legal when doing type legalization. So for now
1874  // we have to special case for instance the SDIV of 64bit integers and the
1875  // use of floating point emulation.
1876  if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1877  switch (ISD) {
1878  default:
1879  break;
1880  case ISD::SDIV:
1881  case ISD::UDIV:
1882  case ISD::SREM:
1883  case ISD::UREM:
1884  case ISD::SDIVREM:
1885  case ISD::UDIVREM:
1886  return true;
1887  }
1888  }
1889 
1890  // Assume all other non-float operations are supported.
1891  if (!VT.isFloatingPoint())
1892  return false;
1893 
1894  // We'll need a library call to handle most floats when using soft-float.
1895  if (TLI->useSoftFloat()) {
1896  switch (I.getOpcode()) {
1897  default:
1898  return true;
1899  case Instruction::Alloca:
1900  case Instruction::Load:
1901  case Instruction::Store:
1902  case Instruction::Select:
1903  case Instruction::PHI:
1904  return false;
1905  }
1906  }
1907 
1908  // We'll need a libcall to perform double precision operations on a single
1909  // precision only FPU.
1910  if (I.getType()->isDoubleTy() && !ST->hasFP64())
1911  return true;
1912 
1913  // Likewise for half precision arithmetic.
1914  if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1915  return true;
1916 
1917  return false;
1918 }
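// Editorial example for the 64-bit integer special case above: a plain
// 'sdiv i64' has no single hardware instruction here and is typically
// expanded to the __aeabi_ldivmod runtime helper on AEABI targets, so
// maybeLoweredToCall() conservatively answers true even though the
// Operation Action is not marked LibCall.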
1919 
1920 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1921  AssumptionCache &AC,
1922  TargetLibraryInfo *LibInfo,
1923  HardwareLoopInfo &HWLoopInfo) {
1924  // Low-overhead branches are only supported in the 'low-overhead branch'
1925  // extension of v8.1-m.
1926  if (!ST->hasLOB() || DisableLowOverheadLoops) {
1927  LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1928  return false;
1929  }
1930 
1931  if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1932  LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1933  return false;
1934  }
1935 
1936  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1937  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1938  LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1939  return false;
1940  }
1941 
1942  const SCEV *TripCountSCEV =
1943  SE.getAddExpr(BackedgeTakenCount,
1944  SE.getOne(BackedgeTakenCount->getType()));
1945 
1946  // We need to store the trip count in LR, a 32-bit register.
1947  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1948  LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1949  return false;
1950  }
1951 
1952  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1953  // point in generating a hardware loop if that's going to happen.
1954 
1955  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1956  if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1957  switch (Call->getIntrinsicID()) {
1958  default:
1959  break;
1960  case Intrinsic::start_loop_iterations:
1961  case Intrinsic::test_start_loop_iterations:
1962  case Intrinsic::loop_decrement:
1963  case Intrinsic::loop_decrement_reg:
1964  return true;
1965  }
1966  }
1967  return false;
1968  };
1969 
1970  // Scan the instructions to see if there are any that we know will turn into a
1971  // call or if this loop is already a low-overhead loop or will become a tail
1972  // predicated loop.
1973  bool IsTailPredLoop = false;
1974  auto ScanLoop = [&](Loop *L) {
1975  for (auto *BB : L->getBlocks()) {
1976  for (auto &I : *BB) {
1977  if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1978  isa<InlineAsm>(I)) {
1979  LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
1980  return false;
1981  }
1982  if (auto *II = dyn_cast<IntrinsicInst>(&I))
1983  IsTailPredLoop |=
1984  II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1985  II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1986  II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1987  II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1988  II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
1989  }
1990  }
1991  return true;
1992  };
1993 
1994  // Visit inner loops.
1995  for (auto Inner : *L)
1996  if (!ScanLoop(Inner))
1997  return false;
1998 
1999  if (!ScanLoop(L))
2000  return false;
2001 
2002  // TODO: Check whether the trip count calculation is expensive. If L is the
2003  // inner loop but we know it has a low trip count, calculating that trip
2004  // count (in the parent loop) may be detrimental.
2005 
2006  LLVMContext &C = L->getHeader()->getContext();
2007  HWLoopInfo.CounterInReg = true;
2008  HWLoopInfo.IsNestingLegal = false;
2009  HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2010  HWLoopInfo.CountType = Type::getInt32Ty(C);
2011  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2012  return true;
2013 }
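// Editorial sketch of how these fields are consumed (an assumption about the
// generic HardwareLoops pass, not code in this file): with CounterInReg set
// and CountType == i32, the pass materializes the count through
// llvm.start.loop.iterations (or llvm.test.start.loop.iterations when
// PerformEntryTest requests a WLS-style entry guard) and steps it by the
// LoopDecrement constant of 1 via llvm.loop.decrement.reg each iteration.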
2014 
2015 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2016  // We don't allow icmp's, and because we only look at single block loops,
2017  // we simply count the icmps, i.e. there should only be 1 for the backedge.
2018  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2019  return false;
2020  // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2021  // not currently canonical, but soon will be. Code without them uses icmp, and
2022  // so is not tail predicated as per the condition above. In order to get the
2023  // same performance we treat min and max the same as an icmp for tailpred
2024  // purposes for the moment (we often rely on non-tailpred and higher VF's to
2025  // pick more optimal instructions like VQDMULH. They need to be recognized
2026  // directly by the vectorizer).
2027  if (auto *II = dyn_cast<IntrinsicInst>(&I))
2028  if ((II->getIntrinsicID() == Intrinsic::smin ||
2029  II->getIntrinsicID() == Intrinsic::smax ||
2030  II->getIntrinsicID() == Intrinsic::umin ||
2031  II->getIntrinsicID() == Intrinsic::umax) &&
2032  ++ICmpCount > 1)
2033  return false;
2034 
2035  if (isa<FCmpInst>(&I))
2036  return false;
2037 
2038  // We could allow extending/narrowing FP loads/stores, but codegen is
2039  // too inefficient so reject this for now.
2040  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2041  return false;
2042 
2043  // Extends have to be extending-loads
2044  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I))
2045  if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2046  return false;
2047 
2048  // Truncs have to be narrowing-stores
2049  if (isa<TruncInst>(&I))
2050  if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2051  return false;
2052 
2053  return true;
2054 }
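// Hypothetical IR illustrating the extend/trunc rules above (editorial
// addition): extends must fold into extending loads and truncs into
// narrowing stores for the element-count bookkeeping to stay uniform.
//   %l = load <8 x i16>, <8 x i16>* %p     ; single use feeding the sext
//   %e = sext <8 x i16> %l to <8 x i32>    ; OK: an extending load
//   %t = trunc <8 x i32> %v to <8 x i16>
//   store <8 x i16> %t, <8 x i16>* %q      ; OK: a narrowing store
// The same sext fed by a loop PHI rather than a load would be rejected.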
2055 
2056 // To set up a tail-predicated loop, we need to know the total number of
2057 // elements processed by that loop. Thus, we need to determine the element
2058 // size and:
2059 // 1) it should be uniform for all operations in the vector loop, so we
2060 // e.g. don't want any widening/narrowing operations.
2061 // 2) it should be narrower than 64 bits, because we don't have vector
2062 // operations that work on i64 elements.
2063 // 3) we don't want elements to be reversed or shuffled, to make sure the
2064 // tail-predication masks/predicates the right lanes.
2065 //
2066 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2067  const DataLayout &DL,
2068  const LoopAccessInfo *LAI) {
2069  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2070 
2071  // If there are live-out values, it is probably a reduction. We can predicate
2072  // most reduction operations freely under MVE using a combination of
2073  // prefer-predicated-reduction-select and inloop reductions. We limit this to
2074  // floating point and integer reductions, but don't check for operators
2075  // specifically here. If the value ends up not being a reduction (and so the
2076  // vectorizer cannot tailfold the loop), we should fall back to standard
2077  // vectorization automatically.
2078  SmallVector<Instruction *, 8> LiveOuts;
2079  LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2080  bool ReductionsDisabled =
2081  EnableTailPredication == TailPredication::EnabledNoReductions ||
2082  EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2083 
2084  for (auto *I : LiveOuts) {
2085  if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2086  !I->getType()->isHalfTy()) {
2087  LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2088  "live-out value\n");
2089  return false;
2090  }
2091  if (ReductionsDisabled) {
2092  LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2093  return false;
2094  }
2095  }
2096 
2097  // Next, check that all instructions can be tail-predicated.
2098  PredicatedScalarEvolution PSE = LAI->getPSE();
2099  SmallVector<Instruction *, 16> LoadStores;
2100  int ICmpCount = 0;
2101 
2102  for (BasicBlock *BB : L->blocks()) {
2103  for (Instruction &I : BB->instructionsWithoutDebug()) {
2104  if (isa<PHINode>(&I))
2105  continue;
2106  if (!canTailPredicateInstruction(I, ICmpCount)) {
2107  LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2108  return false;
2109  }
2110 
2111  Type *T = I.getType();
2112  if (T->isPointerTy())
2113  T = T->getPointerElementType();
2114 
2115  if (T->getScalarSizeInBits() > 32) {
2116  LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2117  return false;
2118  }
2119  if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2120  Value *Ptr = getLoadStorePointerOperand(&I);
2121  Type *AccessTy = getLoadStoreType(&I);
2122  int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
2123  if (NextStride == 1) {
2124  // TODO: for now only allow consecutive strides of 1. We could support
2125  // other strides as long as it is uniform, but let's keep it simple
2126  // for now.
2127  continue;
2128  } else if (NextStride == -1 ||
2129  (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2130  (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2131  LLVM_DEBUG(dbgs()
2132  << "Reversed or interleaved stride (-1, 2 or 4) found; vld2/vst2 "
2133  "can't be tail-predicated.\n");
2134  return false;
2135  // TODO: don't tail predicate if there is a reversed load?
2136  } else if (EnableMaskedGatherScatters) {
2137  // Gather/scatters do allow loading from arbitrary strides, at
2138  // least if they are loop invariant.
2139  // TODO: Loop variant strides should in theory work, too, but
2140  // this requires further testing.
2141  const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2142  if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2143  const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2144  if (PSE.getSE()->isLoopInvariant(Step, L))
2145  continue;
2146  }
2147  }
2148  LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2149  "tail-predicate.\n");
2150  return false;
2151  }
2152  }
2153  }
2154 
2155  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2156  return true;
2157 }
2158 
2159 bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2160  ScalarEvolution &SE,
2161  AssumptionCache &AC,
2162  TargetLibraryInfo *TLI,
2163  DominatorTree *DT,
2164  const LoopAccessInfo *LAI) {
2165  if (!EnableTailPredication) {
2166  LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2167  return false;
2168  }
2169 
2170  // Creating a predicated vector loop is the first step for generating a
2171  // tail-predicated hardware loop, for which we need the MVE masked
2172  // load/stores instructions:
2173  if (!ST->hasMVEIntegerOps())
2174  return false;
2175 
2176  // For now, restrict this to single block loops.
2177  if (L->getNumBlocks() > 1) {
2178  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2179  "loop.\n");
2180  return false;
2181  }
2182 
2183  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2184 
2185  HardwareLoopInfo HWLoopInfo(L);
2186  if (!HWLoopInfo.canAnalyze(*LI)) {
2187  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2188  "analyzable.\n");
2189  return false;
2190  }
2191 
2192  // This checks if we have the low-overhead branch architecture
2193  // extension, and if we will create a hardware-loop:
2194  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2195  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2196  "profitable.\n");
2197  return false;
2198  }
2199 
2200  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2201  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2202  "a candidate.\n");
2203  return false;
2204  }
2205 
2206  return canTailPredicateLoop(L, LI, SE, DL, LAI);
2207 }
2208 
2209 bool ARMTTIImpl::emitGetActiveLaneMask() const {
2210  if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2211  return false;
2212 
2213  // Intrinsic @llvm.get.active.lane.mask is supported.
2214  // It is used in the MVETailPredication pass, which requires the number of
2215  // elements processed by this vector loop to setup the tail-predicated
2216  // loop.
2217  return true;
2218 }
2219 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2220  TTI::UnrollingPreferences &UP,
2221  OptimizationRemarkEmitter *ORE) {
2222  // Enable upper-bound unrolling universally, not dependent on the conditions
2223  // below.
2224  UP.UpperBound = true;
2225 
2226  // Currently, only enable these preferences for M-Class cores.
2227  if (!ST->isMClass())
2228  return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2229 
2230  // Disable loop unrolling for Oz and Os.
2231  UP.OptSizeThreshold = 0;
2232  UP.PartialOptSizeThreshold = 0;
2233  if (L->getHeader()->getParent()->hasOptSize())
2234  return;
2235 
2236  SmallVector<BasicBlock*, 4> ExitingBlocks;
2237  L->getExitingBlocks(ExitingBlocks);
2238  LLVM_DEBUG(dbgs() << "Loop has:\n"
2239  << "Blocks: " << L->getNumBlocks() << "\n"
2240  << "Exit blocks: " << ExitingBlocks.size() << "\n");
2241 
2242  // Only allow another exit other than the latch. This acts as an early exit
2243  // as it mirrors the profitability calculation of the runtime unroller.
2244  if (ExitingBlocks.size() > 2)
2245  return;
2246 
2247  // Limit the CFG of the loop body for targets with a branch predictor.
2248  // Allowing 4 blocks permits if-then-else diamonds in the body.
2249  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2250  return;
2251 
2252  // Don't unroll vectorized loops, including the remainder loop
2253  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2254  return;
2255 
2256  // Scan the loop: don't unroll loops with calls as this could prevent
2257  // inlining.
2258  InstructionCost Cost = 0;
2259  for (auto *BB : L->getBlocks()) {
2260  for (auto &I : *BB) {
2261  // Don't unroll vectorised loops. MVE does not benefit from it as much as
2262  // scalar code.
2263  if (I.getType()->isVectorTy())
2264  return;
2265 
2266  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2267  if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2268  if (!isLoweredToCall(F))
2269  continue;
2270  }
2271  return;
2272  }
2273 
2274  SmallVector<const Value*, 4> Operands(I.operand_values());
2275  Cost +=
2276  getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2277  }
2278  }
2279 
2280  // On v6m cores, there are very few registers available. We can easily end up
2281  // spilling and reloading more registers in an unrolled loop. Look at the
2282  // number of LCSSA phis as a rough measure of how many registers will need to
2283  // be live out of the loop, reducing the default unroll count if more than 1
2284  // value is needed. In the long run, all of this should be learnt by a
2285  // machine.
2286  unsigned UnrollCount = 4;
2287  if (ST->isThumb1Only()) {
2288  unsigned ExitingValues = 0;
2289  SmallVector<BasicBlock *, 4> ExitBlocks;
2290  L->getExitBlocks(ExitBlocks);
2291  for (auto *Exit : ExitBlocks) {
2292  // Count the number of LCSSA phis. Exclude values coming from GEP's as
2293  // only the last is expected to be needed for address operands.
2294  unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2295  return PH.getNumOperands() != 1 ||
2296  !isa<GetElementPtrInst>(PH.getOperand(0));
2297  });
2298  ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2299  }
2300  if (ExitingValues)
2301  UnrollCount /= ExitingValues;
2302  if (UnrollCount <= 1)
2303  return;
2304  }
2305 
2306  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2307  LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2308 
2309  UP.Partial = true;
2310  UP.Runtime = true;
2311  UP.UnrollRemainder = true;
2312  UP.DefaultUnrollRuntimeCount = UnrollCount;
2313  UP.UnrollAndJam = true;
2314  UP.UnrollAndJamInnerLoopThreshold = 60;
2315 
2316  // Forcing the unrolling of small loops can be very useful because of the
2317  // branch-taken cost of the backedge.
2318  if (Cost < 12)
2319  UP.Force = true;
2320 }
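// Editorial worked example for the Thumb1 heuristic above: UnrollCount
// starts at 4, so a loop whose exit blocks carry two non-GEP LCSSA phis
// (ExitingValues == 2) unrolls 4 / 2 = 2 times, while three or more live-out
// values (4 / 3 == 1) suppress runtime unrolling altogether.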
2321 
2324  BaseT::getPeelingPreferences(L, SE, PP);
2325 }
2326 
2327 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2328  TTI::ReductionFlags Flags) const {
2329  if (!ST->hasMVEIntegerOps())
2330  return false;
2331 
2332  unsigned ScalarBits = Ty->getScalarSizeInBits();
2333  switch (Opcode) {
2334  case Instruction::Add:
2335  return ScalarBits <= 64;
2336  default:
2337  return false;
2338  }
2339 }
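// Editorial example: with MVE integer ops available, an i32 add reduction
// returns true here, steering the vectorizer towards an in-loop reduction
// (e.g. a VADDVA accumulating across the loop); any non-Add opcode falls
// through to false.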
2340 
2341 bool ARMTTIImpl::preferPredicatedReductionSelect(
2342  unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2343  if (!ST->hasMVEIntegerOps())
2344  return false;
2345  return true;
2346 }
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:240
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
ARMSubtarget.h
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::TargetLoweringBase::getMaxStoresPerMemmove
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
Definition: TargetLowering.h:1658
llvm::TargetTransformInfo::CastContextHint::Masked
@ Masked
The cast is used with a masked load/store.
ValueTypes.h
CmpMode::FP
@ FP
llvm::TargetTransformInfo::UnrollingPreferences::PartialOptSizeThreshold
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
Definition: TargetTransformInfo.h:460
llvm::ScalarEvolution::hasLoopInvariantBackedgeTakenCount
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
Definition: ScalarEvolution.cpp:12767
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition: TargetTransformInfo.h:871
llvm::BasicTTIImplBase< ARMTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:39
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:488
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:211
llvm::SPF_SMAX
@ SPF_SMAX
Unsigned minimum.
Definition: ValueTracking.h:680
llvm::ISD::VECTOR_SHUFFLE
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:563
llvm::ARM_AM::isThumbImmShiftedVal
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
Definition: ARMAddressingModes.h:235
llvm::MVT::v4f16
@ v4f16
Definition: MachineValueType.h:136
llvm::TargetTransformInfo::TCC_Expensive
@ TCC_Expensive
The cost of a 'div' instruction on x86.
Definition: TargetTransformInfo.h:264
llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:184
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AllocatorList.h:22
llvm::TargetTransformInfo::ReductionFlags
Flags describing the kind of vector reduction.
Definition: TargetTransformInfo.h:1366
llvm::ConvertCostTableLookup
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType >> Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
llvm::CostTblEntryT
Cost Table Entry.
Definition: CostTable.h:25
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::LoopBase::getExitBlocks
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
Definition: LoopInfoImpl.h:62
llvm::ARMTTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: ARMTargetTransformInfo.cpp:1512
llvm::ARMSubtarget::hasMVEFloatOps
bool hasMVEFloatOps() const
Definition: ARMSubtarget.h:651
llvm::HardwareLoopInfo::LoopDecrement
Value * LoopDecrement
Definition: TargetTransformInfo.h:103
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:633
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
InstCombiner.h
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:721
llvm::InstCombiner::getDominatorTree
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:370
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
IntrinsicInst.h
ceil
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g ceil
Definition: README-FPStack.txt:54
DisableLowOverheadLoops
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:425
T
llvm::Function
Definition: Function.h:62
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
llvm::ISD::UDIV
@ UDIV
Definition: ISDOpcodes.h:243
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
llvm::SelectPatternResult::Flavor
SelectPatternFlavor Flavor
Definition: ValueTracking.h:700
llvm::TargetTransformInfoImplCRTPBase< ARMTTIImpl >::getUserCost
InstructionCost getUserCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
Definition: TargetTransformInfoImpl.h:962
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:673
llvm::BasicTTIImplBase< ARMTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1065
llvm::TargetTransformInfo::AMK_PostIndexed
@ AMK_PostIndexed
Definition: TargetTransformInfo.h:644
llvm::ARMTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: ARMTargetTransformInfo.cpp:1203
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:729
MVEMaxSupportedInterleaveFactor
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor
llvm::ARMTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: ARMTargetTransformInfo.cpp:85
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:308
llvm::PredicatedScalarEvolution
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
Definition: ScalarEvolution.h:2179
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:133
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1175
llvm::InstCombiner::Builder
BuilderTy & Builder
Definition: InstCombiner.h:58
llvm::ARMSubtarget::hasFPARMv8Base
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:678
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1479
llvm::CmpInst::makeCmpResultType
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1046
llvm::IRBuilder< TargetFolder, IRBuilderCallbackInserter >
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:150
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:460
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:757
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJamInnerLoopThreshold
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
Definition: TargetTransformInfo.h:507
Local.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
llvm::TargetTransformInfo::UnrollingPreferences::UnrollRemainder
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
Definition: TargetTransformInfo.h:500
llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:785
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:178
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:214
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
llvm::ARMSubtarget::hasV6T2Ops
bool hasV6T2Ops() const
Definition: ARMSubtarget.h:632
EnableMaskedLoadStores
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
llvm::MemOp
Definition: TargetLowering.h:111
APInt.h
llvm::ARMTTIImpl::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1553
llvm::getLoadStoreType
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Definition: Instructions.h:5364
llvm::HardwareLoopInfo::isHardwareLoopCandidate
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
Definition: TargetTransformInfo.cpp:100
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:748
llvm::ARMTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:456
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:484
llvm::findDefsUsedOutsideOfLoop
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:132
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::getPtrStride
int64_t getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
Definition: LoopAccessAnalysis.cpp:1051
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1412
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:538
llvm::tgtok::Bits
@ Bits
Definition: TGLexer.h:50
llvm::Instruction::isShift
bool isShift() const
Definition: Instruction.h:167
llvm::BasicTTIImplBase< ARMTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:839
llvm::SPF_UMAX
@ SPF_UMAX
Signed maximum.
Definition: ValueTracking.h:681
llvm::Optional
Definition: APInt.h:33
llvm::ARMSubtarget::hasMVEIntegerOps
bool hasMVEIntegerOps() const
Definition: ARMSubtarget.h:650
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:41
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:419
llvm::TargetLoweringBase::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: TargetLoweringBase.cpp:1848
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:751
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
RHS
Value * RHS
Definition: X86PartialReduction.cpp:74
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::matchSelectPattern
SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
Definition: ValueTracking.cpp:6115
llvm::SelectPatternFlavor
SelectPatternFlavor
Specific patterns of select instructions we can match.
Definition: ValueTracking.h:676
llvm::MVT::v2f64
@ v2f64
Definition: MachineValueType.h:172
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:525
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::count_if
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1682
llvm::TargetLoweringBase::getOperationAction
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
Definition: TargetLowering.h:1077
llvm::ARMTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: ARMTargetTransformInfo.cpp:2219
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:241
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:185
llvm::APIntOps::umin
const APInt & umin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be unsigned.
Definition: APInt.h:2133
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1233
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:869
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::ARMTTIImpl::isLegalMaskedLoad
bool isLegalMaskedLoad(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1095
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1336
llvm::ARMTTIImpl::isHardwareLoopProfitable
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: ARMTargetTransformInfo.cpp:1920
KnownBits.h
llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(Optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1189
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::HardwareLoopInfo::IsNestingLegal
bool IsNestingLegal
Definition: TargetTransformInfo.h:105
floor
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g floor
Definition: README-FPStack.txt:54
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:130
MachineValueType.h
UnrollCount
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:185
Instruction.h
llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition: DerivedTypes.h:568
LHS
Value * LHS
Definition: X86PartialReduction.cpp:73
llvm::ARMSubtarget::hasLOB
bool hasLOB() const
Definition: ARMSubtarget.h:686
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::InstCombiner::replaceOperand
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:438
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1356
llvm::MVT::i1
@ i1
Definition: MachineValueType.h:43
llvm::APInt::isNonNegative
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:317
llvm::MVT::v8f16
@ v8f16
Definition: MachineValueType.h:137
llvm::APInt::isNegative
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:312
llvm::BasicTTIImplBase::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:493
llvm::ARM_AM::getSOImmVal
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
Definition: ARMAddressingModes.h:163
SubtargetFeature.h
TargetMachine.h
llvm::TypeConversionCostTblEntryT
Type Conversion Cost Table.
Definition: CostTable.h:55
llvm::ScalarEvolution::getOne
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
Definition: ScalarEvolution.h:652
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:679
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:729
EnableTailPredication
cl::opt< TailPredication::Mode > EnableTailPredication
llvm::BasicTTIImplBase< ARMTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1118
llvm::ARMTTIImpl::getIntImmCostInst
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: ARMTargetTransformInfo.cpp:385
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:868
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1074
llvm::User
Definition: User.h:44
llvm::TargetLoweringBase::getMaxStoresPerMemset
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
Definition: TargetLowering.h:1619
llvm::BasicTTIImplBase< ARMTTIImpl >::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
Definition: BasicTTIImpl.h:2036
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
llvm::getKnownAlignment
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:230
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:496
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:56
llvm::ARMSubtarget::hasBranchPredictor
bool hasBranchPredictor() const
Definition: ARMSubtarget.h:741
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
llvm::EVT::getVectorNumElements
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:309
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2149
AllowWLSLoops
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:735
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
llvm::APInt::getLimitedValue
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition: APInt.h:456
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:226
llvm::ARMTTIImpl::getIntImmCodeSizeCost
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty)
Definition: ARMTargetTransformInfo.cpp:328
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::MVT::v16i1
@ v16i1
Definition: MachineValueType.h:68
llvm::ISD::UDIVREM
@ UDIVREM
Definition: ISDOpcodes.h:256
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
Definition: PatternMatch.h:2235
llvm::ARMTargetLowering::useSoftFloat
bool useSoftFloat() const override
Definition: ARMISelLowering.cpp:1594
llvm::MVT::v8i1
@ v8i1
Definition: MachineValueType.h:67
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJam
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
Definition: TargetTransformInfo.h:502
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:145
llvm::EVT::isInteger
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:145
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:191
llvm::HardwareLoopInfo::PerformEntryTest
bool PerformEntryTest
Definition: TargetTransformInfo.h:109
canTailPredicateLoop
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2066
llvm::APInt::isAllOnes
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:347
llvm::ARMTTIImpl::isLoweredToCall
bool isLoweredToCall(const Function *F)
Definition: ARMTargetTransformInfo.cpp:1772
llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:739
llvm::InstCombiner::eraseInstFromFunction
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1467
llvm::APInt::getHighBitsSet
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:279
llvm::LoopBase::getExitingBlocks
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
Definition: LoopInfoImpl.h:34
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:925
LoopUtils.h
llvm::HardwareLoopInfo::CounterInReg
bool CounterInReg
Definition: TargetTransformInfo.h:107
llvm::ARMSubtarget::hasV6Ops
bool hasV6Ops() const
Definition: ARMSubtarget.h:629
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:632
llvm::ARMTTIImpl::emitGetActiveLaneMask
bool emitGetActiveLaneMask() const
Definition: ARMTargetTransformInfo.cpp:2209
PatternMatch.h
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:686
llvm::ARMSubtarget::hasFP64
bool hasFP64() const
Definition: ARMSubtarget.h:714
llvm::MVT::v1i64
@ v1i64
Definition: MachineValueType.h:117
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
llvm::isVREVMask
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
Definition: ARMTargetTransformInfo.h:319
llvm::ARM_AM::getT2SOImmVal
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
Definition: ARMAddressingModes.h:320
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::BasicTTIImplBase< ARMTTIImpl >::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1070
llvm::BasicTTIImplBase< ARMTTIImpl >::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:900
llvm::None
const NoneType None
Definition: None.h:23
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:91
llvm::lltok::Kind
Kind
Definition: LLToken.h:18
llvm::ARMSubtarget::getMVEVectorCostFactor
unsigned getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const
Definition: ARMSubtarget.h:949
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
llvm::MVT::v4i8
@ v4i8
Definition: MachineValueType.h:78
llvm::SPF_SMIN
@ SPF_SMIN
Definition: ValueTracking.h:678
Type.h
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:118
llvm::APInt::isAllOnesValue
bool isAllOnesValue() const
NOTE: This is soft-deprecated. Please use isAllOnes() instead.
Definition: APInt.h:356
llvm::Instruction::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:282
llvm::maxnum
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1307
LoopInfo.h
llvm::ARMTTIImpl::getNumMemOps
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
Definition: ARMTargetTransformInfo.cpp:1127
llvm::ARMTTIImpl::isProfitableLSRChainElement
bool isProfitableLSRChainElement(Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1078
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:78
llvm::ARMTTIImpl::preferPredicateOverEpilogue
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2159
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4339
llvm::PatternMatch::m_Xor
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1115
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:190
llvm::APInt::getOneBitSet
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:222
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:870
llvm::MVT::v2i8
@ v2i8
Definition: MachineValueType.h:77
llvm::MVT::v4i64
@ v4i64
Definition: MachineValueType.h:120
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::ARMTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1448
llvm::TargetTransformInfo::CastContextHint::Normal
@ Normal
The cast is used with a normal load/store.
BasicBlock.h
llvm::cl::opt< bool >
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:77
llvm::ARMTTIImpl::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1678
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:535
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::MVT::v16i8
@ v16i8
Definition: MachineValueType.h:80
llvm::CostTableLookup
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType >> Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:341
llvm::ARMSubtarget::isThumb1Only
bool isThumb1Only() const
Definition: ARMSubtarget.h:831
llvm::ScalarEvolution::getUnsignedRangeMax
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
Definition: ScalarEvolution.h:963
llvm::ARMTTIImpl::getMemcpyCost
InstructionCost getMemcpyCost(const Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1193
llvm::ARMTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Op1Info=TTI::OK_AnyValue, TTI::OperandValueKind Op2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: ARMTargetTransformInfo.cpp:1306
llvm::InstCombiner::getAssumptionCache
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:368
llvm::MVT::v16i16
@ v16i16
Definition: MachineValueType.h:93
Index
uint32_t Index
Definition: ELFObjHandler.cpp:83
llvm::MVT::v2i64
@ v2i64
Definition: MachineValueType.h:118
uint64_t
llvm::InstCombiner::getDataLayout
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:371
llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition: ISDOpcodes.h:786
llvm::ARMTTIImpl::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr)
Definition: ARMTargetTransformInfo.cpp:1056
llvm::MVT::v16f32
@ v16f32
Definition: MachineValueType.h:162
llvm::Instruction::user_back
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:91
llvm::TruncInst
This class represents a truncation of integer types.
Definition: Instructions.h:4773
llvm::TargetTransformInfo::OK_UniformConstantValue
@ OK_UniformConstantValue
Definition: TargetTransformInfo.h:890
llvm::ARMSubtarget::isMClass
bool isMClass() const
Definition: ARMSubtarget.h:834
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:431
isSSATMinMaxPattern
static Value * isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
Definition: ARMTargetTransformInfo.cpp:339
I
#define I(x, y, z)
Definition: MD5.cpp:58
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:117
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:895
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:441
llvm::ARMTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1709
llvm::LoopAccessInfo
Drive the analysis of memory accesses in the loop.
Definition: LoopAccessAnalysis.h:511
llvm::ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic
Optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
Definition: ARMTargetTransformInfo.cpp:251
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:141
llvm::MVT::v4f32
@ v4f32
Definition: MachineValueType.h:157
llvm::MVT::i8
@ i8
Definition: MachineValueType.h:44
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:80
llvm::MVT::Other
@ Other
Definition: MachineValueType.h:42
llvm::ARMSubtarget::hasSlowLoadDSubregister
bool hasSlowLoadDSubregister() const
Definition: ARMSubtarget.h:728
llvm::SPF_ABS
@ SPF_ABS
Floating point maxnum.
Definition: ValueTracking.h:684
memcpy
<%struct.s * > cast struct s *S to sbyte *< sbyte * > sbyte uint cast struct s *agg result to sbyte *< sbyte * > sbyte uint cast struct s *memtmp to sbyte *< sbyte * > sbyte uint ret void llc ends up issuing two memcpy or custom lower memcpy(of small size) to be ldmia/stmia. I think option 2 is better but the current register allocator cannot allocate a chunk of registers at a time. A feasible temporary solution is to use specific physical registers at the lowering time for small(<
llvm::TargetLoweringBase::getMaxStoresPerMemcpy
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
Definition: TargetLowering.h:1629
llvm::TargetTransformInfoImplBase::isLoweredToCall
bool isLoweredToCall(const Function *F) const
Definition: TargetTransformInfoImpl.h:124
llvm::ARMTTIImpl::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1495
llvm::ARMSubtarget::hasFullFP16
bool hasFullFP16() const
Definition: ARMSubtarget.h:758
llvm::ARMTTIImpl::preferInLoopReduction
bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2327
function
Definition: MemDepPrinter.cpp:83
llvm::IRBuilderBase::CreateIntrinsic
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with args, mangled using Types.
Definition: IRBuilder.cpp:851
llvm::BasicTTIImplBase< ARMTTIImpl >::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:1219
llvm::IRBuilderBase::getTrue
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:458
llvm::PatternMatch::m_Constant
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:142
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:887
Builder
Definition: AssumeBundleBuilder.cpp:650
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
llvm::ARMTTIImpl::getPreferredAddressingMode
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
Definition: ARMTargetTransformInfo.cpp:104
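A minimal usage sketch (not from the source): a client queries this hook through the generic TargetTransformInfo wrapper. The TTI, L and SE objects are assumed to be supplied by the enclosing pass's analyses.
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;
// Sketch only: returns true when the target asks for post-indexed addressing.
static bool prefersPostIndexed(const TargetTransformInfo &TTI, const Loop *L,
                               ScalarEvolution *SE) {
  return TTI.getPreferredAddressingMode(L, SE) ==
         TargetTransformInfo::AMK_PostIndexed;
}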
llvm::APIntOps::smin
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2123
llvm::ArrayRef< int >
llvm::LoopInfo
Definition: LoopInfo.h:1086
llvm::EVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:155
ARMAddressingModes.h
llvm::ARMSubtarget::hasNEON
bool hasNEON() const
Definition: ARMSubtarget.h:679
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
DataLayout.h
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:47
llvm::MVT::v2i32
@ v2i32
Definition: MachineValueType.h:101
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::ARMTTIImpl::isLegalMaskedGather
bool isLegalMaskedGather(Type *Ty, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1115
llvm::BasicTTIImplBase< ARMTTIImpl >::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:678
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:215
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:134
llvm::ISD::SREM
@ SREM
Definition: ISDOpcodes.h:244
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::MVT::v2f32
@ v2f32
Definition: MachineValueType.h:155
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:991
llvm::ARMTTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:910
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:467
llvm::SPF_FMINNUM
@ SPF_FMINNUM
Floating point minnum.
Definition: ValueTracking.h:682
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1768
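An illustrative wrapper (not from the source) showing the direction of this mapping; the TLI pointer is assumed to come from the current subtarget.
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instruction.h"
// IR-level opcodes map onto the SelectionDAG opcodes used by the cost tables.
static bool addMapsToISDAdd(const llvm::TargetLoweringBase *TLI) {
  return TLI->InstructionOpcodeToISD(llvm::Instruction::Add) == llvm::ISD::ADD;
}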
llvm::TargetTransformInfo::AddressingModeKind
AddressingModeKind
Definition: TargetTransformInfo.h:642
llvm::MVT::v4i32
@ v4i32
Definition: MachineValueType.h:103
llvm::BasicTTIImplBase< ARMTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:565
llvm::SPF_FMAXNUM
@ SPF_FMAXNUM
Floating point maxnum.
Definition: ValueTracking.h:683
canTailPredicateInstruction
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
Definition: ARMTargetTransformInfo.cpp:2015
llvm::MVT::v8i64
@ v8i64
Definition: MachineValueType.h:121
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:634
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:661
llvm::ScalarEvolution::isLoopInvariant
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
Definition: ScalarEvolution.cpp:13036
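A common pattern, sketched with assumed SE, L and Ptr objects from the surrounding analysis:
// Does the pointer's SCEV expression stay fixed across all iterations of L?
const llvm::SCEV *PtrSCEV = SE->getSCEV(Ptr);
bool Invariant = SE->isLoopInvariant(PtrSCEV, L);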
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:417
llvm::MVT::v16i32
@ v16i32
Definition: MachineValueType.h:108
llvm::MemOp::Set
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
Definition: TargetLowering.h:144
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:162
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:338
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:165
llvm::APIntOps::umax
const APInt & umax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be unsigned.
Definition: APInt.h:2138
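A self-contained APInt example (illustrative, not from the source) showing how the signed and unsigned comparators interpret the same bit pattern differently:
#include "llvm/ADT/APInt.h"
using llvm::APInt;
APInt A(8, 0xFA);                     // 250 unsigned, -6 as a signed 8-bit value
APInt B(8, 0x05);                     // 5 under either interpretation
APInt S = llvm::APIntOps::smin(A, B); // picks A: -6 < 5 when compared signed
APInt U = llvm::APIntOps::umax(A, B); // also picks A: 250 > 5 compared unsigned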
llvm::minnum
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE minNum semantics.
Definition: APFloat.h:1296
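A short illustrative example of the minNum behaviour: a quiet NaN operand loses to an ordinary number.
#include "llvm/ADT/APFloat.h"
using llvm::APFloat;
APFloat M = llvm::minnum(APFloat(2.0f), APFloat(1.0f));            // 1.0
APFloat N = llvm::minnum(APFloat::getQNaN(APFloat::IEEEsingle()),
                         APFloat(1.0f));                           // 1.0, not NaN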
llvm::MemOp::Copy
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
Definition: TargetLowering.h:129
llvm::KnownBits
Definition: KnownBits.h:23
isFPSatMinMaxPattern
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm)
Definition: ARMTargetTransformInfo.cpp:373
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:147
llvm::BasicTTIImplBase< ARMTTIImpl >::getCallInstrCost
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
Definition: BasicTTIImpl.h:2024
CostTable.h
llvm::EVT::getScalarType
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:296
llvm::TargetTransformInfo::UnrollingPreferences::UpperBound
bool UpperBound
Allow using trip count upper bound to unroll loops.
Definition: TargetTransformInfo.h:498
llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
llvm::CallBase::arg_size
unsigned arg_size() const
Definition: InstrTypes.h:1341
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:196
llvm::MVT::v8i16
@ v8i16
Definition: MachineValueType.h:92
llvm::ARMTTIImpl::getIntImmCost
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:293
ISDOpcodes.h
llvm::APInt::isNegatedPowerOf2
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:434
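One illustrative check; the signed constructor overload is used so -8 sign-extends to the full width:
#include "llvm/ADT/APInt.h"
// -8 == -(1 << 3), so its negation is a power of two greater than zero.
bool NP2 = llvm::APInt(32, -8, /*isSigned=*/true).isNegatedPowerOf2(); // true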
llvm::TypeSize
Definition: TypeSize.h:416
Casting.h
llvm::BasicTTIImplBase< ARMTTIImpl >::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:1203
llvm::Value::hasNUses
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
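A typical single-use test (sketch; V is an assumed llvm::Value *). hasNUses walks at most N + 1 uses, so it is cheaper than counting the whole use list:
if (V->hasNUses(1)) {
  // Exactly one user: V can be folded into that user without duplicating work.
}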
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::MVT::i32
@ i32
Definition: MachineValueType.h:46
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:221
llvm::ISD::SDIV
@ SDIV
Definition: ISDOpcodes.h:242
llvm::log2
static double log2(double V)
Definition: AMDGPULibCalls.cpp:801
llvm::APInt::getSplat
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:595
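A self-contained example of splatting a narrow pattern into a wider value:
#include "llvm/ADT/APInt.h"
// Replicate the 8-bit pattern 0x01 across 32 bits, producing 0x01010101.
llvm::APInt Splat = llvm::APInt::getSplat(32, llvm::APInt(8, 0x01));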
llvm::InstCombiner::SimplifyDemandedBits
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, unsigned Depth=0)=0
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::MVT::v8i32
@ v8i32
Definition: MachineValueType.h:107
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition: ISDOpcodes.h:740
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::TargetLoweringBase::isOperationLegalOrCustomOrPromote
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lowering or using promotion.
Definition: TargetLowering.h:1172
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::HardwareLoopInfo
Attributes of a target dependent hardware loop.
Definition: TargetTransformInfo.h:95
llvm::TargetTransformInfoImplBase::isConstantStridedAccessLessThan
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
Definition: TargetTransformInfoImpl.h:873
llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:837
llvm::BasicTTIImplBase< ARMTTIImpl >::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1210
llvm::ARMTTIImpl::maybeLoweredToCall
bool maybeLoweredToCall(Instruction &I)
Definition: ARMTargetTransformInfo.cpp:1833
Instructions.h
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:148
llvm::ARMTTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:470
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:657
SmallVector.h
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:780
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::ISD::UREM
@ UREM
Definition: ISDOpcodes.h:245
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:54
llvm::TailPredication::EnabledNoReductions
@ EnabledNoReductions
Definition: ARMTargetTransformInfo.h:44
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1343
llvm::BasicTTIImplBase< ARMTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: BasicTTIImpl.h:872
llvm::BasicTTIImplBase< ARMTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1359
llvm::ARMTTIImpl::isLegalMaskedStore
bool isLegalMaskedStore(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.h:188
llvm::SPF_UMIN
@ SPF_UMIN
Unsigned minimum.
Definition: ValueTracking.h:679
llvm::getBooleanLoopAttribute
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1087
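An illustrative query against a standard loop-metadata flag; L is an assumed llvm::Loop *:
// Honour an explicit "don't unroll" request attached as loop metadata.
if (llvm::getBooleanLoopAttribute(L, "llvm.loop.unroll.disable")) {
  // Skip unrolling this loop.
}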
llvm::ARMTargetLowering::getNumInterleavedAccesses
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given type.
Definition: ARMISelLowering.cpp:21155
simplifyNeonVld1
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
Definition: ARMTargetTransformInfo.cpp:66
llvm::APInt::getActiveBits
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1436
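A small example: the active-bit count runs up to and including the highest set bit:
#include "llvm/ADT/APInt.h"
// 0x00F0 has its top set bit at position 7, so 8 bits are active.
unsigned Active = llvm::APInt(32, 0x00F0).getActiveBits(); // == 8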
llvm::ARMTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: ARMTargetTransformInfo.cpp:2322
llvm::BasicTTIImplBase< ARMTTIImpl >::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1159
ARMTargetTransformInfo.h
llvm::HardwareLoopInfo::canAnalyze
bool canAnalyze(LoopInfo &LI)
Definition: TargetTransformInfo.cpp:47
llvm::IRBuilderBase::CreateVectorSplat
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
Definition: IRBuilder.cpp:1108
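A sketch of the splat builder, assuming Builder is an llvm::IRBuilder<> positioned at a valid insertion point and V is an i32 llvm::Value *:
// Emit a <4 x i32> whose four lanes all hold the value V.
llvm::Value *SplatVec = Builder.CreateVectorSplat(4, V, "splat");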
DerivedTypes.h
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:381
llvm::getLoadStorePointerOperand
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Definition: Instructions.h:5319
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
EnableMaskedGatherScatters
cl::opt< bool > EnableMaskedGatherScatters
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2440
llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:289
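A one-line mask construction example:
#include "llvm/ADT/APInt.h"
// A 32-bit value with only the low 8 bits set: 0x000000FF.
llvm::APInt Mask = llvm::APInt::getLowBitsSet(32, 8);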
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:45
llvm::TargetTransformInfo::UnrollingPreferences::OptSizeThreshold
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Definition: TargetTransformInfo.h:453
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:172
llvm::ARMTTIImpl::preferPredicatedReductionSelect
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2341
llvm::ISD::SDIVREM
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
llvm::ARMTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: ARMTargetTransformInfo.cpp:874
llvm::ARMSubtarget::isThumb2
bool isThumb2() const
Definition: ARMSubtarget.h:832
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:412
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1439
llvm::LoopAccessInfo::getPSE
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
Definition: LoopAccessAnalysis.h:583
llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:726
llvm::MVT::v8i8
@ v8i8
Definition: MachineValueType.h:79
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
Definition: ScalarEvolution.cpp:7715
llvm::MVT::v8f32
@ v8f32
Definition: MachineValueType.h:161
llvm::PredicatedScalarEvolution::getSE
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
Definition: ScalarEvolution.h:2211
llvm::ARMTTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: ARMTargetTransformInfo.cpp:120
llvm::BinaryOperator::Create
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), Instruction *InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Definition: Instructions.cpp:2750
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:89
llvm::MVT::v16i64
@ v16i64
Definition: MachineValueType.h:122
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1282
llvm::BasicTTIImplBase< ARMTTIImpl >::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2217
llvm::HardwareLoopInfo::CountType
IntegerType * CountType
Definition: TargetTransformInfo.h:102
llvm::EVT::isFixedLengthVector
bool isFixedLengthVector() const
Definition: ValueTypes.h:165
llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:818
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:55
llvm::ARMTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1653
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:212
llvm::EVT::isFloatingPoint
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:140
llvm::TargetTransformInfo::AMK_None
@ AMK_None
Definition: TargetTransformInfo.h:645
llvm::TargetTransformInfo::AMK_PreIndexed
@ AMK_PreIndexed
Definition: TargetTransformInfo.h:643
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2128
llvm::ARMSubtarget::hasVFP2Base
bool hasVFP2Base() const
Definition: ARMSubtarget.h:675
llvm::TargetLoweringBase::LibCall
@ LibCall
Definition: TargetLowering.h:199
llvm::ARMSubtarget::isThumb
bool isThumb() const
Definition: ARMSubtarget.h:829
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:166
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:289
llvm::MVT::v4i1
@ v4i1
Definition: MachineValueType.h:66
llvm::ARMTargetLowering::isLegalInterleavedAccessType
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
Definition: ARMISelLowering.cpp:21160
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment padding.
Definition: DataLayout.h:506
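A sketch, assuming DL is the module's llvm::DataLayout and Ty an llvm::Type *; the alloc size includes tail padding, so it can exceed the type's store size:
// Number of bytes between successive elements of a [N x Ty] array.
llvm::TypeSize AllocSz = DL.getTypeAllocSize(Ty);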
llvm::TailPredication::ForceEnabledNoReductions
@ ForceEnabledNoReductions
Definition: ARMTargetTransformInfo.h:46