1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ARMTargetTransformInfo.h"
10 #include "ARMSubtarget.h"
11 #include "MCTargetDesc/ARMAddressingModes.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
18 #include "llvm/IR/BasicBlock.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DerivedTypes.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/Intrinsics.h"
24 #include "llvm/IR/IntrinsicInst.h"
25 #include "llvm/IR/IntrinsicsARM.h"
26 #include "llvm/IR/PatternMatch.h"
27 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/KnownBits.h"
36 #include <algorithm>
37 #include <cassert>
38 #include <cstdint>
39 #include <utility>
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "armtti"
44 
45 static cl::opt<bool> EnableMaskedLoadStores(
46  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47  cl::desc("Enable the generation of masked loads and stores"));
48 
49 static cl::opt<bool> DisableLowOverheadLoops(
50  "disable-arm-loloops", cl::Hidden, cl::init(false),
51  cl::desc("Disable the generation of low-overhead loops"));
52 
53 static cl::opt<bool>
54  AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55  cl::desc("Enable the generation of WLS loops"));
56 
57 extern cl::opt<TailPredication::Mode> EnableTailPredication;
58 
59 extern cl::opt<bool> EnableMaskedGatherScatters;
60 
61 extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
62 
63 /// Convert a vector load intrinsic into a simple llvm load instruction.
64 /// This is beneficial when the underlying object being addressed comes
65 /// from a constant, since we get constant-folding for free.
66 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
67  InstCombiner::BuilderTy &Builder) {
68  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69 
70  if (!IntrAlign)
71  return nullptr;
72 
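 // Use the larger of the alignment encoded in the intrinsic and the alignment
 // already known for the underlying memory.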
73  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74  ? MemAlign
75  : IntrAlign->getLimitedValue();
76 
77  if (!isPowerOf2_32(Alignment))
78  return nullptr;
79 
80  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81  PointerType::get(II.getType(), 0));
82  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83 }
84 
85 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
86  const Function *Callee) const {
87  const TargetMachine &TM = getTLI()->getTargetMachine();
88  const FeatureBitset &CallerBits =
89  TM.getSubtargetImpl(*Caller)->getFeatureBits();
90  const FeatureBitset &CalleeBits =
91  TM.getSubtargetImpl(*Callee)->getFeatureBits();
92 
93  // To inline a callee, all features not in the allowed list must match exactly.
94  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95  (CalleeBits & ~InlineFeaturesAllowed);
96  // For features in the allowed list, the callee's features must be a subset of
97  // the callers'.
98  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99  (CalleeBits & InlineFeaturesAllowed);
100  return MatchExact && MatchSubset;
101 }
102 
103 TTI::AddressingModeKind
104 ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
105  ScalarEvolution *SE) const {
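 // MVE has post-incrementing forms of its vector loads and stores, so
 // post-indexed addressing is generally profitable there.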
106  if (ST->hasMVEIntegerOps())
107  return TTI::AMK_PostIndexed;
108 
109  if (L->getHeader()->getParent()->hasOptSize())
110  return TTI::AMK_None;
111 
112  if (ST->isMClass() && ST->isThumb2() &&
113  L->getNumBlocks() == 1)
114  return TTI::AMK_PreIndexed;
115 
116  return TTI::AMK_None;
117 }
118 
119 Optional<Instruction *>
120 ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
121  using namespace PatternMatch;
122  Intrinsic::ID IID = II.getIntrinsicID();
123  switch (IID) {
124  default:
125  break;
126  case Intrinsic::arm_neon_vld1: {
127  Align MemAlign =
128  getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
129  &IC.getAssumptionCache(), &IC.getDominatorTree());
130  if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131  return IC.replaceInstUsesWith(II, V);
132  }
133  break;
134  }
135 
136  case Intrinsic::arm_neon_vld2:
137  case Intrinsic::arm_neon_vld3:
138  case Intrinsic::arm_neon_vld4:
139  case Intrinsic::arm_neon_vld2lane:
140  case Intrinsic::arm_neon_vld3lane:
141  case Intrinsic::arm_neon_vld4lane:
142  case Intrinsic::arm_neon_vst1:
143  case Intrinsic::arm_neon_vst2:
144  case Intrinsic::arm_neon_vst3:
145  case Intrinsic::arm_neon_vst4:
146  case Intrinsic::arm_neon_vst2lane:
147  case Intrinsic::arm_neon_vst3lane:
148  case Intrinsic::arm_neon_vst4lane: {
149  Align MemAlign =
150  getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
151  &IC.getAssumptionCache(), &IC.getDominatorTree());
152  unsigned AlignArg = II.arg_size() - 1;
153  Value *AlignArgOp = II.getArgOperand(AlignArg);
154  MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155  if (Align && *Align < MemAlign) {
156  return IC.replaceOperand(
157  II, AlignArg,
158  ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
159  false));
160  }
161  break;
162  }
163 
164  case Intrinsic::arm_mve_pred_i2v: {
165  Value *Arg = II.getArgOperand(0);
166  Value *ArgArg;
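 // pred_i2v(pred_v2i(x)) is just x, so the round-trip can be folded away.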
167  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168  PatternMatch::m_Value(ArgArg))) &&
169  II.getType() == ArgArg->getType()) {
170  return IC.replaceInstUsesWith(II, ArgArg);
171  }
172  Constant *XorMask;
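 // pred_i2v(pred_v2i(x) ^ 0xffff) inverts every predicate lane, which is the
 // same as xor'ing x with an all-true vector.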
173  if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174  PatternMatch::m_Value(ArgArg)),
175  PatternMatch::m_Constant(XorMask))) &&
176  II.getType() == ArgArg->getType()) {
177  if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178  if (CI->getValue().trunc(16).isAllOnes()) {
179  auto TrueVector = IC.Builder.CreateVectorSplat(
180  cast<FixedVectorType>(II.getType())->getNumElements(),
181  IC.Builder.getTrue());
182  return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183  }
184  }
185  }
186  KnownBits ScalarKnown(32);
187  if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188  ScalarKnown, 0)) {
189  return &II;
190  }
191  break;
192  }
193  case Intrinsic::arm_mve_pred_v2i: {
194  Value *Arg = II.getArgOperand(0);
195  Value *ArgArg;
196  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197  PatternMatch::m_Value(ArgArg)))) {
198  return IC.replaceInstUsesWith(II, ArgArg);
199  }
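 // The scalar form of a predicate only occupies the bottom 16 bits, so mark
 // the result with a [0, 0x10000) range.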
200  if (!II.getMetadata(LLVMContext::MD_range)) {
201  Type *IntTy32 = Type::getInt32Ty(II.getContext());
202  Metadata *M[] = {
203  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
204  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
205  II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206  return &II;
207  }
208  break;
209  }
210  case Intrinsic::arm_mve_vadc:
211  case Intrinsic::arm_mve_vadc_predicated: {
212  unsigned CarryOp =
213  (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214  assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
215  "Bad type for intrinsic!");
216 
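 // Only bit 29 of the carry operand (the position of the FPSCR carry flag)
 // is consumed, so the remaining bits can be simplified away.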
217  KnownBits CarryKnown(32);
218  if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219  CarryKnown)) {
220  return &II;
221  }
222  break;
223  }
224  case Intrinsic::arm_mve_vmldava: {
225  Instruction *I = cast<Instruction>(&II);
226  if (I->hasOneUse()) {
227  auto *User = cast<Instruction>(*I->user_begin());
228  Value *OpZ;
229  if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230  match(I->getOperand(3), m_Zero())) {
231  Value *OpX = I->getOperand(4);
232  Value *OpY = I->getOperand(5);
233  Type *OpTy = OpX->getType();
234 
235  IC.Builder.SetInsertPoint(User);
236  Value *V =
237  IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238  {I->getOperand(0), I->getOperand(1),
239  I->getOperand(2), OpZ, OpX, OpY});
240 
241  IC.replaceInstUsesWith(*User, V);
242  return IC.eraseInstFromFunction(*User);
243  }
244  }
245  return None;
246  }
247  }
248  return None;
249 }
250 
251 Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
252  InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
253  APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
254  std::function<void(Instruction *, unsigned, APInt, APInt &)>
255  SimplifyAndSetOp) const {
256 
257  // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is
258  // the index of the operand that selects a Top/Bottom instruction, which
259  // differs between intrinsics.
260  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
261  unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
262  unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
263 
264  // Only the odd or even lanes of operand 0 are demanded, depending on
265  // whether this is a top or bottom instruction.
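 // For example, with 8 lanes and IsTop == 1 the splat below is 0b01010101,
 // i.e. only the even (bottom) lanes of operand 0 stay demanded.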
266  APInt DemandedElts =
267  APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
268  : APInt::getHighBitsSet(2, 1));
269  SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
270  // The other lanes will be defined from the inserted elements.
271  UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
272  : APInt::getHighBitsSet(2, 1));
273  return None;
274  };
275 
276  switch (II.getIntrinsicID()) {
277  default:
278  break;
279  case Intrinsic::arm_mve_vcvt_narrow:
280  SimplifyNarrowInstrTopBottom(2);
281  break;
282  case Intrinsic::arm_mve_vqmovn:
283  SimplifyNarrowInstrTopBottom(4);
284  break;
285  case Intrinsic::arm_mve_vshrn:
286  SimplifyNarrowInstrTopBottom(7);
287  break;
288  }
289 
290  return None;
291 }
292 
293 InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
294  TTI::TargetCostKind CostKind) {
295  assert(Ty->isIntegerTy());
296 
297  unsigned Bits = Ty->getPrimitiveSizeInBits();
298  if (Bits == 0 || Imm.getActiveBits() >= 64)
299  return 4;
300 
301  int64_t SImmVal = Imm.getSExtValue();
302  uint64_t ZImmVal = Imm.getZExtValue();
303  if (!ST->isThumb()) {
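 // In ARM mode, a value in the MOVW range (0..65535), a rotated 8-bit
 // immediate, or the complement of one (materialised with MVN) is a single
 // instruction.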
304  if ((SImmVal >= 0 && SImmVal < 65536) ||
305  (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
306  (ARM_AM::getSOImmVal(~ZImmVal) != -1))
307  return 1;
308  return ST->hasV6T2Ops() ? 2 : 3;
309  }
310  if (ST->isThumb2()) {
311  if ((SImmVal >= 0 && SImmVal < 65536) ||
312  (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
313  (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
314  return 1;
315  return ST->hasV6T2Ops() ? 2 : 3;
316  }
317  // In Thumb1, any i8 immediate costs 1.
318  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
319  return 1;
320  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
321  return 2;
322  // Load from constantpool.
323  return 3;
324 }
325 
326 // Constants smaller than 256 fit in the immediate field of
327 // Thumb1 instructions, so we return a cost of zero for them and 1 otherwise.
328 InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
329  const APInt &Imm, Type *Ty) {
330  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
331  return 0;
332 
333  return 1;
334 }
335 
336 // Checks whether Inst is part of a min(max()) or max(min()) pattern
337 // that will match to an SSAT instruction. Returns the instruction being
338 // saturated, or null if no saturation pattern was found.
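// For example, smax(smin(x, 127), -128) saturates x to 8 bits and can be
// selected as SSAT #8.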
339 static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
340  Value *LHS, *RHS;
341  ConstantInt *C;
342  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
343 
344  if (InstSPF == SPF_SMAX &&
345  PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
346  C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
347 
348  auto isSSatMin = [&](Value *MinInst) {
349  if (isa<SelectInst>(MinInst)) {
350  Value *MinLHS, *MinRHS;
351  ConstantInt *MinC;
352  SelectPatternFlavor MinSPF =
353  matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
354  if (MinSPF == SPF_SMIN &&
355  PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
356  MinC->getValue() == ((-Imm) - 1))
357  return true;
358  }
359  return false;
360  };
361 
362  if (isSSatMin(Inst->getOperand(1)))
363  return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
364  if (Inst->hasNUses(2) &&
365  (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
366  return Inst->getOperand(1);
367  }
368  return nullptr;
369 }
370 
371 // Look for an FP saturation pattern, where the instruction can be simplified to
372 // a fptosi.sat, i.e. max(min(fptosi)). The constant in this case is always free.
373 static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
374  if (Imm.getBitWidth() != 64 ||
375  Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
376  return false;
377  Value *FP = isSSATMinMaxPattern(Inst, Imm);
378  if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
379  FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
380  if (!FP)
381  return false;
382  return isa<FPToSIInst>(FP);
383 }
384 
385 InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
386  const APInt &Imm, Type *Ty,
387  TTI::TargetCostKind CostKind,
388  Instruction *Inst) {
389  // Division by a constant can be turned into multiplication, but only if we
390  // know it's constant. So it's not so much that the immediate is cheap (it's
391  // not), but that the alternative is worse.
392  // FIXME: this is probably unneeded with GlobalISel.
393  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
394  Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
395  Idx == 1)
396  return 0;
397 
398  // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
399  // splitting any large offsets.
400  if (Opcode == Instruction::GetElementPtr && Idx != 0)
401  return 0;
402 
403  if (Opcode == Instruction::And) {
404  // UXTB/UXTH
405  if (Imm == 255 || Imm == 65535)
406  return 0;
407  // Conversion to BIC is free, and means we can use ~Imm instead.
408  return std::min(getIntImmCost(Imm, Ty, CostKind),
409  getIntImmCost(~Imm, Ty, CostKind));
410  }
411 
412  if (Opcode == Instruction::Add)
413  // Conversion to SUB is free, and means we can use -Imm instead.
414  return std::min(getIntImmCost(Imm, Ty, CostKind),
415  getIntImmCost(-Imm, Ty, CostKind));
416 
417  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
418  Ty->getIntegerBitWidth() == 32) {
419  int64_t NegImm = -Imm.getSExtValue();
420  if (ST->isThumb2() && NegImm < 1<<12)
421  // icmp X, #-C -> cmn X, #C
422  return 0;
423  if (ST->isThumb() && NegImm < 1<<8)
424  // icmp X, #-C -> adds X, #C
425  return 0;
426  }
427 
428  // xor a, -1 can always be folded to MVN
429  if (Opcode == Instruction::Xor && Imm.isAllOnes())
430  return 0;
431 
432  // Ensure that negative constants in min(max()) or max(min()) patterns that
433  // match SSAT instructions don't get hoisted.
434  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
435  Ty->getIntegerBitWidth() <= 32) {
436  if (isSSATMinMaxPattern(Inst, Imm) ||
437  (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
438  isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
439  return 0;
440  }
441 
442  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
443  return 0;
444 
445  // We can convert <= -1 to < 0, which is generally quite cheap.
446  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) {
447  ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
448  if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
449  return std::min(getIntImmCost(Imm, Ty, CostKind),
450  getIntImmCost(Imm + 1, Ty, CostKind));
451  }
452 
453  return getIntImmCost(Imm, Ty, CostKind);
454 }
455 
456 InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
457  TTI::TargetCostKind CostKind,
458  const Instruction *I) {
459  if (CostKind == TTI::TCK_RecipThroughput &&
460  (ST->hasNEON() || ST->hasMVEIntegerOps())) {
461  // FIXME: The vectorizer is highly sensitive to the cost of these
462  // instructions, which suggests that it may be using the costs incorrectly.
463  // But, for now, just make them free to avoid performance regressions for
464  // vector targets.
465  return 0;
466  }
467  return BaseT::getCFInstrCost(Opcode, CostKind, I);
468 }
469 
470 InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
471  Type *Src,
472  TTI::CastContextHint CCH,
473  TTI::TargetCostKind CostKind,
474  const Instruction *I) {
475  int ISD = TLI->InstructionOpcodeToISD(Opcode);
476  assert(ISD && "Invalid opcode");
477 
478  // TODO: Allow non-throughput costs that aren't binary.
479  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
480  if (CostKind != TTI::TCK_RecipThroughput)
481  return Cost == 0 ? 0 : 1;
482  return Cost;
483  };
484  auto IsLegalFPType = [this](EVT VT) {
485  EVT EltVT = VT.getScalarType();
486  return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
487  (EltVT == MVT::f64 && ST->hasFP64()) ||
488  (EltVT == MVT::f16 && ST->hasFullFP16());
489  };
490 
491  EVT SrcTy = TLI->getValueType(DL, Src);
492  EVT DstTy = TLI->getValueType(DL, Dst);
493 
494  if (!SrcTy.isSimple() || !DstTy.isSimple())
495  return AdjustCost(
496  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
497 
498  // Extending masked loads and truncating masked stores are expensive because
499  // we currently don't split them. This means that we'll likely end up
500  // loading/storing each element individually (hence the high cost).
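 // For example, a sign-extending masked load from v8i16 to v8i32 produces a
 // 256-bit result and is costed per element below.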
501  if ((ST->hasMVEIntegerOps() &&
502  (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
503  Opcode == Instruction::SExt)) ||
504  (ST->hasMVEFloatOps() &&
505  (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
506  IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
507  if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
508  return 2 * DstTy.getVectorNumElements() *
509  ST->getMVEVectorCostFactor(CostKind);
510 
511  // The extension of other kinds of loads is free.
512  if (CCH == TTI::CastContextHint::Normal ||
513  CCH == TTI::CastContextHint::Masked) {
514  static const TypeConversionCostTblEntry LoadConversionTbl[] = {
527  };
528  if (const auto *Entry = ConvertCostTableLookup(
529  LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
530  return AdjustCost(Entry->Cost);
531 
532  static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
539  // The following extend from a legal type to an illegal type, so we need to
540  // split the load. This introduces an extra load operation, but the
541  // extend is still "free".
548  };
549  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
550  if (const auto *Entry =
551  ConvertCostTableLookup(MVELoadConversionTbl, ISD,
552  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
553  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
554  }
555 
556  static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
557  // FPExtends are similar but also require the VCVT instructions.
560  };
561  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
562  if (const auto *Entry =
563  ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
564  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
565  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
566  }
567 
568  // The truncate of a store is free. This is the mirror of extends above.
569  static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
577  };
578  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
579  if (const auto *Entry =
580  ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
581  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
582  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
583  }
584 
585  static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
588  };
589  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
590  if (const auto *Entry =
591  ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
592  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
593  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
594  }
595  }
596 
597  // NEON vector operations that can extend their inputs.
598  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
599  I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
600  static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
601  // vaddl
602  { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
603  { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
604  // vsubl
605  { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
606  { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
607  // vmull
608  { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
609  { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
610  // vshll
611  { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
612  { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
613  };
614 
615  auto *User = cast<Instruction>(*I->user_begin());
616  int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
617  if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
618  DstTy.getSimpleVT(),
619  SrcTy.getSimpleVT())) {
620  return AdjustCost(Entry->Cost);
621  }
622  }
623 
624  // Single to/from double precision conversions.
625  if (Src->isVectorTy() && ST->hasNEON() &&
626  ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
627  DstTy.getScalarType() == MVT::f32) ||
628  (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
629  DstTy.getScalarType() == MVT::f64))) {
630  static const CostTblEntry NEONFltDblTbl[] = {
631  // Vector fptrunc/fpext conversions.
634  {ISD::FP_EXTEND, MVT::v4f32, 4}};
635 
636  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
637  if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
638  return AdjustCost(LT.first * Entry->Cost);
639  }
640 
641  // Some arithmetic, load and store operations have specific instructions
642  // to cast up/down their types automatically at no extra cost.
643  // TODO: Get these tables to know at least what the related operations are.
644  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
651 
652  // The number of vmovl instructions for the extension.
671 
672  // Operations that we legalize using splitting.
675 
676  // Vector float <-> i32 conversions.
679 
700 
707 
708  // Vector double <-> i32 conversions.
711 
718 
725  };
726 
727  if (SrcTy.isVector() && ST->hasNEON()) {
728  if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
729  DstTy.getSimpleVT(),
730  SrcTy.getSimpleVT()))
731  return AdjustCost(Entry->Cost);
732  }
733 
734  // Scalar float to integer conversions.
735  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
756  };
757  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
758  if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
759  DstTy.getSimpleVT(),
760  SrcTy.getSimpleVT()))
761  return AdjustCost(Entry->Cost);
762  }
763 
764  // Scalar integer to float conversions.
765  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
786  };
787 
788  if (SrcTy.isInteger() && ST->hasNEON()) {
789  if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
790  ISD, DstTy.getSimpleVT(),
791  SrcTy.getSimpleVT()))
792  return AdjustCost(Entry->Cost);
793  }
794 
795  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
796  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
797  // are linearised so take more.
798  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
811  };
812 
813  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
814  if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
815  ISD, DstTy.getSimpleVT(),
816  SrcTy.getSimpleVT()))
817  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
818  }
819 
820  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
821  // As a general rule, fp converts that were not matched above are scalarized
822  // and cost 1 vcvt for each lane, so long as the instruction is available.
823  // If not, it will become a series of function calls.
824  const InstructionCost CallCost =
825  getCallInstrCost(nullptr, Dst, {Src}, CostKind);
826  int Lanes = 1;
827  if (SrcTy.isFixedLengthVector())
828  Lanes = SrcTy.getVectorNumElements();
829 
830  if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
831  return Lanes;
832  else
833  return Lanes * CallCost;
834  }
835 
836  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
837  SrcTy.isFixedLengthVector()) {
838  // Treat a truncate with larger than legal source (128bits for MVE) as
839  // expensive, 2 instructions per lane.
840  if ((SrcTy.getScalarType() == MVT::i8 ||
841  SrcTy.getScalarType() == MVT::i16 ||
842  SrcTy.getScalarType() == MVT::i32) &&
843  SrcTy.getSizeInBits() > 128 &&
844  SrcTy.getSizeInBits() > DstTy.getSizeInBits())
845  return SrcTy.getVectorNumElements() * 2;
846  }
847 
848  // Scalar integer conversion costs.
849  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
850  // i16 -> i64 requires two dependent operations.
852 
853  // Truncates on i64 are assumed to be free.
856  { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
858  };
859 
860  if (SrcTy.isInteger()) {
861  if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
862  DstTy.getSimpleVT(),
863  SrcTy.getSimpleVT()))
864  return AdjustCost(Entry->Cost);
865  }
866 
867  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
868  ? ST->getMVEVectorCostFactor(CostKind)
869  : 1;
870  return AdjustCost(
871  BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
872 }
873 
874 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
875  unsigned Index) {
876  // Penalize inserting into a D-subregister. We end up with a three times
877  // lower estimated throughput on Swift.
878  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
879  ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
880  return 3;
881 
882  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
883  Opcode == Instruction::ExtractElement)) {
884  // Cross-class copies are expensive on many microarchitectures,
885  // so assume they are expensive by default.
886  if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
887  return 3;
888 
889  // Even if it's not a cross-class copy, this likely leads to mixing
890  // of NEON and VFP code and should therefore be penalized.
891  if (ValTy->isVectorTy() &&
892  ValTy->getScalarSizeInBits() <= 32)
893  return std::max<InstructionCost>(
894  BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
895  }
896 
897  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
898  Opcode == Instruction::ExtractElement)) {
899  // Integer cross-lane moves are more expensive than float, which can
900  // sometimes just be vmovs. Integers involve being passed to GPR registers,
901  // causing more of a delay.
902  std::pair<InstructionCost, MVT> LT =
903  getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
904  return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
905  }
906 
907  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
908 }
909 
910 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
911  Type *CondTy,
912  CmpInst::Predicate VecPred,
913  TTI::TargetCostKind CostKind,
914  const Instruction *I) {
915  int ISD = TLI->InstructionOpcodeToISD(Opcode);
916 
917  // Thumb scalar code size cost for select.
918  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
919  ST->isThumb() && !ValTy->isVectorTy()) {
920  // Assume expensive structs.
921  if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
922  return TTI::TCC_Expensive;
923 
924  // Select costs can vary because they:
925  // - may require one or more conditional mov (including an IT),
926  // - can't operate directly on immediates,
927  // - require live flags, which we can't copy around easily.
928  InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
929 
930  // Possible IT instruction for Thumb2, or more for Thumb1.
931  ++Cost;
932 
933  // i1 values may need rematerialising by using mov immediates and/or
934  // flag setting instructions.
935  if (ValTy->isIntegerTy(1))
936  ++Cost;
937 
938  return Cost;
939  }
940 
941  // If this is a vector min/max/abs, use the cost of that intrinsic directly
942  // instead. Hopefully when min/max intrinsics are more prevalent this code
943  // will not be needed.
944  const Instruction *Sel = I;
945  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
946  Sel->hasOneUse())
947  Sel = cast<Instruction>(Sel->user_back());
948  if (Sel && ValTy->isVectorTy() &&
949  (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
950  const Value *LHS, *RHS;
951  SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
952  unsigned IID = 0;
953  switch (SPF) {
954  case SPF_ABS:
955  IID = Intrinsic::abs;
956  break;
957  case SPF_SMIN:
958  IID = Intrinsic::smin;
959  break;
960  case SPF_SMAX:
961  IID = Intrinsic::smax;
962  break;
963  case SPF_UMIN:
964  IID = Intrinsic::umin;
965  break;
966  case SPF_UMAX:
967  IID = Intrinsic::umax;
968  break;
969  case SPF_FMINNUM:
970  IID = Intrinsic::minnum;
971  break;
972  case SPF_FMAXNUM:
973  IID = Intrinsic::maxnum;
974  break;
975  default:
976  break;
977  }
978  if (IID) {
979  // The ICmp is free, the select gets the cost of the min/max/etc
980  if (Sel != I)
981  return 0;
982  IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
983  return getIntrinsicInstrCost(CostAttrs, CostKind);
984  }
985  }
986 
987  // On NEON a vector select gets lowered to vbsl.
988  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
989  // Lowering of some vector selects is currently far from perfect.
990  static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
991  { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
992  { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
993  { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
994  };
995 
996  EVT SelCondTy = TLI->getValueType(DL, CondTy);
997  EVT SelValTy = TLI->getValueType(DL, ValTy);
998  if (SelCondTy.isSimple() && SelValTy.isSimple()) {
999  if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1000  SelCondTy.getSimpleVT(),
1001  SelValTy.getSimpleVT()))
1002  return Entry->Cost;
1003  }
1004 
1005  std::pair<InstructionCost, MVT> LT =
1006  TLI->getTypeLegalizationCost(DL, ValTy);
1007  return LT.first;
1008  }
1009 
1010  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1011  (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1012  cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1013  FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1014  FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1015  if (!VecCondTy)
1016  VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1017 
1018  // If we don't have mve.fp, any fp operations will need to be scalarized.
1019  if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1020  // One scalarization insert, one scalarization extract and the cost of the
1021  // fcmps.
1022  return BaseT::getScalarizationOverhead(VecValTy, false, true) +
1023  BaseT::getScalarizationOverhead(VecCondTy, true, false) +
1024  VecValTy->getNumElements() *
1025  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1026  VecCondTy->getScalarType(), VecPred, CostKind,
1027  I);
1028  }
1029 
1030  std::pair<InstructionCost, MVT> LT =
1031  TLI->getTypeLegalizationCost(DL, ValTy);
1032  int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1033  // There are two types - the input that specifies the type of the compare
1034  // and the output vXi1 type. Because we don't know how the output will be
1035  // split, we may need an expensive shuffle to get two in sync. This has the
1036  // effect of making larger than legal compares (v8i32 for example)
1037  // expensive.
1038  if (LT.second.getVectorNumElements() > 2) {
1039  if (LT.first > 1)
1040  return LT.first * BaseCost +
1041  BaseT::getScalarizationOverhead(VecCondTy, true, false);
1042  return BaseCost;
1043  }
1044  }
1045 
1046  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1047  // for "multiple beats" potentially needed by MVE instructions.
1048  int BaseCost = 1;
1049  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1050  BaseCost = ST->getMVEVectorCostFactor(CostKind);
1051 
1052  return BaseCost *
1053  BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1054 }
1055 
1056 InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1057  ScalarEvolution *SE,
1058  const SCEV *Ptr) {
1059  // Address computations in vectorized code with non-consecutive addresses will
1060  // likely result in more instructions compared to scalar code where the
1061  // computation can more often be merged into the index mode. The resulting
1062  // extra micro-ops can significantly decrease throughput.
1063  unsigned NumVectorInstToHideOverhead = 10;
1064  int MaxMergeDistance = 64;
1065 
1066  if (ST->hasNEON()) {
1067  if (Ty->isVectorTy() && SE &&
1068  !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1069  return NumVectorInstToHideOverhead;
1070 
1071  // In many cases the address computation is not merged into the instruction
1072  // addressing mode.
1073  return 1;
1074  }
1075  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1076 }
1077 
1078 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1079  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1080  // If a VCTP is part of a chain, it's already profitable and shouldn't be
1081  // optimized, else LSR may block tail-predication.
1082  switch (II->getIntrinsicID()) {
1083  case Intrinsic::arm_mve_vctp8:
1084  case Intrinsic::arm_mve_vctp16:
1085  case Intrinsic::arm_mve_vctp32:
1086  case Intrinsic::arm_mve_vctp64:
1087  return true;
1088  default:
1089  break;
1090  }
1091  }
1092  return false;
1093 }
1094 
1095 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1096  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1097  return false;
1098 
1099  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1100  // Don't support v2i1 yet.
1101  if (VecTy->getNumElements() == 2)
1102  return false;
1103 
1104  // We don't support extending fp types.
1105  unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1106  if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1107  return false;
1108  }
1109 
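 // MVE masked loads/stores handle 8/16/32-bit elements, and the 16/32-bit
 // forms must be naturally aligned.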
1110  unsigned EltWidth = DataTy->getScalarSizeInBits();
1111  return (EltWidth == 32 && Alignment >= 4) ||
1112  (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1113 }
1114 
1115 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1116  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1117  return false;
1118 
1119  unsigned EltWidth = Ty->getScalarSizeInBits();
1120  return ((EltWidth == 32 && Alignment >= 4) ||
1121  (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1122 }
1123 
1124 /// Given a memcpy/memset/memmove instruction, return the number of memory
1125 /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1126 /// call is used.
1127 int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1128  MemOp MOp;
1129  unsigned DstAddrSpace = ~0u;
1130  unsigned SrcAddrSpace = ~0u;
1131  const Function *F = I->getParent()->getParent();
1132 
1133  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1134  ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1135  // If 'size' is not a constant, a library call will be generated.
1136  if (!C)
1137  return -1;
1138 
1139  const unsigned Size = C->getValue().getZExtValue();
1140  const Align DstAlign = *MC->getDestAlign();
1141  const Align SrcAlign = *MC->getSourceAlign();
1142 
1143  MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1144  /*IsVolatile*/ false);
1145  DstAddrSpace = MC->getDestAddressSpace();
1146  SrcAddrSpace = MC->getSourceAddressSpace();
1147  }
1148  else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1149  ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1150  // If 'size' is not a constant, a library call will be generated.
1151  if (!C)
1152  return -1;
1153 
1154  const unsigned Size = C->getValue().getZExtValue();
1155  const Align DstAlign = *MS->getDestAlign();
1156 
1157  MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1158  /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1159  DstAddrSpace = MS->getDestAddressSpace();
1160  }
1161  else
1162  llvm_unreachable("Expected a memcpy/move or memset!");
1163 
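 // Limit is the maximum number of operations the target will emit inline
 // before a library call is preferred; Factor is 2 because memcpy/memmove
 // need both a load and a store per chunk, while memset only needs the store.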
1164  unsigned Limit, Factor = 2;
1165  switch(I->getIntrinsicID()) {
1166  case Intrinsic::memcpy:
1167  Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1168  break;
1169  case Intrinsic::memmove:
1170  Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1171  break;
1172  case Intrinsic::memset:
1173  Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1174  Factor = 1;
1175  break;
1176  default:
1177  llvm_unreachable("Expected a memcpy/move or memset!");
1178  }
1179 
1180  // MemOps will be populated with a list of data types that need to be
1181  // loaded and stored. That's why we multiply the number of elements by 2 to
1182  // get the cost for this memcpy.
1183  std::vector<EVT> MemOps;
1184  if (getTLI()->findOptimalMemOpLowering(
1185  MemOps, Limit, MOp, DstAddrSpace,
1186  SrcAddrSpace, F->getAttributes()))
1187  return MemOps.size() * Factor;
1188 
1189  // If we can't find an optimal memop lowering, return the default cost
1190  return -1;
1191 }
1192 
1193 InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1194  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1195 
1196  // To model the cost of a library call, we assume 1 for the call, and
1197  // 3 for the argument setup.
1198  if (NumOps == -1)
1199  return 4;
1200  return NumOps;
1201 }
1202 
1205  int Index, VectorType *SubTp,
1208  if (ST->hasNEON()) {
1209  if (Kind == TTI::SK_Broadcast) {
1210  static const CostTblEntry NEONDupTbl[] = {
1211  // VDUP handles these cases.
1218 
1223 
1224  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1225  if (const auto *Entry =
1226  CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1227  return LT.first * Entry->Cost;
1228  }
1229  if (Kind == TTI::SK_Reverse) {
1230  static const CostTblEntry NEONShuffleTbl[] = {
1231  // Reverse shuffle cost one instruction if we are shuffling within a
1232  // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1239 
1244 
1245  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1246  if (const auto *Entry =
1247  CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1248  return LT.first * Entry->Cost;
1249  }
1250  if (Kind == TTI::SK_Select) {
1251  static const CostTblEntry NEONSelShuffleTbl[] = {
1252  // Select shuffle cost table for ARM. Cost is the number of
1253  // instructions
1254  // required to create the shuffled vector.
1255 
1260 
1264 
1266 
1268 
1269  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1270  if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1271  ISD::VECTOR_SHUFFLE, LT.second))
1272  return LT.first * Entry->Cost;
1273  }
1274  }
1275  if (ST->hasMVEIntegerOps()) {
1276  if (Kind == TTI::SK_Broadcast) {
1277  static const CostTblEntry MVEDupTbl[] = {
1278  // VDUP handles these cases.
1284 
1285  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1286  if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1287  LT.second))
1288  return LT.first * Entry->Cost *
1289  ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1290  }
1291 
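 // A mask that reverses elements within 16, 32 or 64-bit chunks corresponds
 // to a single VREV16/VREV32/VREV64.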
1292  if (!Mask.empty()) {
1293  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1294  if (LT.second.isVector() &&
1295  Mask.size() <= LT.second.getVectorNumElements() &&
1296  (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1297  isVREVMask(Mask, LT.second, 64)))
1298  return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1299  }
1300  }
1301 
1302  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1303  ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1304  : 1;
1305  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1306 }
1307 
1308 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1309  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1310  TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1311  TTI::OperandValueProperties Opd1PropInfo,
1312  TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1313  const Instruction *CxtI) {
1314  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1315  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1316  // Make operations on i1 relatively expensive as this often involves
1317  // combining predicates. AND and XOR should be easier to handle with IT
1318  // blocks.
1319  switch (ISDOpcode) {
1320  default:
1321  break;
1322  case ISD::AND:
1323  case ISD::XOR:
1324  return 2;
1325  case ISD::OR:
1326  return 3;
1327  }
1328  }
1329 
1330  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1331 
1332  if (ST->hasNEON()) {
1333  const unsigned FunctionCallDivCost = 20;
1334  const unsigned ReciprocalDivCost = 10;
1335  static const CostTblEntry CostTbl[] = {
1336  // Division.
1337  // These costs are somewhat random. Choose a cost of 20 to indicate that
1338  // vectorizing division (added function call) is going to be very expensive.
1339  // Double registers types.
1340  { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1341  { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1342  { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1343  { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1344  { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1345  { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1346  { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1347  { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1348  { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1349  { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1350  { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1351  { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1352  { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1353  { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1354  { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1355  { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1356  // Quad register types.
1357  { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1358  { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1359  { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1360  { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1361  { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1362  { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1363  { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1364  { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1365  { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1366  { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1367  { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1368  { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1369  { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1370  { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1371  { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1372  { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1373  // Multiplication.
1374  };
1375 
1376  if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1377  return LT.first * Entry->Cost;
1378 
1379  InstructionCost Cost = BaseT::getArithmeticInstrCost(
1380  Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1381 
1382  // This is somewhat of a hack. The problem that we are facing is that SROA
1383  // creates a sequence of shift, and, or instructions to construct values.
1384  // These sequences are recognized by the ISel and have zero-cost. Not so for
1385  // the vectorized code. Because we have support for v2i64 but not i64 those
1386  // sequences look particularly beneficial to vectorize.
1387  // To work around this we increase the cost of v2i64 operations to make them
1388  // seem less beneficial.
1389  if (LT.second == MVT::v2i64 &&
1391  Cost += 4;
1392 
1393  return Cost;
1394  }
1395 
1396  // If this operation is a shift on arm/thumb2, it might well be folded into
1397  // the following instruction, hence having a cost of 0.
1398  auto LooksLikeAFreeShift = [&]() {
1399  if (ST->isThumb1Only() || Ty->isVectorTy())
1400  return false;
1401 
1402  if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1403  return false;
1404  if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
1405  return false;
1406 
1407  // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1408  switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1409  case Instruction::Add:
1410  case Instruction::Sub:
1411  case Instruction::And:
1412  case Instruction::Xor:
1413  case Instruction::Or:
1414  case Instruction::ICmp:
1415  return true;
1416  default:
1417  return false;
1418  }
1419  };
1420  if (LooksLikeAFreeShift())
1421  return 0;
1422 
1423  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1424  // for "multiple beats" potentially needed by MVE instructions.
1425  int BaseCost = 1;
1426  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1427  BaseCost = ST->getMVEVectorCostFactor(CostKind);
1428 
1429  // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1430  // without treating floats as more expensive than scalars or increasing the
1431  // costs for custom operations. The result is also multiplied by the
1432  // MVEVectorCostFactor where appropriate.
1433  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1434  return LT.first * BaseCost;
1435 
1436  // Else this is expand, assume that we need to scalarize this op.
1437  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1438  unsigned Num = VTy->getNumElements();
1439  InstructionCost Cost =
1440  getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1441  // Return the cost of multiple scalar invocations plus the cost of
1442  // inserting and extracting the values.
1443  SmallVector<Type *> Tys(Args.size(), Ty);
1444  return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1445  }
1446 
1447  return BaseCost;
1448 }
1449 
1450 InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1451  MaybeAlign Alignment,
1452  unsigned AddressSpace,
1453  TTI::TargetCostKind CostKind,
1454  const Instruction *I) {
1455  // TODO: Handle other cost kinds.
1456  if (CostKind != TTI::TCK_RecipThroughput)
1457  return 1;
1458 
1459  // Type legalization can't handle structs
1460  if (TLI->getValueType(DL, Src, true) == MVT::Other)
1461  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1462  CostKind);
1463 
1464  if (ST->hasNEON() && Src->isVectorTy() &&
1465  (Alignment && *Alignment != Align(16)) &&
1466  cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1467  // Unaligned loads/stores are extremely inefficient.
1468  // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
1469  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1470  return LT.first * 4;
1471  }
1472 
1473  // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1474  // Same for stores.
1475  if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1476  ((Opcode == Instruction::Load && I->hasOneUse() &&
1477  isa<FPExtInst>(*I->user_begin())) ||
1478  (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1479  FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1480  Type *DstTy =
1481  Opcode == Instruction::Load
1482  ? (*I->user_begin())->getType()
1483  : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1484  if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1485  DstTy->getScalarType()->isFloatTy())
1486  return ST->getMVEVectorCostFactor(CostKind);
1487  }
1488 
1489  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1490  ? ST->getMVEVectorCostFactor(CostKind)
1491  : 1;
1492  return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1493  CostKind, I);
1494 }
1495 
1496 InstructionCost
1497 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1498  unsigned AddressSpace,
1499  TTI::TargetCostKind CostKind) {
1500  if (ST->hasMVEIntegerOps()) {
1501  if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1502  return ST->getMVEVectorCostFactor(CostKind);
1503  if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1504  return ST->getMVEVectorCostFactor(CostKind);
1505  }
1506  if (!isa<FixedVectorType>(Src))
1507  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1508  CostKind);
1509  // Scalar cost, which is currently very high due to the inefficiency of the
1510  // generated code.
1511  return cast<FixedVectorType>(Src)->getNumElements() * 8;
1512 }
1513 
1514 InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1515  unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1516  Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1517  bool UseMaskForCond, bool UseMaskForGaps) {
1518  assert(Factor >= 2 && "Invalid interleave factor");
1519  assert(isa<VectorType>(VecTy) && "Expect a vector type");
1520 
1521  // vldN/vstN doesn't support vector types of i64/f64 element.
1522  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1523 
1524  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1525  !UseMaskForCond && !UseMaskForGaps) {
1526  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1527  auto *SubVecTy =
1528  FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1529 
1530  // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1531  // Accesses having vector types that are a multiple of 128 bits can be
1532  // matched to more than one vldN/vstN instruction.
1533  int BaseCost =
1534  ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1535  if (NumElts % Factor == 0 &&
1536  TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1537  return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1538 
1539  // Some smaller than legal interleaved patterns are cheap as we can make
1540  // use of the vmovn or vrev patterns to interleave a standard load. This is
1541  // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1542  // promoted differently). The cost of 2 here is then a load and vrev or
1543  // vmovn.
1544  if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1545  VecTy->isIntOrIntVectorTy() &&
1546  DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1547  return 2 * BaseCost;
1548  }
1549 
1550  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1551  Alignment, AddressSpace, CostKind,
1552  UseMaskForCond, UseMaskForGaps);
1553 }
1554 
1555 InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1556  unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1557  Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1558  using namespace PatternMatch;
1559  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1560  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1561  Alignment, CostKind, I);
1562 
1563  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1564  auto *VTy = cast<FixedVectorType>(DataTy);
1565 
1566  // TODO: Splitting, once we do that.
1567 
1568  unsigned NumElems = VTy->getNumElements();
1569  unsigned EltSize = VTy->getScalarSizeInBits();
1570  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1571 
1572  // For now, it is assumed that for the MVE gather instructions the loads are
1573  // all effectively serialised. This means the cost is the scalar cost
1574  // multiplied by the number of elements being loaded. This is possibly very
1575  // conservative, but even so we still end up vectorising loops because the
1576  // cost per iteration for many loops is lower than for scalar loops.
1577  InstructionCost VectorCost =
1578  NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1579  // The scalarization cost should be a lot higher. We use the number of vector
1580  // elements plus the scalarization overhead.
1581  InstructionCost ScalarCost =
1582  NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1583  BaseT::getScalarizationOverhead(VTy, false, true);
1584 
1585  if (EltSize < 8 || Alignment < EltSize / 8)
1586  return ScalarCost;
1587 
1588  unsigned ExtSize = EltSize;
1589  // Check whether there's a single user that asks for an extended type
1590  if (I != nullptr) {
1591  // Depending on the caller of this function, a gather instruction will
1592  // either have opcode Instruction::Load or be a call to the masked_gather
1593  // intrinsic.
1594  if ((I->getOpcode() == Instruction::Load ||
1595  match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1596  I->hasOneUse()) {
1597  const User *Us = *I->users().begin();
1598  if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1599  // only allow valid type combinations
1600  unsigned TypeSize =
1601  cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1602  if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1603  (TypeSize == 16 && EltSize == 8)) &&
1604  TypeSize * NumElems == 128) {
1605  ExtSize = TypeSize;
1606  }
1607  }
1608  }
1609  // Check whether the input data needs to be truncated
1610  TruncInst *T;
1611  if ((I->getOpcode() == Instruction::Store ||
1612  match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1613  (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1614  // Only allow valid type combinations
1615  unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1616  if (((EltSize == 16 && TypeSize == 32) ||
1617  (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1618  TypeSize * NumElems == 128)
1619  ExtSize = TypeSize;
1620  }
1621  }
1622 
1623  if (ExtSize * NumElems != 128 || NumElems < 4)
1624  return ScalarCost;
1625 
1626  // Any (aligned) i32 gather will not need to be scalarised.
1627  if (ExtSize == 32)
1628  return VectorCost;
1629  // For smaller types, we need to ensure that the gep's inputs are correctly
1630  // extended from a small enough value. Other sizes (including i64) are
1631  // scalarized for now.
1632  if (ExtSize != 8 && ExtSize != 16)
1633  return ScalarCost;
1634 
1635  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1636  Ptr = BC->getOperand(0);
1637  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1638  if (GEP->getNumOperands() != 2)
1639  return ScalarCost;
1640  unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1641  // Scale needs to be correct (which is only relevant for i16s).
1642  if (Scale != 1 && Scale * 8 != ExtSize)
1643  return ScalarCost;
1644  // And we need to zext (not sext) the indexes from a small enough type.
1645  if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1646  if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1647  return VectorCost;
1648  }
1649  return ScalarCost;
1650  }
1651  return ScalarCost;
1652 }
1653 
1654 InstructionCost
1655 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1656  Optional<FastMathFlags> FMF,
1657  TTI::TargetCostKind CostKind) {
1658  if (TTI::requiresOrderedReduction(FMF))
1659  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1660 
1661  EVT ValVT = TLI->getValueType(DL, ValTy);
1662  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1663  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1664  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1665 
1666  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1667 
1668  static const CostTblEntry CostTblAdd[]{
1669  {ISD::ADD, MVT::v16i8, 1},
1670  {ISD::ADD, MVT::v8i16, 1},
1671  {ISD::ADD, MVT::v4i32, 1},
1672  };
1673  if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1674  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1675 
1676  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1677 }
1678 
1679 InstructionCost
1680 ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1681  Type *ResTy, VectorType *ValTy,
1682  TTI::TargetCostKind CostKind) {
1683  EVT ValVT = TLI->getValueType(DL, ValTy);
1684  EVT ResVT = TLI->getValueType(DL, ResTy);
1685 
1686  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1687  std::pair<InstructionCost, MVT> LT =
1688  TLI->getTypeLegalizationCost(DL, ValTy);
1689 
1690  // The legal cases are:
1691  // VADDV u/s 8/16/32
1692  // VMLAV u/s 8/16/32
1693  // VADDLV u/s 32
1694  // VMLALV u/s 16/32
1695  // Codegen currently cannot always handle larger than legal vectors very
1696  // well, especially for predicated reductions where the mask needs to be
1697  // split, so restrict to 128bit or smaller input types.
1698  unsigned RevVTSize = ResVT.getSizeInBits();
1699  if (ValVT.getSizeInBits() <= 128 &&
1700  ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1701  (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
1702  (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1703  return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1704  }
1705 
1706  return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1707  CostKind);
1708 }
1709 
1710 InstructionCost
1711 ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1712  TTI::TargetCostKind CostKind) {
1713  switch (ICA.getID()) {
1714  case Intrinsic::get_active_lane_mask:
1715  // Currently we make a somewhat optimistic assumption that
1716  // active_lane_mask's are always free. In reality it may be freely folded
1717  // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1718  // of add/icmp code. We may need to improve this in the future, but being
1719  // able to detect if it is free or not involves looking at a lot of other
1720  // code. We currently assume that the vectorizer inserted these, and knew
1721  // what it was doing in adding one.
1722  if (ST->hasMVEIntegerOps())
1723  return 0;
1724  break;
1725  case Intrinsic::sadd_sat:
1726  case Intrinsic::ssub_sat:
1727  case Intrinsic::uadd_sat:
1728  case Intrinsic::usub_sat: {
1729  if (!ST->hasMVEIntegerOps())
1730  break;
1731  Type *VT = ICA.getReturnType();
1732 
1733  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1734  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1735  LT.second == MVT::v16i8) {
1736  // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1737  // need to extend the type, as it uses shr(qadd(shl, shl)).
1738  unsigned Instrs =
1739  LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1740  return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1741  }
1742  break;
1743  }
1744  case Intrinsic::abs:
1745  case Intrinsic::smin:
1746  case Intrinsic::smax:
1747  case Intrinsic::umin:
1748  case Intrinsic::umax: {
1749  if (!ST->hasMVEIntegerOps())
1750  break;
1751  Type *VT = ICA.getReturnType();
1752 
1753  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1754  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1755  LT.second == MVT::v16i8)
1756  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1757  break;
1758  }
1759  case Intrinsic::minnum:
1760  case Intrinsic::maxnum: {
1761  if (!ST->hasMVEFloatOps())
1762  break;
1763  Type *VT = ICA.getReturnType();
1764  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1765  if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1766  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1767  break;
1768  }
1769  case Intrinsic::fptosi_sat:
1770  case Intrinsic::fptoui_sat: {
1771  if (ICA.getArgTypes().empty())
1772  break;
1773  bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1774  auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]);
1775  EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
1776  // Check for the legal types, with the correct subtarget features.
1777  if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1778  (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1779  (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1780  return LT.first;
1781 
1782  // Equally for MVE vector types
1783  if (ST->hasMVEFloatOps() &&
1784  (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
1785  LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
1786  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1787 
1788  // Otherwise we use a legal convert followed by a min+max
1789  if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
1790  (ST->hasFP64() && LT.second == MVT::f64) ||
1791  (ST->hasFullFP16() && LT.second == MVT::f16) ||
1792  (ST->hasMVEFloatOps() &&
1793  (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
1794  LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1795  Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
1796  LT.second.getScalarSizeInBits());
1797  InstructionCost Cost =
1798  LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1799  IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1800  : Intrinsic::umin,
1801  LegalTy, {LegalTy, LegalTy});
1802  Cost += getIntrinsicInstrCost(Attrs1, CostKind);
1803  IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1804  : Intrinsic::umax,
1805  LegalTy, {LegalTy, LegalTy});
1806  Cost += getIntrinsicInstrCost(Attrs2, CostKind);
1807  return LT.first * Cost;
1808  }
1809  break;
1810  }
1811  }
1812 
1813  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1814 }
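A hedged usage sketch of the saturating-add case handled above; the `TTI` and `Ctx` names are hypothetical stand-ins for a TargetTransformInfo and LLVMContext obtained elsewhere.

//   Type *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4); // <4 x i32>
//   IntrinsicCostAttributes Attrs(Intrinsic::sadd_sat, VecTy, {VecTy, VecTy});
//   InstructionCost C = TTI.getIntrinsicInstrCost(
//       Attrs, TargetTransformInfo::TCK_RecipThroughput);
// With MVE, <4 x i32> is already legal, so C is LT.first * MVEVectorCostFactor
// (one VQADD); a <4 x i16> request is promoted to v4i32 and pays the extra
// shl/shl/shr sequence, i.e. Instrs == 4.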
1815 
1816 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1817  if (!F->isIntrinsic())
1818  return BaseT::isLoweredToCall(F);
1819 
1820  // Assume all Arm-specific intrinsics map to an instruction.
1821  if (F->getName().startswith("llvm.arm"))
1822  return false;
1823 
1824  switch (F->getIntrinsicID()) {
1825  default: break;
1826  case Intrinsic::powi:
1827  case Intrinsic::sin:
1828  case Intrinsic::cos:
1829  case Intrinsic::pow:
1830  case Intrinsic::log:
1831  case Intrinsic::log10:
1832  case Intrinsic::log2:
1833  case Intrinsic::exp:
1834  case Intrinsic::exp2:
1835  return true;
1836  case Intrinsic::sqrt:
1837  case Intrinsic::fabs:
1838  case Intrinsic::copysign:
1839  case Intrinsic::floor:
1840  case Intrinsic::ceil:
1841  case Intrinsic::trunc:
1842  case Intrinsic::rint:
1843  case Intrinsic::nearbyint:
1844  case Intrinsic::round:
1845  case Intrinsic::canonicalize:
1846  case Intrinsic::lround:
1847  case Intrinsic::llround:
1848  case Intrinsic::lrint:
1849  case Intrinsic::llrint:
1850  if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1851  return true;
1852  if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1853  return true;
1854  // Some operations can be handled by vector instructions; assume that
1855  // unsupported vectors will be expanded into supported scalar ones.
1856  // TODO Handle scalar operations properly.
1857  return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1858  case Intrinsic::masked_store:
1859  case Intrinsic::masked_load:
1860  case Intrinsic::masked_gather:
1861  case Intrinsic::masked_scatter:
1862  return !ST->hasMVEIntegerOps();
1863  case Intrinsic::sadd_with_overflow:
1864  case Intrinsic::uadd_with_overflow:
1865  case Intrinsic::ssub_with_overflow:
1866  case Intrinsic::usub_with_overflow:
1867  case Intrinsic::sadd_sat:
1868  case Intrinsic::uadd_sat:
1869  case Intrinsic::ssub_sat:
1870  case Intrinsic::usub_sat:
1871  return false;
1872  }
1873 
1874  return BaseT::isLoweredToCall(F);
1875 }
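Two concrete consequences of the switch above, spelled out as an editorial illustration:

// * On a subtarget without FP64 (e.g. a single-precision-only FPv5 core),
//   @llvm.sqrt.f64 returns true: the double-precision square root has to be
//   lowered to a runtime library call.
// * With MVE integer operations available, @llvm.masked.load and
//   @llvm.masked.store return false, since they are selected to predicated
//   load/store instructions rather than a call.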
1876 
1877 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1878  unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1879  EVT VT = TLI->getValueType(DL, I.getType(), true);
1880  if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1881  return true;
1882 
1883  // Check if an intrinsic will be lowered to a call and assume that any
1884  // other CallInst will generate a bl.
1885  if (auto *Call = dyn_cast<CallInst>(&I)) {
1886  if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1887  switch(II->getIntrinsicID()) {
1888  case Intrinsic::memcpy:
1889  case Intrinsic::memset:
1890  case Intrinsic::memmove:
1891  return getNumMemOps(II) == -1;
1892  default:
1893  if (const Function *F = Call->getCalledFunction())
1894  return isLoweredToCall(F);
1895  }
1896  }
1897  return true;
1898  }
1899 
1900  // FPv5 provides conversions between integer, double-precision,
1901  // single-precision, and half-precision formats.
1902  switch (I.getOpcode()) {
1903  default:
1904  break;
1905  case Instruction::FPToSI:
1906  case Instruction::FPToUI:
1907  case Instruction::SIToFP:
1908  case Instruction::UIToFP:
1909  case Instruction::FPTrunc:
1910  case Instruction::FPExt:
1911  return !ST->hasFPARMv8Base();
1912  }
1913 
1914  // FIXME: Unfortunately the approach of checking the Operation Action does
1915  // not catch all cases of Legalization that use library calls. Our
1916  // Legalization step categorizes some transformations into library calls as
1917  // Custom, Expand or even Legal when doing type legalization. So for now
1918  // we have to special case for instance the SDIV of 64bit integers and the
1919  // use of floating point emulation.
1920  if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1921  switch (ISD) {
1922  default:
1923  break;
1924  case ISD::SDIV:
1925  case ISD::UDIV:
1926  case ISD::SREM:
1927  case ISD::UREM:
1928  case ISD::SDIVREM:
1929  case ISD::UDIVREM:
1930  return true;
1931  }
1932  }
1933 
1934  // Assume all other non-float operations are supported.
1935  if (!VT.isFloatingPoint())
1936  return false;
1937 
1938  // We'll need a library call to handle most floats when using soft-float.
1939  if (TLI->useSoftFloat()) {
1940  switch (I.getOpcode()) {
1941  default:
1942  return true;
1943  case Instruction::Alloca:
1944  case Instruction::Load:
1945  case Instruction::Store:
1946  case Instruction::Select:
1947  case Instruction::PHI:
1948  return false;
1949  }
1950  }
1951 
1952  // We'll need a libcall to perform double precision operations on a single
1953  // precision only FPU.
1954  if (I.getType()->isDoubleTy() && !ST->hasFP64())
1955  return true;
1956 
1957  // Likewise for half precision arithmetic.
1958  if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1959  return true;
1960 
1961  return false;
1962 }
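An assumed example of the 64-bit integer special case above:

// An i64 udiv in the loop body is legalized to a runtime call (on AEABI
// targets something like __aeabi_uldivmod), so maybeLoweredToCall returns
// true and isHardwareLoopProfitable below rejects the loop, because the call
// would clobber LR, the register that holds the hardware-loop count.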
1963 
1964 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1965  AssumptionCache &AC,
1966  TargetLibraryInfo *LibInfo,
1967  HardwareLoopInfo &HWLoopInfo) {
1968  // Low-overhead branches are only supported in the 'low-overhead branch'
1969  // extension of v8.1-m.
1970  if (!ST->hasLOB() || DisableLowOverheadLoops) {
1971  LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1972  return false;
1973  }
1974 
1975  if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1976  LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1977  return false;
1978  }
1979 
1980  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1981  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1982  LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1983  return false;
1984  }
1985 
1986  const SCEV *TripCountSCEV =
1987  SE.getAddExpr(BackedgeTakenCount,
1988  SE.getOne(BackedgeTakenCount->getType()));
1989 
1990  // We need to store the trip count in LR, a 32-bit register.
1991  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1992  LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1993  return false;
1994  }
1995 
1996  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1997  // point in generating a hardware loop if that's going to happen.
1998 
1999  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2000  if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2001  switch (Call->getIntrinsicID()) {
2002  default:
2003  break;
2004  case Intrinsic::start_loop_iterations:
2005  case Intrinsic::test_start_loop_iterations:
2006  case Intrinsic::loop_decrement:
2007  case Intrinsic::loop_decrement_reg:
2008  return true;
2009  }
2010  }
2011  return false;
2012  };
2013 
2014  // Scan the instructions to see if there's any that we know will turn into a
2015  // call or if this loop is already a low-overhead loop or will become a tail
2016  // predicated loop.
2017  bool IsTailPredLoop = false;
2018  auto ScanLoop = [&](Loop *L) {
2019  for (auto *BB : L->getBlocks()) {
2020  for (auto &I : *BB) {
2021  if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2022  isa<InlineAsm>(I)) {
2023  LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2024  return false;
2025  }
2026  if (auto *II = dyn_cast<IntrinsicInst>(&I))
2027  IsTailPredLoop |=
2028  II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2029  II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2030  II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2031  II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2032  II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2033  }
2034  }
2035  return true;
2036  };
2037 
2038  // Visit inner loops.
2039  for (auto Inner : *L)
2040  if (!ScanLoop(Inner))
2041  return false;
2042 
2043  if (!ScanLoop(L))
2044  return false;
2045 
2046  // TODO: Check whether the trip count calculation is expensive. If L is the
2047  // inner loop but we know it has a low trip count, calculating that trip
2048  // count (in the parent loop) may be detrimental.
2049 
2050  LLVMContext &C = L->getHeader()->getContext();
2051  HWLoopInfo.CounterInReg = true;
2052  HWLoopInfo.IsNestingLegal = false;
2053  HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2054  HWLoopInfo.CountType = Type::getInt32Ty(C);
2055  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2056  return true;
2057 }
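A sketch of what acceptance here leads to downstream; the lowering described is performed by the generic HardwareLoops pass and the ARM backend, not by this function.

// With PerformEntryTest set, the HardwareLoops pass guards the loop entry
// with @llvm.test.start.loop.iterations and decrements the count each
// iteration via @llvm.loop.decrement.reg; the ARM backend then selects these
// into WLS/LE (or DLS/LE when no entry test is requested) low-overhead loop
// instructions.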
2058 
2059 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2060  // We don't allow icmps, and because we only look at single-block loops,
2061  // we simply count the icmps, i.e. there should only be 1 for the backedge.
2062  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2063  return false;
2064  // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics
2065  // are not currently canonical, but soon will be. Code without them uses
2066  // icmp, and so is not tail-predicated as per the condition above. In order
2067  // to get the same performance we treat min and max the same as an icmp for
2068  // tailpred purposes for the moment (we often rely on non-tailpred and
2069  // higher VFs to pick more optimal instructions like VQDMULH; they need to
2070  // be recognized directly by the vectorizer).
2071  if (auto *II = dyn_cast<IntrinsicInst>(&I))
2072  if ((II->getIntrinsicID() == Intrinsic::smin ||
2073  II->getIntrinsicID() == Intrinsic::smax ||
2074  II->getIntrinsicID() == Intrinsic::umin ||
2075  II->getIntrinsicID() == Intrinsic::umax) &&
2076  ++ICmpCount > 1)
2077  return false;
2078 
2079  if (isa<FCmpInst>(&I))
2080  return false;
2081 
2082  // We could allow extending/narrowing FP loads/stores, but codegen is
2083  // too inefficient so reject this for now.
2084  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2085  return false;
2086 
2087  // Extends have to be extending-loads
2088  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2089  if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2090  return false;
2091 
2092  // Truncs have to be narrowing-stores
2093  if (isa<TruncInst>(&I) )
2094  if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2095  return false;
2096 
2097  return true;
2098 }
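Assumed scalar-loop examples of the extend/truncate rules above:

//   %w = load i8, i8* %p          ; single use feeding the extend
//   %x = zext i8 %w to i32        ; OK: becomes an extending vector load
//   %y = zext i8 %arg to i32      ; rejected: the operand is not a load
//   %t = trunc i32 %v to i16
//   store i16 %t, i16* %q         ; OK: the trunc's only use is a store,
//                                 ; i.e. a narrowing vector store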
2099 
2100 // To set up a tail-predicated loop, we need to know the total number of
2101 // elements processed by that loop. Thus, we need to determine the element
2102 // size and:
2103 // 1) it should be uniform for all operations in the vector loop, so we
2104 // e.g. don't want any widening/narrowing operations.
2105 // 2) it should be smaller than i64s because we don't have vector operations
2106 // that work on i64s.
2107 // 3) we don't want elements to be reversed or shuffled, to make sure the
2108 // tail-predication masks/predicates the right lanes.
2109 //
2110 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2111  const DataLayout &DL,
2112  const LoopAccessInfo *LAI) {
2113  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2114 
2115  // If there are live-out values, it is probably a reduction. We can predicate
2116  // most reduction operations freely under MVE using a combination of
2117  // prefer-predicated-reduction-select and inloop reductions. We limit this to
2118  // floating point and integer reductions, but don't check for operators
2119  // specifically here. If the value ends up not being a reduction (and so the
2120  // vectorizer cannot tailfold the loop), we should fall back to standard
2121  // vectorization automatically.
2122  SmallVector<Instruction *, 8> LiveOuts =
2123  llvm::findDefsUsedOutsideOfLoop(L);
2124  bool ReductionsDisabled =
2125  EnableTailPredication == TailPredication::EnabledNoReductions ||
2126  EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2127 
2128  for (auto *I : LiveOuts) {
2129  if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2130  !I->getType()->isHalfTy()) {
2131  LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2132  "live-out value\n");
2133  return false;
2134  }
2135  if (ReductionsDisabled) {
2136  LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2137  return false;
2138  }
2139  }
2140 
2141  // Next, check that all instructions can be tail-predicated.
2142  PredicatedScalarEvolution PSE = LAI->getPSE();
2143  SmallVector<Instruction *, 16> LoadStores;
2144  int ICmpCount = 0;
2145 
2146  for (BasicBlock *BB : L->blocks()) {
2147  for (Instruction &I : BB->instructionsWithoutDebug()) {
2148  if (isa<PHINode>(&I))
2149  continue;
2150  if (!canTailPredicateInstruction(I, ICmpCount)) {
2151  LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2152  return false;
2153  }
2154 
2155  Type *T = I.getType();
2156  if (T->getScalarSizeInBits() > 32) {
2157  LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2158  return false;
2159  }
2160  if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2161  Value *Ptr = getLoadStorePointerOperand(&I);
2162  Type *AccessTy = getLoadStoreType(&I);
2163  int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
2164  if (NextStride == 1) {
2165  // TODO: for now only allow consecutive strides of 1. We could support
2166  // other strides as long as they are uniform, but let's keep it simple
2167  // for now.
2168  continue;
2169  } else if (NextStride == -1 ||
2170  (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2171  (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2172  LLVM_DEBUG(dbgs()
2173  << "Consecutive strides of 2 found, vld2/vstr2 can't "
2174  "be tail-predicated\n.");
2175  return false;
2176  // TODO: don't tail predicate if there is a reversed load?
2177  } else if (EnableMaskedGatherScatters) {
2178  // Gather/scatters do allow loading from arbitrary strides, at
2179  // least if they are loop invariant.
2180  // TODO: Loop variant strides should in theory work, too, but
2181  // this requires further testing.
2182  const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2183  if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2184  const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2185  if (PSE.getSE()->isLoopInvariant(Step, L))
2186  continue;
2187  }
2188  }
2189  LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2190  "tail-predicate\n.");
2191  return false;
2192  }
2193  }
2194  }
2195 
2196  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2197  return true;
2198 }
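A C-level illustration (assumed code) of the stride rules checked above:

//   for (int i = 0; i < n; i++)   // stride 1: can be tail-predicated
//     dst[i] = src[i] + 1;
//   for (int i = 0; i < n; i++)   // stride 2: needs VLD2, so the loop is
//     dst[i] = src[2 * i] + 1;    // rejected for tail-predication
// With masked gather/scatter enabled, a loop-invariant non-unit stride can
// still be handled, since the access is emitted as a gather rather than an
// interleaved load.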
2199 
2201  ScalarEvolution &SE,
2202  AssumptionCache &AC,
2203  TargetLibraryInfo *TLI,
2204  DominatorTree *DT,
2205  const LoopAccessInfo *LAI) {
2206  if (!EnableTailPredication) {
2207  LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2208  return false;
2209  }
2210 
2211  // Creating a predicated vector loop is the first step for generating a
2212  // tail-predicated hardware loop, for which we need the MVE masked
2213  // load/stores instructions:
2214  if (!ST->hasMVEIntegerOps())
2215  return false;
2216 
2217  // For now, restrict this to single block loops.
2218  if (L->getNumBlocks() > 1) {
2219  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2220  "loop.\n");
2221  return false;
2222  }
2223 
2224  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2225 
2226  HardwareLoopInfo HWLoopInfo(L);
2227  if (!HWLoopInfo.canAnalyze(*LI)) {
2228  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2229  "analyzable.\n");
2230  return false;
2231  }
2232 
2233  // This checks if we have the low-overhead branch architecture
2234  // extension, and if we will create a hardware-loop:
2235  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2236  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2237  "profitable.\n");
2238  return false;
2239  }
2240 
2241  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2242  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2243  "a candidate.\n");
2244  return false;
2245  }
2246 
2247  return canTailPredicateLoop(L, LI, SE, DL, LAI);
2248 }
2249 
2250 bool ARMTTIImpl::emitGetActiveLaneMask() const {
2251  if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2252  return false;
2253 
2254  // Intrinsic @llvm.get.active.lane.mask is supported.
2255  // It is used in the MVETailPredication pass, which requires the number of
2256  // elements processed by this vector loop to set up the tail-predicated
2257  // loop.
2258  return true;
2259 }
2260 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2261  TTI::UnrollingPreferences &UP,
2262  OptimizationRemarkEmitter *ORE) {
2263  // Enable upper-bound unrolling universally, not dependent upon the
2264  // conditions below.
2265  UP.UpperBound = true;
2266 
2267  // Only currently enable these preferences for M-Class cores.
2268  if (!ST->isMClass())
2269  return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2270 
2271  // Disable loop unrolling for Oz and Os.
2272  UP.OptSizeThreshold = 0;
2273  UP.PartialOptSizeThreshold = 0;
2274  if (L->getHeader()->getParent()->hasOptSize())
2275  return;
2276 
2277  SmallVector<BasicBlock*, 4> ExitingBlocks;
2278  L->getExitingBlocks(ExitingBlocks);
2279  LLVM_DEBUG(dbgs() << "Loop has:\n"
2280  << "Blocks: " << L->getNumBlocks() << "\n"
2281  << "Exit blocks: " << ExitingBlocks.size() << "\n");
2282 
2283  // Only allow one exit other than the latch. This acts as an early exit
2284  // as it mirrors the profitability calculation of the runtime unroller.
2285  if (ExitingBlocks.size() > 2)
2286  return;
2287 
2288  // Limit the CFG of the loop body for targets with a branch predictor.
2289  // Allowing 4 blocks permits if-then-else diamonds in the body.
2290  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2291  return;
2292 
2293  // Don't unroll vectorized loops, including the remainder loop
2294  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2295  return;
2296 
2297  // Scan the loop: don't unroll loops with calls as this could prevent
2298  // inlining.
2299  InstructionCost Cost = 0;
2300  for (auto *BB : L->getBlocks()) {
2301  for (auto &I : *BB) {
2302  // Don't unroll vectorised loop. MVE does not benefit from it as much as
2303  // scalar code.
2304  if (I.getType()->isVectorTy())
2305  return;
2306 
2307  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2308  if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2309  if (!isLoweredToCall(F))
2310  continue;
2311  }
2312  return;
2313  }
2314 
2315  SmallVector<const Value*, 4> Operands(I.operand_values());
2316  Cost +=
2317  getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2318  }
2319  }
2320 
2321  // On v6m cores, there are very few registers available. We can easily end up
2322  // spilling and reloading more registers in an unrolled loop. Look at the
2323  // number of LCSSA phis as a rough measure of how many registers will need to
2324  // be live out of the loop, reducing the default unroll count if more than 1
2325  // value is needed. In the long run, all of this should be learned by a
2326  // machine.
2327  unsigned UnrollCount = 4;
2328  if (ST->isThumb1Only()) {
2329  unsigned ExitingValues = 0;
2330  SmallVector<BasicBlock *, 4> ExitBlocks;
2331  L->getExitBlocks(ExitBlocks);
2332  for (auto *Exit : ExitBlocks) {
2333  // Count the number of LCSSA phis. Exclude values coming from GEP's as
2334  // only the last is expected to be needed for address operands.
2335  unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2336  return PH.getNumOperands() != 1 ||
2337  !isa<GetElementPtrInst>(PH.getOperand(0));
2338  });
2339  ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2340  }
2341  if (ExitingValues)
2342  UnrollCount /= ExitingValues;
2343  if (UnrollCount <= 1)
2344  return;
2345  }
2346 
2347  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2348  LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2349 
2350  UP.Partial = true;
2351  UP.Runtime = true;
2352  UP.UnrollRemainder = true;
2353  UP.DefaultUnrollRuntimeCount = UnrollCount;
2354  UP.UnrollAndJam = true;
2355  UP.UnrollAndJamInnerLoopThreshold = 60;
2356 
2357  // Force-unrolling small loops can be very useful because of the
2358  // branch-taken cost of the backedge.
2359  if (Cost < 12)
2360  UP.Force = true;
2361 }
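A worked illustration (assumed numbers) of the Thumb1 live-out adjustment and the small-loop force threshold above:

// With the default UnrollCount of 4, a Thumb1-only loop whose exit blocks
// carry two non-GEP LCSSA phis ends up with 4 / 2 = 2; three or more live-out
// values reduce the count to <= 1 and unrolling is skipped. Independently, a
// loop whose estimated body cost is below 12 also sets UP.Force, so unrolling
// is applied even where the generic runtime-unroll heuristics would give up.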
2362 
2363 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2364  TTI::PeelingPreferences &PP) {
2365  BaseT::getPeelingPreferences(L, SE, PP);
2366 }
2367 
2368 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2369  TTI::ReductionFlags Flags) const {
2370  if (!ST->hasMVEIntegerOps())
2371  return false;
2372 
2373  unsigned ScalarBits = Ty->getScalarSizeInBits();
2374  switch (Opcode) {
2375  case Instruction::Add:
2376  return ScalarBits <= 64;
2377  default:
2378  return false;
2379  }
2380 }
2381 
2382 bool ARMTTIImpl::preferPredicatedReductionSelect(
2383  unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2384  if (!ST->hasMVEIntegerOps())
2385  return false;
2386  return true;
2387 }
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:240
ARMSubtarget.h
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::TargetLoweringBase::getMaxStoresPerMemmove
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
Definition: TargetLowering.h:1678
llvm::TargetTransformInfo::CastContextHint::Masked
@ Masked
The cast is used with a masked load/store.
ValueTypes.h
CmpMode::FP
@ FP
llvm::TargetTransformInfo::UnrollingPreferences::PartialOptSizeThreshold
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
Definition: TargetTransformInfo.h:460
llvm::ScalarEvolution::hasLoopInvariantBackedgeTakenCount
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
Definition: ScalarEvolution.cpp:12970
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition: TargetTransformInfo.h:885
llvm::BasicTTIImplBase< ARMTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:37
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:488
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:211
llvm::SPF_SMAX
@ SPF_SMAX
Unsigned minimum.
Definition: ValueTracking.h:702
llvm::ISD::VECTOR_SHUFFLE
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:586
llvm::ARM_AM::isThumbImmShiftedVal
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
Definition: ARMAddressingModes.h:235
llvm::MVT::v4f16
@ v4f16
Definition: MachineValueType.h:142
llvm::TargetTransformInfo::TCC_Expensive
@ TCC_Expensive
The cost of a 'div' instruction on x86.
Definition: TargetTransformInfo.h:264
llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:179
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::TargetTransformInfo::ReductionFlags
Flags describing the kind of vector reduction.
Definition: TargetTransformInfo.h:1398
llvm::ConvertCostTableLookup
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType >> Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
llvm::CostTblEntryT
Cost Table Entry.
Definition: CostTable.h:25
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::LoopBase::getExitBlocks
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
Definition: LoopInfoImpl.h:61
llvm::ARMTTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: ARMTargetTransformInfo.cpp:1514
llvm::HardwareLoopInfo::LoopDecrement
Value * LoopDecrement
Definition: TargetTransformInfo.h:103
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:667
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
InstCombiner.h
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:719
llvm::InstCombiner::getDominatorTree
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:370
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:104
IntrinsicInst.h
ceil
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g ceil
Definition: README-FPStack.txt:54
DisableLowOverheadLoops
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:444
llvm::Function
Definition: Function.h:60
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:546
llvm::ISD::UDIV
@ UDIV
Definition: ISDOpcodes.h:243
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:53
llvm::SelectPatternResult::Flavor
SelectPatternFlavor Flavor
Definition: ValueTracking.h:722
llvm::TargetTransformInfoImplCRTPBase< ARMTTIImpl >::getUserCost
InstructionCost getUserCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
Definition: TargetTransformInfoImpl.h:978
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:673
llvm::BasicTTIImplBase< ARMTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1093
llvm::TargetTransformInfo::AMK_PostIndexed
@ AMK_PostIndexed
Definition: TargetTransformInfo.h:644
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:727
MVEMaxSupportedInterleaveFactor
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor
llvm::ARMTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: ARMTargetTransformInfo.cpp:85
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:309
llvm::PredicatedScalarEvolution
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
Definition: ScalarEvolution.h:2176
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:133
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
llvm::InstCombiner::Builder
BuilderTy & Builder
Definition: InstCombiner.h:58
llvm::ARMSubtarget::hasFPARMv8Base
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:337
llvm::CmpInst::makeCmpResultType
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1044
llvm::IRBuilder< TargetFolder, IRBuilderCallbackInserter >
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:150
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:784
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJamInnerLoopThreshold
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
Definition: TargetTransformInfo.h:507
Local.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
llvm::TargetTransformInfo::UnrollingPreferences::UnrollRemainder
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
Definition: TargetTransformInfo.h:500
llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:819
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:179
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:214
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
EnableMaskedLoadStores
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
llvm::MemOp
Definition: TargetLowering.h:111
APInt.h
llvm::ARMTTIImpl::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1555
llvm::getLoadStoreType
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Definition: Instructions.h:5362
llvm::HardwareLoopInfo::isHardwareLoopCandidate
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
Definition: TargetTransformInfo.cpp:103
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:746
llvm::ARMTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:456
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:484
llvm::findDefsUsedOutsideOfLoop
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:126
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::getPtrStride
int64_t getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
Definition: LoopAccessAnalysis.cpp:1186
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1411
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:538
llvm::tgtok::Bits
@ Bits
Definition: TGLexer.h:50
llvm::Instruction::isShift
bool isShift() const
Definition: Instruction.h:164
llvm::BasicTTIImplBase< ARMTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:866
llvm::SPF_UMAX
@ SPF_UMAX
Signed maximum.
Definition: ValueTracking.h:703
llvm::Optional
Definition: APInt.h:33
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:420
llvm::TargetLoweringBase::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: TargetLoweringBase.cpp:1847
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:749
RHS
Value * RHS
Definition: X86PartialReduction.cpp:76
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::matchSelectPattern
SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
Definition: ValueTracking.cpp:6350
llvm::SelectPatternFlavor
SelectPatternFlavor
Specific patterns of select instructions we can match.
Definition: ValueTracking.h:698
llvm::MVT::v2f64
@ v2f64
Definition: MachineValueType.h:178
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:525
llvm::count_if
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1716
llvm::TargetLoweringBase::getOperationAction
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
Definition: TargetLowering.h:1091
llvm::ARMTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: ARMTargetTransformInfo.cpp:2260
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:239
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:201
llvm::APIntOps::umin
const APInt & umin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be unsigned.
Definition: APInt.h:2137
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1300
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:883
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::ARMTTIImpl::isLegalMaskedLoad
bool isLegalMaskedLoad(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1095
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1366
llvm::ARMTTIImpl::isHardwareLoopProfitable
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: ARMTargetTransformInfo.cpp:1964
KnownBits.h
llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(Optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1218
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
llvm::HardwareLoopInfo::IsNestingLegal
bool IsNestingLegal
Definition: TargetTransformInfo.h:105
floor
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g floor
Definition: README-FPStack.txt:54
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:129
MachineValueType.h
UnrollCount
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:186
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
Instruction.h
llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition: DerivedTypes.h:568
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
LHS
Value * LHS
Definition: X86PartialReduction.cpp:75
llvm::IntrinsicCostAttributes::getArgTypes
const SmallVectorImpl< Type * > & getArgTypes() const
Definition: TargetTransformInfo.h:154
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::InstCombiner::replaceOperand
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:438
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1374
llvm::MVT::i1
@ i1
Definition: MachineValueType.h:43
llvm::MVT::v8f16
@ v8f16
Definition: MachineValueType.h:143
llvm::BasicTTIImplBase::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:516
llvm::ARM_AM::getSOImmVal
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
Definition: ARMAddressingModes.h:163
SubtargetFeature.h
TargetMachine.h
llvm::TypeConversionCostTblEntryT
Type Conversion Cost Table.
Definition: CostTable.h:55
llvm::ScalarEvolution::getOne
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
Definition: ScalarEvolution.h:645
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:713
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:763
EnableTailPredication
cl::opt< TailPredication::Mode > EnableTailPredication
llvm::BasicTTIImplBase< ARMTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1149
llvm::ARMTTIImpl::getIntImmCostInst
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: ARMTargetTransformInfo.cpp:385
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:882
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1103
llvm::User
Definition: User.h:44
llvm::TargetLoweringBase::getMaxStoresPerMemset
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
Definition: TargetLowering.h:1639
llvm::BasicTTIImplBase< ARMTTIImpl >::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
Definition: BasicTTIImpl.h:2125
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:34
llvm::getKnownAlignment
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:222
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:496
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:58
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:56
llvm::EVT::getVectorNumElements
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:308
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2243
AllowWLSLoops
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:769
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:194
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:227
llvm::ARMTTIImpl::getIntImmCodeSizeCost
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty)
Definition: ARMTargetTransformInfo.cpp:328
llvm::MVT::v16i1
@ v16i1
Definition: MachineValueType.h:70
llvm::ISD::UDIVREM
@ UDIVREM
Definition: ISDOpcodes.h:256
llvm::dwarf::Index
Index
Definition: Dwarf.h:472
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
Definition: PatternMatch.h:2231
llvm::ARMTargetLowering::useSoftFloat
bool useSoftFloat() const override
Definition: ARMISelLowering.cpp:1602
llvm::MVT::v8i1
@ v8i1
Definition: MachineValueType.h:69
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJam
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
Definition: TargetTransformInfo.h:502
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:187
llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:147
llvm::EVT::isInteger
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:144
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::HardwareLoopInfo::PerformEntryTest
bool PerformEntryTest
Definition: TargetTransformInfo.h:109
canTailPredicateLoop
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2110
llvm::ARMTTIImpl::isLoweredToCall
bool isLoweredToCall(const Function *F)
Definition: ARMTargetTransformInfo.cpp:1816
llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:773
llvm::InstCombiner::eraseInstFromFunction
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
llvm::APInt::getHighBitsSet
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:279
llvm::LoopBase::getExitingBlocks
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
Definition: LoopInfoImpl.h:33
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:928
LoopUtils.h
llvm::HardwareLoopInfo::CounterInReg
bool CounterInReg
Definition: TargetTransformInfo.h:107
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:666
llvm::ARMTTIImpl::emitGetActiveLaneMask
bool emitGetActiveLaneMask() const
Definition: ARMTargetTransformInfo.cpp:2250
PatternMatch.h
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
llvm::MVT::v1i64
@ v1i64
Definition: MachineValueType.h:123
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
llvm::isVREVMask
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
Definition: ARMTargetTransformInfo.h:320
llvm::ARM_AM::getT2SOImmVal
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
Definition: ARMAddressingModes.h:320
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::BasicTTIImplBase< ARMTTIImpl >::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1098
llvm::BasicTTIImplBase< ARMTTIImpl >::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:928
llvm::None
const NoneType None
Definition: None.h:24
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:97
llvm::lltok::Kind
Kind
Definition: LLToken.h:18
llvm::ARMSubtarget::getMVEVectorCostFactor
unsigned getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const
Definition: ARMSubtarget.h:541
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
llvm::MVT::v4i8
@ v4i8
Definition: MachineValueType.h:84
llvm::SPF_SMIN
@ SPF_SMIN
Definition: ValueTracking.h:700
Type.h
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:118
llvm::Instruction::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:279
llvm::maxnum
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1307
LoopInfo.h
llvm::ARMTTIImpl::getNumMemOps
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
Definition: ARMTargetTransformInfo.cpp:1127
llvm::ARMTTIImpl::isProfitableLSRChainElement
bool isProfitableLSRChainElement(Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1078
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:74
llvm::ARMTTIImpl::preferPredicateOverEpilogue
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2200
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4406
llvm::PatternMatch::m_Xor
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1099
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:191
llvm::APInt::getOneBitSet
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:222
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:884
llvm::MVT::v2i8
@ v2i8
Definition: MachineValueType.h:83
llvm::MVT::v4i64
@ v4i64
Definition: MachineValueType.h:126
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::ARMTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1450
llvm::TargetTransformInfo::CastContextHint::Normal
@ Normal
The cast is used with a normal load/store.
BasicBlock.h
llvm::cl::opt< bool >
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:75
llvm::ARMTTIImpl::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1680
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:531
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::MVT::v16i8
@ v16i8
Definition: MachineValueType.h:86
llvm::CostTableLookup
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType >> Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:340
llvm::ARMSubtarget::isThumb1Only
bool isThumb1Only() const
Definition: ARMSubtarget.h:422
llvm::ScalarEvolution::getUnsignedRangeMax
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
Definition: ScalarEvolution.h:956
llvm::ARMTTIImpl::getMemcpyCost
InstructionCost getMemcpyCost(const Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1193
llvm::ARMTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Op1Info=TTI::OK_AnyValue, TTI::OperandValueKind Op2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: ARMTargetTransformInfo.cpp:1308
llvm::InstCombiner::getAssumptionCache
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:368
llvm::MVT::v16i16
@ v16i16
Definition: MachineValueType.h:99
llvm::MVT::v2i64
@ v2i64
Definition: MachineValueType.h:124
uint64_t
llvm::InstCombiner::getDataLayout
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:371
llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition: ISDOpcodes.h:820
llvm::ARMTTIImpl::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr)
Definition: ARMTargetTransformInfo.cpp:1056
llvm::MVT::v16f32
@ v16f32
Definition: MachineValueType.h:168
llvm::Instruction::user_back
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:88
llvm::TruncInst
This class represents a truncation of integer types.
Definition: Instructions.h:4767
llvm::TargetTransformInfo::OK_UniformConstantValue
@ OK_UniformConstantValue
Definition: TargetTransformInfo.h:904
llvm::ARMSubtarget::isMClass
bool isMClass() const
Definition: ARMSubtarget.h:424
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:431
llvm::ARMTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=None)
Definition: ARMTargetTransformInfo.cpp:1203
isSSATMinMaxPattern
static Value * isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
Definition: ARMTargetTransformInfo.cpp:339
I
#define I(x, y, z)
Definition: MD5.cpp:58
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:160
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:909
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
llvm::ARMTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1711
llvm::LoopAccessInfo
Drive the analysis of memory accesses in the loop.
Definition: LoopAccessAnalysis.h:559
llvm::ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic
Optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
Definition: ARMTargetTransformInfo.cpp:251
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
llvm::MVT::v4f32
@ v4f32
Definition: MachineValueType.h:163
llvm::MVT::i8
@ i8
Definition: MachineValueType.h:46
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
llvm::MVT::Other
@ Other
Definition: MachineValueType.h:42
llvm::SPF_ABS
@ SPF_ABS
Floating point maxnum.
Definition: ValueTracking.h:706
memcpy
<%struct.s * > cast struct s *S to sbyte *< sbyte * > sbyte uint cast struct s *agg result to sbyte *< sbyte * > sbyte uint cast struct s *memtmp to sbyte *< sbyte * > sbyte uint ret void llc ends up issuing two memcpy or custom lower memcpy(of small size) to be ldmia/stmia. I think option 2 is better but the current register allocator cannot allocate a chunk of registers at a time. A feasible temporary solution is to use specific physical registers at the lowering time for small(<
llvm::TargetLoweringBase::getMaxStoresPerMemcpy
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
Definition: TargetLowering.h:1649
llvm::TargetTransformInfoImplBase::isLoweredToCall
bool isLoweredToCall(const Function *F) const
Definition: TargetTransformInfoImpl.h:121
llvm::ARMTTIImpl::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1497
llvm::ARMTTIImpl::preferInLoopReduction
bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2368
function
Definition: MemDepPrinter.cpp:82
llvm::IRBuilderBase::CreateIntrinsic
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with args, mangled using Types.
Definition: IRBuilder.cpp:886
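Illustrative sketch only (not code from this file): with an existing IRBuilder Builder and an existing i32 Value *X, an overloaded intrinsic such as llvm.ctlz can be emitted like this (requires llvm/IR/IRBuilder.h):
  // Emit ctlz(X, /*is_zero_undef=*/false); the {i32} type list selects the
  // i32 overload of the intrinsic. Builder and X are assumed to exist.
  Value *Ctlz = Builder.CreateIntrinsic(Intrinsic::ctlz,
                                        {Builder.getInt32Ty()},
                                        {X, Builder.getFalse()});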
llvm::BasicTTIImplBase< ARMTTIImpl >::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:1250
llvm::IRBuilderBase::getTrue
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:441
llvm::PatternMatch::m_Constant
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:144
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:901
Builder
Definition: AssumeBundleBuilder.cpp:651
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
llvm::ARMTTIImpl::getPreferredAddressingMode
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
Definition: ARMTargetTransformInfo.cpp:104
llvm::APIntOps::smin
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2127
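For example (a minimal sketch, not from this file; requires llvm/ADT/APInt.h), the signed interpretation matters once the sign bit is set:
  // In 8 bits, 200 (0xC8) is -56 when read as signed, so it is the signed
  // minimum of the pair even though it is the unsigned maximum.
  APInt A(8, 200), B(8, 100);
  assert(APIntOps::smin(A, B) == A);
  assert(APIntOps::umin(A, B) == B);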
llvm::ArrayRef< int >
llvm::LoopInfo
Definition: LoopInfo.h:1102
llvm::EVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:154
ARMAddressingModes.h
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
DataLayout.h
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:49
llvm::MVT::v2i32
@ v2i32
Definition: MachineValueType.h:107
llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:352
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::ARMTTIImpl::isLegalMaskedGather
bool isLegalMaskedGather(Type *Ty, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1115
llvm::BasicTTIImplBase< ARMTTIImpl >::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:701
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:215
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::ISD::SREM
@ SREM
Definition: ISDOpcodes.h:244
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
if
Definition: CMakeLists.txt:14
llvm::MVT::v2f32
@ v2f32
Definition: MachineValueType.h:161
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:991
llvm::ARMTTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:910
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:467
trunc
Definition: README-FPStack.txt:63
llvm::SPF_FMINNUM
@ SPF_FMINNUM
Floating point minnum.
Definition: ValueTracking.h:704
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1767
llvm::TargetTransformInfo::AddressingModeKind
AddressingModeKind
Definition: TargetTransformInfo.h:642
llvm::MVT::v4i32
@ v4i32
Definition: MachineValueType.h:109
llvm::BasicTTIImplBase< ARMTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:588
llvm::SPF_FMAXNUM
@ SPF_FMAXNUM
Floating point maxnum.
Definition: ValueTracking.h:705
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
canTailPredicateInstruction
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
Definition: ARMTargetTransformInfo.cpp:2059
llvm::MVT::v8i64
@ v8i64
Definition: MachineValueType.h:127
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:668
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:664
llvm::ScalarEvolution::isLoopInvariant
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
Definition: ScalarEvolution.cpp:13240
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:417
llvm::MVT::v16i32
@ v16i32
Definition: MachineValueType.h:114
llvm::MemOp::Set
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
Definition: TargetLowering.h:144
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:164
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:347
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any other (natural) loops.
Definition: LoopInfo.h:181
llvm::APIntOps::umax
const APInt & umax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be unsigned.
Definition: APInt.h:2142
llvm::minnum
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE minNum semantics.
Definition: APFloat.h:1296
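A short sketch of the NaN behaviour this implements (illustrative only, requires llvm/ADT/APFloat.h):
  // When exactly one operand is NaN, minnum returns the non-NaN operand.
  APFloat A(2.0);
  APFloat NaN = APFloat::getNaN(APFloat::IEEEdouble());
  assert(minnum(A, NaN).convertToDouble() == 2.0);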
llvm::MemOp::Copy
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
Definition: TargetLowering.h:129
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::getIntNTy
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:243
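For instance (a sketch; Ctx stands for an existing LLVMContext, requires llvm/IR/Type.h):
  // Request a 24-bit integer type, a width that has no dedicated
  // getInt8Ty/getInt32Ty-style helper.
  IntegerType *I24Ty = Type::getIntNTy(Ctx, 24);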
isFPSatMinMaxPattern
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm)
Definition: ARMTargetTransformInfo.cpp:373
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:148
llvm::BasicTTIImplBase< ARMTTIImpl >::getCallInstrCost
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
Definition: BasicTTIImpl.h:2113
CostTable.h
llvm::EVT::getScalarType
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:295
llvm::TargetTransformInfo::UnrollingPreferences::UpperBound
bool UpperBound
Allow using trip count upper bound to unroll loops.
Definition: TargetTransformInfo.h:498
llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
llvm::CallBase::arg_size
unsigned arg_size() const
Definition: InstrTypes.h:1339
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:197
llvm::MVT::v8i16
@ v8i16
Definition: MachineValueType.h:98
llvm::ARMTTIImpl::getIntImmCost
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:293
ISDOpcodes.h
llvm::TypeSize
Definition: TypeSize.h:435
Casting.h
llvm::BasicTTIImplBase< ARMTTIImpl >::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:1234
llvm::Value::hasNUses
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:145
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::MVT::i32
@ i32
Definition: MachineValueType.h:48
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:222
llvm::ISD::SDIV
@ SDIV
Definition: ISDOpcodes.h:242
powi
README note about recognizing X*X*X as a single powi(X, 3) call; currently blocked on other handling.
llvm::log2
static double log2(double V)
Definition: AMDGPULibCalls.cpp:802
llvm::APInt::getSplat
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:612
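For example (illustrative only, requires llvm/ADT/APInt.h):
  // Broadcast the 8-bit pattern 0x01 over 32 bits, giving 0x01010101.
  APInt Splat = APInt::getSplat(32, APInt(8, 0x01));
  assert(Splat == APInt(32, 0x01010101));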
llvm::InstCombiner::SimplifyDemandedBits
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, unsigned Depth=0)=0
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:185
llvm::MVT::v8i32
@ v8i32
Definition: MachineValueType.h:113
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition: ISDOpcodes.h:774
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::TargetLoweringBase::isOperationLegalOrCustomOrPromote
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
Definition: TargetLowering.h:1186
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:46
llvm::HardwareLoopInfo
Attributes of a target dependent hardware loop.
Definition: TargetTransformInfo.h:95
llvm::RISCVMatInt::Imm
@ Imm
Definition: RISCVMatInt.h:23
llvm::TargetTransformInfoImplBase::isConstantStridedAccessLessThan
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
Definition: TargetTransformInfoImpl.h:889
llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:871
llvm::BasicTTIImplBase< ARMTTIImpl >::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1241
llvm::ARMTTIImpl::maybeLoweredToCall
bool maybeLoweredToCall(Instruction &I)
Definition: ARMTargetTransformInfo.cpp:1877
Instructions.h
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:148
llvm::ARMTTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:470
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:691
SmallVector.h
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:766
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::ISD::UREM
@ UREM
Definition: ISDOpcodes.h:245
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:56
llvm::TailPredication::EnabledNoReductions
@ EnabledNoReductions
Definition: ARMTargetTransformInfo.h:44
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1341
llvm::BasicTTIImplBase< ARMTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1394
llvm::ARMTTIImpl::isLegalMaskedStore
bool isLegalMaskedStore(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.h:188
llvm::SPF_UMIN
@ SPF_UMIN
Unsigned minimum.
Definition: ValueTracking.h:701
llvm::getBooleanLoopAttribute
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1084
llvm::ARMTargetLowering::getNumInterleavedAccesses
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
Definition: ARMISelLowering.cpp:21220
simplifyNeonVld1
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
Definition: ARMTargetTransformInfo.cpp:66
llvm::ARMTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: ARMTargetTransformInfo.cpp:2363
llvm::BasicTTIImplBase< ARMTTIImpl >::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1190
ARMTargetTransformInfo.h
llvm::HardwareLoopInfo::canAnalyze
bool canAnalyze(LoopInfo &LI)
Definition: TargetTransformInfo.cpp:50
llvm::IRBuilderBase::CreateVectorSplat
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to all NumElts elements.
Definition: IRBuilder.cpp:1143
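A hedged sketch of a typical use (Builder is an existing IRBuilder and Scalar an existing i32 Value*, both assumed here):
  // Produce a <4 x i32> whose four lanes all equal Scalar; typically lowered
  // as an insertelement followed by a zero-mask shufflevector.
  Value *SplatVec = Builder.CreateVectorSplat(4, Scalar);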
DerivedTypes.h
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:392
llvm::getLoadStorePointerOperand
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Definition: Instructions.h:5317
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
EnableMaskedGatherScatters
cl::opt< bool > EnableMaskedGatherScatters
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2453
llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:289
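For example (illustrative, requires llvm/ADT/APInt.h):
  // 32-bit mask with only the bottom 8 bits set, i.e. 0x000000FF.
  APInt LowByte = APInt::getLowBitsSet(32, 8);
  assert(LowByte == APInt(32, 0xFF));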
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:47
llvm::TargetTransformInfo::UnrollingPreferences::OptSizeThreshold
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Definition: TargetTransformInfo.h:453
BB
Definition: README.txt:39
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:171
llvm::ARMTTIImpl::preferPredicatedReductionSelect
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2382
llvm::ISD::SDIVREM
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
llvm::ARMTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: ARMTargetTransformInfo.cpp:874
llvm::ARMSubtarget::isThumb2
bool isThumb2() const
Definition: ARMSubtarget.h:423
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:394
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:405
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1459
llvm::LoopAccessInfo::getPSE
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
Definition: LoopAccessAnalysis.h:636
llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:760
llvm::MVT::v8i8
@ v8i8
Definition: MachineValueType.h:85
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
Definition: ScalarEvolution.cpp:7901
llvm::MVT::v8f32
@ v8f32
Definition: MachineValueType.h:167
llvm::PredicatedScalarEvolution::getSE
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
Definition: ScalarEvolution.h:2208
llvm::ARMTTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: ARMTargetTransformInfo.cpp:120
llvm::BinaryOperator::Create
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), Instruction *InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Definition: Instructions.cpp:2778
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:95
llvm::MVT::v16i64
@ v16i64
Definition: MachineValueType.h:128
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1282
llvm::BasicTTIImplBase< ARMTTIImpl >::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2316
llvm::HardwareLoopInfo::CountType
IntegerType * CountType
Definition: TargetTransformInfo.h:102
llvm::EVT::isFixedLengthVector
bool isFixedLengthVector() const
Definition: ValueTypes.h:164
llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:852
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:57
llvm::ARMTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1655
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:212
llvm::EVT::isFloatingPoint
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:139
llvm::TargetTransformInfo::AMK_None
@ AMK_None
Definition: TargetTransformInfo.h:645
llvm::TargetTransformInfo::AMK_PreIndexed
@ AMK_PreIndexed
Definition: TargetTransformInfo.h:643
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2132
llvm::ARMSubtarget::hasVFP2Base
bool hasVFP2Base() const
Definition: ARMSubtarget.h:334
llvm::TargetLoweringBase::LibCall
@ LibCall
Definition: TargetLowering.h:199
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:164
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:288
llvm::MVT::v4i1
@ v4i1
Definition: MachineValueType.h:68
llvm::ARMTargetLowering::isLegalInterleavedAccessType
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
Definition: ARMISelLowering.cpp:21225
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
llvm::BasicTTIImplBase< ARMTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=None)
Definition: BasicTTIImpl.h:899
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:506
llvm::TailPredication::ForceEnabledNoReductions
@ ForceEnabledNoReductions
Definition: ARMTargetTransformInfo.h:46