1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
10 #include "ARMSubtarget.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
18 #include "llvm/IR/BasicBlock.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DerivedTypes.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/Intrinsics.h"
24 #include "llvm/IR/IntrinsicInst.h"
25 #include "llvm/IR/IntrinsicsARM.h"
26 #include "llvm/IR/PatternMatch.h"
27 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/KnownBits.h"
36 #include <algorithm>
37 #include <cassert>
38 #include <cstdint>
39 #include <utility>
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "armtti"
44 
46  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47  cl::desc("Enable the generation of masked loads and stores"));
48 
50  "disable-arm-loloops", cl::Hidden, cl::init(false),
51  cl::desc("Disable the generation of low-overhead loops"));
52 
53 static cl::opt<bool>
54  AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55  cl::desc("Enable the generation of WLS loops"));
56 
57 extern cl::opt<TailPredication::Mode> EnableTailPredication;
58 
59 extern cl::opt<bool> EnableMaskedGatherScatters;
60 
61 extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
62 
63 /// Convert a vector load intrinsic into a simple llvm load instruction.
64 /// This is beneficial when the underlying object being addressed comes
65 /// from a constant, since we get constant-folding for free.
66 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
67  InstCombiner::BuilderTy &Builder) {
68  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69 
70  if (!IntrAlign)
71  return nullptr;
72 
73  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74  ? MemAlign
75  : IntrAlign->getLimitedValue();
76 
77  if (!isPowerOf2_32(Alignment))
78  return nullptr;
79 
80  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81  PointerType::get(II.getType(), 0));
82  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83 }
84 
85 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
86  const Function *Callee) const {
87  const TargetMachine &TM = getTLI()->getTargetMachine();
88  const FeatureBitset &CallerBits =
89  TM.getSubtargetImpl(*Caller)->getFeatureBits();
90  const FeatureBitset &CalleeBits =
91  TM.getSubtargetImpl(*Callee)->getFeatureBits();
92 
93  // To inline a callee, all features not in the allowed list must match exactly.
94  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95  (CalleeBits & ~InlineFeaturesAllowed);
96  // For features in the allowed list, the callee's features must be a subset of
97  // the callers'.
98  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99  (CalleeBits & InlineFeaturesAllowed);
100  return MatchExact && MatchSubset;
101 }
102 
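// Pick a preferred addressing mode for loops: post-indexed when MVE is
// available (it pairs well with auto-increment loads and stores), none when
// optimising for size, and pre-indexed for simple single-block Thumb2
// M-class loops.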
103 TTI::AddressingModeKind
104 ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
105  ScalarEvolution *SE) const {
106  if (ST->hasMVEIntegerOps())
107  return TTI::AMK_PostIndexed;
108 
109  if (L->getHeader()->getParent()->hasOptSize())
110  return TTI::AMK_None;
111 
112  if (ST->isMClass() && ST->isThumb2() &&
113  L->getNumBlocks() == 1)
114  return TTI::AMK_PreIndexed;
115 
116  return TTI::AMK_None;
117 }
118 
119 Optional<Instruction *>
120 ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
121  using namespace PatternMatch;
122  Intrinsic::ID IID = II.getIntrinsicID();
123  switch (IID) {
124  default:
125  break;
126  case Intrinsic::arm_neon_vld1: {
127  Align MemAlign =
128  getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
129  &IC.getAssumptionCache(), &IC.getDominatorTree());
130  if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131  return IC.replaceInstUsesWith(II, V);
132  }
133  break;
134  }
135 
136  case Intrinsic::arm_neon_vld2:
137  case Intrinsic::arm_neon_vld3:
138  case Intrinsic::arm_neon_vld4:
139  case Intrinsic::arm_neon_vld2lane:
140  case Intrinsic::arm_neon_vld3lane:
141  case Intrinsic::arm_neon_vld4lane:
142  case Intrinsic::arm_neon_vst1:
143  case Intrinsic::arm_neon_vst2:
144  case Intrinsic::arm_neon_vst3:
145  case Intrinsic::arm_neon_vst4:
146  case Intrinsic::arm_neon_vst2lane:
147  case Intrinsic::arm_neon_vst3lane:
148  case Intrinsic::arm_neon_vst4lane: {
149  Align MemAlign =
150  getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
151  &IC.getAssumptionCache(), &IC.getDominatorTree());
152  unsigned AlignArg = II.getNumArgOperands() - 1;
153  Value *AlignArgOp = II.getArgOperand(AlignArg);
154  MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155  if (Align && *Align < MemAlign) {
156  return IC.replaceOperand(
157  II, AlignArg,
158  ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
159  false));
160  }
161  break;
162  }
163 
164  case Intrinsic::arm_mve_pred_i2v: {
165  Value *Arg = II.getArgOperand(0);
166  Value *ArgArg;
167  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168  PatternMatch::m_Value(ArgArg))) &&
169  II.getType() == ArgArg->getType()) {
170  return IC.replaceInstUsesWith(II, ArgArg);
171  }
172  Constant *XorMask;
173  if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174  PatternMatch::m_Value(ArgArg)),
175  PatternMatch::m_Constant(XorMask))) &&
176  II.getType() == ArgArg->getType()) {
177  if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178  if (CI->getValue().trunc(16).isAllOnesValue()) {
179  auto TrueVector = IC.Builder.CreateVectorSplat(
180  cast<FixedVectorType>(II.getType())->getNumElements(),
181  IC.Builder.getTrue());
182  return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183  }
184  }
185  }
186  KnownBits ScalarKnown(32);
187  if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188  ScalarKnown, 0)) {
189  return &II;
190  }
191  break;
192  }
193  case Intrinsic::arm_mve_pred_v2i: {
194  Value *Arg = II.getArgOperand(0);
195  Value *ArgArg;
196  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197  PatternMatch::m_Value(ArgArg)))) {
198  return IC.replaceInstUsesWith(II, ArgArg);
199  }
200  if (!II.getMetadata(LLVMContext::MD_range)) {
201  Type *IntTy32 = Type::getInt32Ty(II.getContext());
202  Metadata *M[] = {
203  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
204  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))};
205  II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206  return &II;
207  }
208  break;
209  }
210  case Intrinsic::arm_mve_vadc:
211  case Intrinsic::arm_mve_vadc_predicated: {
212  unsigned CarryOp =
213  (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214  assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
215  "Bad type for intrinsic!");
216 
217  KnownBits CarryKnown(32);
218  if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219  CarryKnown)) {
220  return &II;
221  }
222  break;
223  }
224  case Intrinsic::arm_mve_vmldava: {
225  Instruction *I = cast<Instruction>(&II);
226  if (I->hasOneUse()) {
227  auto *User = cast<Instruction>(*I->user_begin());
228  Value *OpZ;
229  if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230  match(I->getOperand(3), m_Zero())) {
231  Value *OpX = I->getOperand(4);
232  Value *OpY = I->getOperand(5);
233  Type *OpTy = OpX->getType();
234 
235  IC.Builder.SetInsertPoint(User);
236  Value *V =
237  IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238  {I->getOperand(0), I->getOperand(1),
239  I->getOperand(2), OpZ, OpX, OpY});
240 
241  IC.replaceInstUsesWith(*User, V);
242  return IC.eraseInstFromFunction(*User);
243  }
244  }
245  return None;
246  }
247  }
248  return None;
249 }
250 
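// Estimate the cost of materialising a scalar integer immediate: 1 when it
// fits a mov/movw or a (possibly inverted) shifter-operand encoding, more
// when a movw/movt pair or a constant-pool load is needed.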
251 int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
252  TTI::TargetCostKind CostKind) {
253  assert(Ty->isIntegerTy());
254 
255  unsigned Bits = Ty->getPrimitiveSizeInBits();
256  if (Bits == 0 || Imm.getActiveBits() >= 64)
257  return 4;
258 
259  int64_t SImmVal = Imm.getSExtValue();
260  uint64_t ZImmVal = Imm.getZExtValue();
261  if (!ST->isThumb()) {
262  if ((SImmVal >= 0 && SImmVal < 65536) ||
263  (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
264  (ARM_AM::getSOImmVal(~ZImmVal) != -1))
265  return 1;
266  return ST->hasV6T2Ops() ? 2 : 3;
267  }
268  if (ST->isThumb2()) {
269  if ((SImmVal >= 0 && SImmVal < 65536) ||
270  (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
271  (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
272  return 1;
273  return ST->hasV6T2Ops() ? 2 : 3;
274  }
275  // Thumb1, any i8 imm cost 1.
276  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
277  return 1;
278  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
279  return 2;
280  // Load from constantpool.
281  return 3;
282 }
283 
284 // Constants smaller than 256 fit in the immediate field of
285 // Thumb1 instructions so we return a zero cost and 1 otherwise.
286 int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
287  const APInt &Imm, Type *Ty) {
288  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
289  return 0;
290 
291  return 1;
292 }
293 
294 // Checks whether Inst is part of a min(max()) or max(min()) pattern
295 // that will match to an SSAT instruction
296 static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
297  Value *LHS, *RHS;
298  ConstantInt *C;
299  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
300 
301  if (InstSPF == SPF_SMAX &&
302  PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
303  C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
304 
305  auto isSSatMin = [&](Value *MinInst) {
306  if (isa<SelectInst>(MinInst)) {
307  Value *MinLHS, *MinRHS;
308  ConstantInt *MinC;
309  SelectPatternFlavor MinSPF =
310  matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
311  if (MinSPF == SPF_SMIN &&
312  PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
313  MinC->getValue() == ((-Imm) - 1))
314  return true;
315  }
316  return false;
317  };
318 
319  if (isSSatMin(Inst->getOperand(1)) ||
320  (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
321  isSSatMin(*(++Inst->user_begin())))))
322  return true;
323  }
324  return false;
325 }
326 
327 int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
328  const APInt &Imm, Type *Ty,
329  TTI::TargetCostKind CostKind,
330  Instruction *Inst) {
331  // Division by a constant can be turned into multiplication, but only if we
332  // know it's constant. So it's not so much that the immediate is cheap (it's
333  // not), but that the alternative is worse.
334  // FIXME: this is probably unneeded with GlobalISel.
335  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
336  Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
337  Idx == 1)
338  return 0;
339 
340  if (Opcode == Instruction::And) {
341  // UXTB/UXTH
342  if (Imm == 255 || Imm == 65535)
343  return 0;
344  // Conversion to BIC is free, and means we can use ~Imm instead.
345  return std::min(getIntImmCost(Imm, Ty, CostKind),
346  getIntImmCost(~Imm, Ty, CostKind));
347  }
348 
349  if (Opcode == Instruction::Add)
350  // Conversion to SUB is free, and means we can use -Imm instead.
351  return std::min(getIntImmCost(Imm, Ty, CostKind),
352  getIntImmCost(-Imm, Ty, CostKind));
353 
354  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
355  Ty->getIntegerBitWidth() == 32) {
356  int64_t NegImm = -Imm.getSExtValue();
357  if (ST->isThumb2() && NegImm < 1<<12)
358  // icmp X, #-C -> cmn X, #C
359  return 0;
360  if (ST->isThumb() && NegImm < 1<<8)
361  // icmp X, #-C -> adds X, #C
362  return 0;
363  }
364 
365  // xor a, -1 can always be folded to MVN
366  if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
367  return 0;
368 
369  // Ensures negative constant of min(max()) or max(min()) patterns that
370  // match to SSAT instructions don't get hoisted
371  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
372  Ty->getIntegerBitWidth() <= 32) {
373  if (isSSATMinMaxPattern(Inst, Imm) ||
374  (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
375  isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
376  return 0;
377  }
378 
379  return getIntImmCost(Imm, Ty, CostKind);
380 }
381 
382 InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
383  TTI::TargetCostKind CostKind,
384  const Instruction *I) {
385  if (CostKind == TTI::TCK_RecipThroughput &&
386  (ST->hasNEON() || ST->hasMVEIntegerOps())) {
388  // FIXME: The vectorizer is highly sensitive to the cost of these
388  // instructions, which suggests that it may be using the costs incorrectly.
389  // But, for now, just make them free to avoid performance regressions for
390  // vector targets.
391  return 0;
392  }
393  return BaseT::getCFInstrCost(Opcode, CostKind, I);
394 }
395 
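// Cost of cast instructions. Free or cheap cases (extending loads, truncating
// stores, NEON/MVE lane-widening operations) are handled via the conversion
// cost tables below; anything else falls back to the base implementation,
// scaled by the MVE cost factor for vector types.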
396 InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
397  Type *Src,
398  TTI::CastContextHint CCH,
399  TTI::TargetCostKind CostKind,
400  const Instruction *I) {
401  int ISD = TLI->InstructionOpcodeToISD(Opcode);
402  assert(ISD && "Invalid opcode");
403 
404  // TODO: Allow non-throughput costs that aren't binary.
405  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
406  if (CostKind != TTI::TCK_RecipThroughput)
407  return Cost == 0 ? 0 : 1;
408  return Cost;
409  };
410  auto IsLegalFPType = [this](EVT VT) {
411  EVT EltVT = VT.getScalarType();
412  return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
413  (EltVT == MVT::f64 && ST->hasFP64()) ||
414  (EltVT == MVT::f16 && ST->hasFullFP16());
415  };
416 
417  EVT SrcTy = TLI->getValueType(DL, Src);
418  EVT DstTy = TLI->getValueType(DL, Dst);
419 
420  if (!SrcTy.isSimple() || !DstTy.isSimple())
421  return AdjustCost(
422  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
423 
424  // Extending masked load/Truncating masked stores is expensive because we
425  // currently don't split them. This means that we'll likely end up
426  // loading/storing each element individually (hence the high cost).
427  if ((ST->hasMVEIntegerOps() &&
428  (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
429  Opcode == Instruction::SExt)) ||
430  (ST->hasMVEFloatOps() &&
431  (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
432  IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
433  if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
434  return 2 * DstTy.getVectorNumElements() *
435  ST->getMVEVectorCostFactor(CostKind);
436 
437  // The extend of other kinds of load is free
438  if (CCH == TTI::CastContextHint::Normal ||
439  CCH == TTI::CastContextHint::Masked) {
440  static const TypeConversionCostTblEntry LoadConversionTbl[] = {
453  };
454  if (const auto *Entry = ConvertCostTableLookup(
455  LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
456  return AdjustCost(Entry->Cost);
457 
458  static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
465  // The following extend from a legal type to an illegal type, so we need to
466  // split the load. This introduces an extra load operation, but the
467  // extend is still "free".
474  };
475  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
476  if (const auto *Entry =
477  ConvertCostTableLookup(MVELoadConversionTbl, ISD,
478  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
479  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
480  }
481 
482  static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
483  // FPExtends are similar but also require the VCVT instructions.
486  };
487  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
488  if (const auto *Entry =
489  ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
490  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
491  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
492  }
493 
494  // The truncate of a store is free. This is the mirror of extends above.
495  static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
503  };
504  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
505  if (const auto *Entry =
506  ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
507  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
508  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
509  }
510 
511  static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
514  };
515  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
516  if (const auto *Entry =
517  ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
518  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
519  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
520  }
521  }
522 
523  // NEON vector operations that can extend their inputs.
524  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
525  I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
526  static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
527  // vaddl
528  { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
529  { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
530  // vsubl
531  { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
532  { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
533  // vmull
534  { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
535  { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
536  // vshll
537  { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
538  { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
539  };
540 
541  auto *User = cast<Instruction>(*I->user_begin());
542  int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
543  if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
544  DstTy.getSimpleVT(),
545  SrcTy.getSimpleVT())) {
546  return AdjustCost(Entry->Cost);
547  }
548  }
549 
550  // Single to/from double precision conversions.
551  if (Src->isVectorTy() && ST->hasNEON() &&
552  ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
553  DstTy.getScalarType() == MVT::f32) ||
554  (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
555  DstTy.getScalarType() == MVT::f64))) {
556  static const CostTblEntry NEONFltDblTbl[] = {
557  // Vector fptrunc/fpext conversions.
560  {ISD::FP_EXTEND, MVT::v4f32, 4}};
561 
562  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
563  if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
564  return AdjustCost(LT.first * Entry->Cost);
565  }
566 
567  // Some arithmetic, load and store operations have specific instructions
568  // to cast up/down their types automatically at no extra cost.
569  // TODO: Get these tables to know at least what the related operations are.
570  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
577 
578  // The number of vmovl instructions for the extension.
597 
598  // Operations that we legalize using splitting.
601 
602  // Vector float <-> i32 conversions.
605 
626 
633 
634  // Vector double <-> i32 conversions.
637 
644 
651  };
652 
653  if (SrcTy.isVector() && ST->hasNEON()) {
654  if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
655  DstTy.getSimpleVT(),
656  SrcTy.getSimpleVT()))
657  return AdjustCost(Entry->Cost);
658  }
659 
660  // Scalar float to integer conversions.
661  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
682  };
683  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
684  if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
685  DstTy.getSimpleVT(),
686  SrcTy.getSimpleVT()))
687  return AdjustCost(Entry->Cost);
688  }
689 
690  // Scalar integer to float conversions.
691  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
712  };
713 
714  if (SrcTy.isInteger() && ST->hasNEON()) {
715  if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
716  ISD, DstTy.getSimpleVT(),
717  SrcTy.getSimpleVT()))
718  return AdjustCost(Entry->Cost);
719  }
720 
721  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
722  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sexts
723  // are linearised so take more.
724  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
737  };
738 
739  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
740  if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
741  ISD, DstTy.getSimpleVT(),
742  SrcTy.getSimpleVT()))
743  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
744  }
745 
746  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
747  // As a general rule, fp converts that were not matched above are scalarized
748  // and cost 1 vcvt for each lane, so long as the instruction is available.
749  // If not it will become a series of function calls.
750  const InstructionCost CallCost =
751  getCallInstrCost(nullptr, Dst, {Src}, CostKind);
752  int Lanes = 1;
753  if (SrcTy.isFixedLengthVector())
754  Lanes = SrcTy.getVectorNumElements();
755 
756  if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
757  return Lanes;
758  else
759  return Lanes * CallCost;
760  }
761 
762  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
763  SrcTy.isFixedLengthVector()) {
764  // Treat a truncate with larger than legal source (128bits for MVE) as
765  // expensive, 2 instructions per lane.
766  if ((SrcTy.getScalarType() == MVT::i8 ||
767  SrcTy.getScalarType() == MVT::i16 ||
768  SrcTy.getScalarType() == MVT::i32) &&
769  SrcTy.getSizeInBits() > 128 &&
770  SrcTy.getSizeInBits() > DstTy.getSizeInBits())
771  return SrcTy.getVectorNumElements() * 2;
772  }
773 
774  // Scalar integer conversion costs.
775  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
776  // i16 -> i64 requires two dependent operations.
778 
779  // Truncates on i64 are assumed to be free.
782  { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
784  };
785 
786  if (SrcTy.isInteger()) {
787  if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
788  DstTy.getSimpleVT(),
789  SrcTy.getSimpleVT()))
790  return AdjustCost(Entry->Cost);
791  }
792 
793  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
794  ? ST->getMVEVectorCostFactor(CostKind)
795  : 1;
796  return AdjustCost(
797  BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
798 }
799 
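// Cost of inserting or extracting a single vector element, accounting for
// slow D-subregister inserts on Swift, cross-class GPR<->NEON copies, and the
// higher cost of integer lane moves on MVE.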
800 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
801  unsigned Index) {
802  // Penalize inserting into a D-subregister. We end up with a three times
803  // lower estimated throughput on Swift.
804  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
805  ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
806  return 3;
807 
808  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
809  Opcode == Instruction::ExtractElement)) {
810  // Cross-class copies are expensive on many microarchitectures,
811  // so assume they are expensive by default.
812  if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
813  return 3;
814 
815  // Even if it's not a cross class copy, this likely leads to mixing
816  // of NEON and VFP code and should be therefore penalized.
817  if (ValTy->isVectorTy() &&
818  ValTy->getScalarSizeInBits() <= 32)
819  return std::max<InstructionCost>(
820  BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
821  }
822 
823  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
824  Opcode == Instruction::ExtractElement)) {
825  // Integer cross-lane moves are more expensive than float, which can
826  // sometimes just be vmovs. Integers involve being passed to GPR registers,
827  // causing more of a delay.
828  std::pair<unsigned, MVT> LT =
829  getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
830  return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
831  }
832 
833  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
834 }
835 
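// Cost of compare and select instructions. Vector min/max/abs patterns are
// costed as the corresponding intrinsic, NEON vector selects use the vbsl
// cost table, and MVE compares are scaled by the vector cost factor.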
836 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
837  Type *CondTy,
838  CmpInst::Predicate VecPred,
839  TTI::TargetCostKind CostKind,
840  const Instruction *I) {
841  int ISD = TLI->InstructionOpcodeToISD(Opcode);
842 
843  // Thumb scalar code size cost for select.
844  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
845  ST->isThumb() && !ValTy->isVectorTy()) {
846  // Assume expensive structs.
847  if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
848  return TTI::TCC_Expensive;
849 
850  // Select costs can vary because they:
851  // - may require one or more conditional mov (including an IT),
852  // - can't operate directly on immediates,
853  // - require live flags, which we can't copy around easily.
854  int Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
855 
856  // Possible IT instruction for Thumb2, or more for Thumb1.
857  ++Cost;
858 
859  // i1 values may need rematerialising by using mov immediates and/or
860  // flag setting instructions.
861  if (ValTy->isIntegerTy(1))
862  ++Cost;
863 
864  return Cost;
865  }
866 
867  // If this is a vector min/max/abs, use the cost of that intrinsic directly
868  // instead. Hopefully when min/max intrinsics are more prevalent this code
869  // will not be needed.
870  const Instruction *Sel = I;
871  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
872  Sel->hasOneUse())
873  Sel = cast<Instruction>(Sel->user_back());
874  if (Sel && ValTy->isVectorTy() &&
875  (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
876  const Value *LHS, *RHS;
877  SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
878  unsigned IID = 0;
879  switch (SPF) {
880  case SPF_ABS:
881  IID = Intrinsic::abs;
882  break;
883  case SPF_SMIN:
884  IID = Intrinsic::smin;
885  break;
886  case SPF_SMAX:
887  IID = Intrinsic::smax;
888  break;
889  case SPF_UMIN:
890  IID = Intrinsic::umin;
891  break;
892  case SPF_UMAX:
893  IID = Intrinsic::umax;
894  break;
895  case SPF_FMINNUM:
896  IID = Intrinsic::minnum;
897  break;
898  case SPF_FMAXNUM:
899  IID = Intrinsic::maxnum;
900  break;
901  default:
902  break;
903  }
904  if (IID) {
905  // The ICmp is free, the select gets the cost of the min/max/etc
906  if (Sel != I)
907  return 0;
908  IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
909  return getIntrinsicInstrCost(CostAttrs, CostKind);
910  }
911  }
912 
913  // On NEON a vector select gets lowered to vbsl.
914  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
915  // Lowering of some vector selects is currently far from perfect.
916  static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
917  { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
918  { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
920  };
921 
922  EVT SelCondTy = TLI->getValueType(DL, CondTy);
923  EVT SelValTy = TLI->getValueType(DL, ValTy);
924  if (SelCondTy.isSimple() && SelValTy.isSimple()) {
925  if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
926  SelCondTy.getSimpleVT(),
927  SelValTy.getSimpleVT()))
928  return Entry->Cost;
929  }
930 
931  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
932  return LT.first;
933  }
934 
935  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
936  (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
937  cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
938  FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
939  FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
940  if (!VecCondTy)
941  VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
942 
943  // If we don't have mve.fp any fp operations will need to be scalarized.
944  if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
945  // One scalarization insert, one scalarization extract and the cost of the
946  // fcmps.
947  return BaseT::getScalarizationOverhead(VecValTy, false, true) +
948  BaseT::getScalarizationOverhead(VecCondTy, true, false) +
949  VecValTy->getNumElements() *
950  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
951  VecCondTy->getScalarType(), VecPred, CostKind,
952  I);
953  }
954 
955  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
956  int BaseCost = ST->getMVEVectorCostFactor(CostKind);
957  // There are two types - the input that specifies the type of the compare
958  // and the output vXi1 type. Because we don't know how the output will be
959  // split, we may need an expensive shuffle to get two in sync. This has the
960  // effect of making larger than legal compares (v8i32 for example)
961  // expensive.
962  if (LT.second.getVectorNumElements() > 2) {
963  if (LT.first > 1)
964  return LT.first * BaseCost +
965  BaseT::getScalarizationOverhead(VecCondTy, true, false);
966  return BaseCost;
967  }
968  }
969 
970  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
971  // for "multiple beats" potentially needed by MVE instructions.
972  int BaseCost = 1;
973  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
974  BaseCost = ST->getMVEVectorCostFactor(CostKind);
975 
976  return BaseCost *
977  BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
978 }
979 
980 int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
981  const SCEV *Ptr) {
982  // Address computations in vectorized code with non-consecutive addresses will
983  // likely result in more instructions compared to scalar code where the
984  // computation can more often be merged into the index mode. The resulting
985  // extra micro-ops can significantly decrease throughput.
986  unsigned NumVectorInstToHideOverhead = 10;
987  int MaxMergeDistance = 64;
988 
989  if (ST->hasNEON()) {
990  if (Ty->isVectorTy() && SE &&
991  !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
992  return NumVectorInstToHideOverhead;
993 
994  // In many cases the address computation is not merged into the instruction
995  // addressing mode.
996  return 1;
997  }
998  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
999 }
1000 
1001 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1002  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1003  // If a VCTP is part of a chain, it's already profitable and shouldn't be
1004  // optimized, else LSR may block tail-predication.
1005  switch (II->getIntrinsicID()) {
1006  case Intrinsic::arm_mve_vctp8:
1007  case Intrinsic::arm_mve_vctp16:
1008  case Intrinsic::arm_mve_vctp32:
1009  case Intrinsic::arm_mve_vctp64:
1010  return true;
1011  default:
1012  break;
1013  }
1014  }
1015  return false;
1016 }
1017 
1018 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1019  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1020  return false;
1021 
1022  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1023  // Don't support v2i1 yet.
1024  if (VecTy->getNumElements() == 2)
1025  return false;
1026 
1027  // We don't support extending fp types.
1028  unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1029  if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1030  return false;
1031  }
1032 
1033  unsigned EltWidth = DataTy->getScalarSizeInBits();
1034  return (EltWidth == 32 && Alignment >= 4) ||
1035  (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1036 }
1037 
1038 bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1039  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1040  return false;
1041 
1042  // This method is called in 2 places:
1043  // - from the vectorizer with a scalar type, in which case we need to get
1044  // this as good as we can with the limited info we have (and rely on the cost
1045  // model for the rest).
1046  // - from the masked intrinsic lowering pass with the actual vector type.
1047  // For MVE, we have a custom lowering pass that will already have custom
1048  // legalised any gathers that we can to MVE intrinsics, and want to expand all
1049  // the rest. The pass runs before the masked intrinsic lowering pass, so if we
1050  // are here, we know we want to expand.
1051  if (isa<VectorType>(Ty))
1052  return false;
1053 
1054  unsigned EltWidth = Ty->getScalarSizeInBits();
1055  return ((EltWidth == 32 && Alignment >= 4) ||
1056  (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1057 }
1058 
1059 /// Given a memcpy/memset/memmove instruction, return the number of memory
1060 /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1061 /// call is used.
1062 int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1063  MemOp MOp;
1064  unsigned DstAddrSpace = ~0u;
1065  unsigned SrcAddrSpace = ~0u;
1066  const Function *F = I->getParent()->getParent();
1067 
1068  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1069  ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1070  // If 'size' is not a constant, a library call will be generated.
1071  if (!C)
1072  return -1;
1073 
1074  const unsigned Size = C->getValue().getZExtValue();
1075  const Align DstAlign = *MC->getDestAlign();
1076  const Align SrcAlign = *MC->getSourceAlign();
1077 
1078  MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1079  /*IsVolatile*/ false);
1080  DstAddrSpace = MC->getDestAddressSpace();
1081  SrcAddrSpace = MC->getSourceAddressSpace();
1082  }
1083  else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1084  ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1085  // If 'size' is not a constant, a library call will be generated.
1086  if (!C)
1087  return -1;
1088 
1089  const unsigned Size = C->getValue().getZExtValue();
1090  const Align DstAlign = *MS->getDestAlign();
1091 
1092  MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1093  /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1094  DstAddrSpace = MS->getDestAddressSpace();
1095  }
1096  else
1097  llvm_unreachable("Expected a memcpy/move or memset!");
1098 
1099  unsigned Limit, Factor = 2;
1100  switch(I->getIntrinsicID()) {
1101  case Intrinsic::memcpy:
1102  Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1103  break;
1104  case Intrinsic::memmove:
1105  Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1106  break;
1107  case Intrinsic::memset:
1108  Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1109  Factor = 1;
1110  break;
1111  default:
1112  llvm_unreachable("Expected a memcpy/move or memset!");
1113  }
1114 
1115  // MemOps will be populated with a list of data types that need to be
1116  // loaded and stored. That's why we multiply the number of elements by 2 to
1117  // get the cost for this memcpy.
1118  std::vector<EVT> MemOps;
1119  if (getTLI()->findOptimalMemOpLowering(
1120  MemOps, Limit, MOp, DstAddrSpace,
1121  SrcAddrSpace, F->getAttributes()))
1122  return MemOps.size() * Factor;
1123 
1124  // If we can't find an optimal memop lowering, return the default cost
1125  return -1;
1126 }
1127 
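// Cost of a memcpy/memmove/memset: the number of load/store operations if the
// call can be expanded inline, otherwise a nominal cost of 4 for the library
// call (1 for the call plus 3 for argument setup).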
1128 InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1129  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1130 
1131  // To model the cost of a library call, we assume 1 for the call, and
1132  // 3 for the argument setup.
1133  if (NumOps == -1)
1134  return 4;
1135  return NumOps;
1136 }
1137 
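// Shuffle costs. Broadcast, reverse and select shuffles have dedicated
// NEON/MVE cost tables, and VREV-style masks are cheap on MVE. Everything
// else falls back to the base implementation, scaled by the MVE cost factor
// for vector types.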
1138 InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1139  VectorType *Tp, ArrayRef<int> Mask,
1140  int Index, VectorType *SubTp) {
1141  if (ST->hasNEON()) {
1142  if (Kind == TTI::SK_Broadcast) {
1143  static const CostTblEntry NEONDupTbl[] = {
1144  // VDUP handles these cases.
1151 
1156 
1157  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1158 
1159  if (const auto *Entry =
1160  CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1161  return LT.first * Entry->Cost;
1162  }
1163  if (Kind == TTI::SK_Reverse) {
1164  static const CostTblEntry NEONShuffleTbl[] = {
1165  // Reverse shuffle cost one instruction if we are shuffling within a
1166  // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1173 
1178 
1179  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1180 
1181  if (const auto *Entry =
1182  CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1183  return LT.first * Entry->Cost;
1184  }
1185  if (Kind == TTI::SK_Select) {
1186  static const CostTblEntry NEONSelShuffleTbl[] = {
1187  // Select shuffle cost table for ARM. Cost is the number of
1188  // instructions
1189  // required to create the shuffled vector.
1190 
1195 
1199 
1201 
1203 
1204  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1205  if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1206  ISD::VECTOR_SHUFFLE, LT.second))
1207  return LT.first * Entry->Cost;
1208  }
1209  }
1210  if (ST->hasMVEIntegerOps()) {
1211  if (Kind == TTI::SK_Broadcast) {
1212  static const CostTblEntry MVEDupTbl[] = {
1213  // VDUP handles these cases.
1219 
1220  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1221 
1222  if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1223  LT.second))
1224  return LT.first * Entry->Cost *
1226  }
1227 
1228  if (!Mask.empty()) {
1229  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1230  if (Mask.size() <= LT.second.getVectorNumElements() &&
1231  (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1232  isVREVMask(Mask, LT.second, 64)))
1233  return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1234  }
1235  }
1236 
1237  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1238  ? ST->getMVEVectorCostFactor(CostKind)
1239  : 1;
1240  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1241 }
1242 
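// Cost of arithmetic instructions, including the expensive NEON vector
// division table, shifts that fold for free into a following ALU operation,
// and the "multiple beats" scaling applied to MVE vector operations.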
1243 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1244  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1245  TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1246  TTI::OperandValueProperties Opd1PropInfo,
1247  TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1248  const Instruction *CxtI) {
1249  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1250  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1251  // Make operations on i1 relatively expensive as this often involves
1252  // combining predicates. AND and XOR should be easier to handle with IT
1253  // blocks.
1254  switch (ISDOpcode) {
1255  default:
1256  break;
1257  case ISD::AND:
1258  case ISD::XOR:
1259  return 2;
1260  case ISD::OR:
1261  return 3;
1262  }
1263  }
1264 
1265  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1266 
1267  if (ST->hasNEON()) {
1268  const unsigned FunctionCallDivCost = 20;
1269  const unsigned ReciprocalDivCost = 10;
1270  static const CostTblEntry CostTbl[] = {
1271  // Division.
1272  // These costs are somewhat random. Choose a cost of 20 to indicate that
1273  // vectorizing division (added function call) is going to be very expensive.
1274  // Double registers types.
1275  { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1276  { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1277  { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1278  { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1279  { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1280  { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1281  { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1282  { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1283  { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1284  { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1285  { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1286  { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1287  { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1288  { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1289  { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1290  { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1291  // Quad register types.
1292  { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1293  { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1294  { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1295  { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1296  { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1297  { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1298  { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1299  { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1300  { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1301  { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1302  { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1303  { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1304  { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1305  { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1306  { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1307  { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1308  // Multiplication.
1309  };
1310 
1311  if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1312  return LT.first * Entry->Cost;
1313 
1315  Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1316 
1317  // This is somewhat of a hack. The problem that we are facing is that SROA
1318  // creates a sequence of shift, and, or instructions to construct values.
1319  // These sequences are recognized by the ISel and have zero-cost. Not so for
1320  // the vectorized code. Because we have support for v2i64 but not i64 those
1321  // sequences look particularly beneficial to vectorize.
1322  // To work around this we increase the cost of v2i64 operations to make them
1323  // seem less beneficial.
1324  if (LT.second == MVT::v2i64 &&
1325  Ty->getScalarSizeInBits() == Ty->getPrimitiveSizeInBits().getFixedSize())
1326  Cost += 4;
1327 
1328  return Cost;
1329  }
1330 
1331  // If this operation is a shift on arm/thumb2, it might well be folded into
1332  // the following instruction, hence having a cost of 0.
1333  auto LooksLikeAFreeShift = [&]() {
1334  if (ST->isThumb1Only() || Ty->isVectorTy())
1335  return false;
1336 
1337  if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1338  return false;
1339  if (Opd2Info != TargetTransformInfo::OK_UniformConstantValue)
1340  return false;
1341 
1342  // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1343  switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1344  case Instruction::Add:
1345  case Instruction::Sub:
1346  case Instruction::And:
1347  case Instruction::Xor:
1348  case Instruction::Or:
1349  case Instruction::ICmp:
1350  return true;
1351  default:
1352  return false;
1353  }
1354  };
1355  if (LooksLikeAFreeShift())
1356  return 0;
1357 
1358  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1359  // for "multiple beats" potentially needed by MVE instructions.
1360  int BaseCost = 1;
1361  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1362  BaseCost = ST->getMVEVectorCostFactor(CostKind);
1363 
1364  // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1365  // without treating floats as more expensive than scalars or increasing the
1366  // costs for custom operations. The result is also multiplied by the
1367  // MVEVectorCostFactor where appropriate.
1368  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1369  return LT.first * BaseCost;
1370 
1371  // Else this is expand, assume that we need to scalarize this op.
1372  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1373  unsigned Num = VTy->getNumElements();
1374  InstructionCost Cost =
1375  getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1376  // Return the cost of multiple scalar invocations plus the cost of
1377  // inserting and extracting the values.
1378  SmallVector<Type *> Tys(Args.size(), Ty);
1379  return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1380  }
1381 
1382  return BaseCost;
1383 }
1384 
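// Cost of scalar and vector loads/stores, penalising NEON double vectors that
// are not 16-byte aligned and recognising the cheap MVE extending/truncating
// half<->float load/store patterns.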
1385 InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1386  MaybeAlign Alignment,
1387  unsigned AddressSpace,
1388  TTI::TargetCostKind CostKind,
1389  const Instruction *I) {
1390  // TODO: Handle other cost kinds.
1391  if (CostKind != TTI::TCK_RecipThroughput)
1392  return 1;
1393 
1394  // Type legalization can't handle structs
1395  if (TLI->getValueType(DL, Src, true) == MVT::Other)
1396  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1397  CostKind);
1398 
1399  if (ST->hasNEON() && Src->isVectorTy() &&
1400  (Alignment && *Alignment != Align(16)) &&
1401  cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1402  // Unaligned loads/stores are extremely inefficient.
1403  // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
1404  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1405  return LT.first * 4;
1406  }
1407 
1408  // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1409  // Same for stores.
1410  if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1411  ((Opcode == Instruction::Load && I->hasOneUse() &&
1412  isa<FPExtInst>(*I->user_begin())) ||
1413  (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1414  FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1415  Type *DstTy =
1416  Opcode == Instruction::Load
1417  ? (*I->user_begin())->getType()
1418  : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1419  if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1420  DstTy->getScalarType()->isFloatTy())
1421  return ST->getMVEVectorCostFactor(CostKind);
1422  }
1423 
1424  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1425  ? ST->getMVEVectorCostFactor(CostKind)
1426  : 1;
1427  return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1428  CostKind, I);
1429 }
1430 
1431 InstructionCost
1432 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1433  unsigned AddressSpace,
1434  TTI::TargetCostKind CostKind) {
1435  if (ST->hasMVEIntegerOps()) {
1436  if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1437  return ST->getMVEVectorCostFactor(CostKind);
1438  if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1439  return ST->getMVEVectorCostFactor(CostKind);
1440  }
1441  if (!isa<FixedVectorType>(Src))
1442  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1443  CostKind);
1444  // Scalar cost, which is currently very high due to the efficiency of the
1445  // generated code.
1446  return cast<FixedVectorType>(Src)->getNumElements() * 8;
1447 }
1448 
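// Cost of interleaved (vldN/vstN) memory operations for legal interleave
// factors and element types, including the cheap vmovn/vrev de-interleaving
// of small MVE vectors; otherwise defer to the base implementation.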
1449 InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1450  unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1451  Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1452  bool UseMaskForCond, bool UseMaskForGaps) {
1453  assert(Factor >= 2 && "Invalid interleave factor");
1454  assert(isa<VectorType>(VecTy) && "Expect a vector type");
1455 
1456  // vldN/vstN doesn't support vector types of i64/f64 element.
1457  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1458 
1459  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1460  !UseMaskForCond && !UseMaskForGaps) {
1461  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1462  auto *SubVecTy =
1463  FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1464 
1465  // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1466  // Accesses having vector types that are a multiple of 128 bits can be
1467  // matched to more than one vldN/vstN instruction.
1468  int BaseCost =
1469  ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1470  if (NumElts % Factor == 0 &&
1471  TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1472  return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1473 
1474  // Some smaller than legal interleaved patterns are cheap as we can make
1475  // use of the vmovn or vrev patterns to interleave a standard load. This is
1476  // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1477  // promoted differently). The cost of 2 here is then a load and vrev or
1478  // vmovn.
1479  if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1480  VecTy->isIntOrIntVectorTy() &&
1481  DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1482  return 2 * BaseCost;
1483  }
1484 
1485  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1486  Alignment, AddressSpace, CostKind,
1487  UseMaskForCond, UseMaskForGaps);
1488 }
1489 
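// Cost of masked gathers/scatters. MVE gathers are modelled as effectively
// serialised loads, so the vector cost scales with the number of elements;
// patterns that cannot map to a single MVE gather/scatter get the (much
// higher) scalarisation cost.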
1490 InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1491  unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1492  Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1493  using namespace PatternMatch;
1494  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1495  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1496  Alignment, CostKind, I);
1497 
1498  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1499  auto *VTy = cast<FixedVectorType>(DataTy);
1500 
1501  // TODO: Splitting, once we do that.
1502 
1503  unsigned NumElems = VTy->getNumElements();
1504  unsigned EltSize = VTy->getScalarSizeInBits();
1505  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1506 
1507  // For now, it is assumed that for the MVE gather instructions the loads are
1508  // all effectively serialised. This means the cost is the scalar cost
1509  // multiplied by the number of elements being loaded. This is possibly very
1510  // conservative, but even so we still end up vectorising loops because the
1511  // cost per iteration for many loops is lower than for scalar loops.
1512  unsigned VectorCost =
1513  NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1514  // The scalarization cost should be a lot higher. We use the number of vector
1515  // elements plus the scalarization overhead.
1516  unsigned ScalarCost = NumElems * LT.first +
1517  BaseT::getScalarizationOverhead(VTy, true, false) +
1518  BaseT::getScalarizationOverhead(VTy, false, true);
1519 
1520  if (EltSize < 8 || Alignment < EltSize / 8)
1521  return ScalarCost;
1522 
1523  unsigned ExtSize = EltSize;
1524  // Check whether there's a single user that asks for an extended type
1525  if (I != nullptr) {
1526  // Depending on the caller of this function, a gather instruction will
1527  // either have opcode Instruction::Load or be a call to the masked_gather
1528  // intrinsic
1529  if ((I->getOpcode() == Instruction::Load ||
1530  match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1531  I->hasOneUse()) {
1532  const User *Us = *I->users().begin();
1533  if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1534  // only allow valid type combinations
1535  unsigned TypeSize =
1536  cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1537  if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1538  (TypeSize == 16 && EltSize == 8)) &&
1539  TypeSize * NumElems == 128) {
1540  ExtSize = TypeSize;
1541  }
1542  }
1543  }
1544  // Check whether the input data needs to be truncated
1545  TruncInst *T;
1546  if ((I->getOpcode() == Instruction::Store ||
1547  match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1548  (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1549  // Only allow valid type combinations
1550  unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1551  if (((EltSize == 16 && TypeSize == 32) ||
1552  (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1553  TypeSize * NumElems == 128)
1554  ExtSize = TypeSize;
1555  }
1556  }
1557 
1558  if (ExtSize * NumElems != 128 || NumElems < 4)
1559  return ScalarCost;
1560 
1561  // Any (aligned) i32 gather will not need to be scalarised.
1562  if (ExtSize == 32)
1563  return VectorCost;
1564  // For smaller types, we need to ensure that the gep's inputs are correctly
1565  // extended from a small enough value. Other sizes (including i64) are
1566  // scalarized for now.
1567  if (ExtSize != 8 && ExtSize != 16)
1568  return ScalarCost;
1569 
1570  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1571  Ptr = BC->getOperand(0);
1572  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1573  if (GEP->getNumOperands() != 2)
1574  return ScalarCost;
1575  unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1576  // Scale needs to be correct (which is only relevant for i16s).
1577  if (Scale != 1 && Scale * 8 != ExtSize)
1578  return ScalarCost;
1579  // And we need to zext (not sext) the indexes from a small enough type.
1580  if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1581  if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1582  return VectorCost;
1583  }
1584  return ScalarCost;
1585  }
1586  return ScalarCost;
1587 }
1588 
1589 InstructionCost
1590 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1591  bool IsPairwiseForm,
1592  TTI::TargetCostKind CostKind) {
1593  EVT ValVT = TLI->getValueType(DL, ValTy);
1594  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1595  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1596  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
1597  CostKind);
1598 
1599  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1600 
1601  static const CostTblEntry CostTblAdd[]{
1602  {ISD::ADD, MVT::v16i8, 1},
1603  {ISD::ADD, MVT::v8i16, 1},
1604  {ISD::ADD, MVT::v4i32, 1},
1605  };
1606  if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1607  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1608 
1609  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
1610  CostKind);
1611 }
1612 
1613 InstructionCost
1614 ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1615  Type *ResTy, VectorType *ValTy,
1616  TTI::TargetCostKind CostKind) {
1617  EVT ValVT = TLI->getValueType(DL, ValTy);
1618  EVT ResVT = TLI->getValueType(DL, ResTy);
1619  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1620  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1621  if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
1622  (LT.second == MVT::v8i16 &&
1623  ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
1624  (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
1625  return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1626  }
1627 
1628  return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1629  CostKind);
1630 }
1631 
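// Costs for a selection of intrinsics that MVE handles cheaply
// (active_lane_mask, saturating add/sub, min/max/abs, minnum/maxnum);
// everything else goes to the base implementation.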
1632 InstructionCost
1633 ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1634  TTI::TargetCostKind CostKind) {
1635  switch (ICA.getID()) {
1636  case Intrinsic::get_active_lane_mask:
1637  // Currently we make a somewhat optimistic assumption that
1638  // active_lane_mask's are always free. In reality it may be freely folded
1639  // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1640  // of add/icmp code. We may need to improve this in the future, but being
1641  // able to detect if it is free or not involves looking at a lot of other
1642  // code. We currently assume that the vectorizer inserted these, and knew
1643  // what it was doing in adding one.
1644  if (ST->hasMVEIntegerOps())
1645  return 0;
1646  break;
1647  case Intrinsic::sadd_sat:
1648  case Intrinsic::ssub_sat:
1649  case Intrinsic::uadd_sat:
1650  case Intrinsic::usub_sat: {
1651  if (!ST->hasMVEIntegerOps())
1652  break;
1653  Type *VT = ICA.getReturnType();
1654 
1655  std::pair<int, MVT> LT =
1656  TLI->getTypeLegalizationCost(DL, VT);
1657  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1658  LT.second == MVT::v16i8) {
1659  // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1660  // need to extend the type, as it uses shr(qadd(shl, shl)).
1661  unsigned Instrs =
1662  LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1663  return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1664  }
1665  break;
1666  }
1667  case Intrinsic::abs:
1668  case Intrinsic::smin:
1669  case Intrinsic::smax:
1670  case Intrinsic::umin:
1671  case Intrinsic::umax: {
1672  if (!ST->hasMVEIntegerOps())
1673  break;
1674  Type *VT = ICA.getReturnType();
1675 
1676  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1677  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1678  LT.second == MVT::v16i8)
1679  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1680  break;
1681  }
1682  case Intrinsic::minnum:
1683  case Intrinsic::maxnum: {
1684  if (!ST->hasMVEFloatOps())
1685  break;
1686  Type *VT = ICA.getReturnType();
1687  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1688  if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1689  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1690  break;
1691  }
1692  }
1693 
1694  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1695 }
1696 
1697 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1698  if (!F->isIntrinsic())
1699  return BaseT::isLoweredToCall(F);
1700 
1701  // Assume all Arm-specific intrinsics map to an instruction.
1702  if (F->getName().startswith("llvm.arm"))
1703  return false;
1704 
1705  switch (F->getIntrinsicID()) {
1706  default: break;
1707  case Intrinsic::powi:
1708  case Intrinsic::sin:
1709  case Intrinsic::cos:
1710  case Intrinsic::pow:
1711  case Intrinsic::log:
1712  case Intrinsic::log10:
1713  case Intrinsic::log2:
1714  case Intrinsic::exp:
1715  case Intrinsic::exp2:
1716  return true;
1717  case Intrinsic::sqrt:
1718  case Intrinsic::fabs:
1719  case Intrinsic::copysign:
1720  case Intrinsic::floor:
1721  case Intrinsic::ceil:
1722  case Intrinsic::trunc:
1723  case Intrinsic::rint:
1724  case Intrinsic::nearbyint:
1725  case Intrinsic::round:
1726  case Intrinsic::canonicalize:
1727  case Intrinsic::lround:
1728  case Intrinsic::llround:
1729  case Intrinsic::lrint:
1730  case Intrinsic::llrint:
1731  if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1732  return true;
1733  if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1734  return true;
1735  // Some operations can be handled by vector instructions and assume
1736  // unsupported vectors will be expanded into supported scalar ones.
1737  // TODO Handle scalar operations properly.
1738  return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1739  case Intrinsic::masked_store:
1740  case Intrinsic::masked_load:
1741  case Intrinsic::masked_gather:
1742  case Intrinsic::masked_scatter:
1743  return !ST->hasMVEIntegerOps();
1744  case Intrinsic::sadd_with_overflow:
1745  case Intrinsic::uadd_with_overflow:
1746  case Intrinsic::ssub_with_overflow:
1747  case Intrinsic::usub_with_overflow:
1748  case Intrinsic::sadd_sat:
1749  case Intrinsic::uadd_sat:
1750  case Intrinsic::ssub_sat:
1751  case Intrinsic::usub_sat:
1752  return false;
1753  }
1754 
1755  return BaseT::isLoweredToCall(F);
1756 }
1757 
1758 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1759  unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1760  EVT VT = TLI->getValueType(DL, I.getType(), true);
1761  if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1762  return true;
1763 
1764  // Check if an intrinsic will be lowered to a call and assume that any
1765  // other CallInst will generate a bl.
1766  if (auto *Call = dyn_cast<CallInst>(&I)) {
1767  if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1768  switch(II->getIntrinsicID()) {
1769  case Intrinsic::memcpy:
1770  case Intrinsic::memset:
1771  case Intrinsic::memmove:
1772  return getNumMemOps(II) == -1;
1773  default:
1774  if (const Function *F = Call->getCalledFunction())
1775  return isLoweredToCall(F);
1776  }
1777  }
1778  return true;
1779  }
1780 
1781  // FPv5 provides conversions between integer, double-precision,
1782  // single-precision, and half-precision formats.
1783  switch (I.getOpcode()) {
1784  default:
1785  break;
1786  case Instruction::FPToSI:
1787  case Instruction::FPToUI:
1788  case Instruction::SIToFP:
1789  case Instruction::UIToFP:
1790  case Instruction::FPTrunc:
1791  case Instruction::FPExt:
1792  return !ST->hasFPARMv8Base();
1793  }
1794 
1795  // FIXME: Unfortunately the approach of checking the Operation Action does
1796  // not catch all cases of Legalization that use library calls. Our
1797  // Legalization step categorizes some transformations into library calls as
1798  // Custom, Expand or even Legal when doing type legalization. So for now
1799  // we have to special case for instance the SDIV of 64bit integers and the
1800  // use of floating point emulation.
1801  if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1802  switch (ISD) {
1803  default:
1804  break;
1805  case ISD::SDIV:
1806  case ISD::UDIV:
1807  case ISD::SREM:
1808  case ISD::UREM:
1809  case ISD::SDIVREM:
1810  case ISD::UDIVREM:
1811  return true;
1812  }
1813  }
1814 
1815  // Assume all other non-float operations are supported.
1816  if (!VT.isFloatingPoint())
1817  return false;
1818 
1819  // We'll need a library call to handle most floats when using soft.
1820  if (TLI->useSoftFloat()) {
1821  switch (I.getOpcode()) {
1822  default:
1823  return true;
1824  case Instruction::Alloca:
1825  case Instruction::Load:
1826  case Instruction::Store:
1827  case Instruction::Select:
1828  case Instruction::PHI:
1829  return false;
1830  }
1831  }
1832 
1833  // We'll need a libcall to perform double precision operations on a single
1834  // precision only FPU.
1835  if (I.getType()->isDoubleTy() && !ST->hasFP64())
1836  return true;
1837 
1838  // Likewise for half precision arithmetic.
1839  if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1840  return true;
1841 
1842  return false;
1843 }
1844 
1845 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1846  AssumptionCache &AC,
1847  TargetLibraryInfo *LibInfo,
1848  HardwareLoopInfo &HWLoopInfo) {
1849  // Low-overhead branches are only supported in the 'low-overhead branch'
1850  // extension of v8.1-m.
1851  if (!ST->hasLOB() || DisableLowOverheadLoops) {
1852  LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1853  return false;
1854  }
1855 
1856  if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1857  LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1858  return false;
1859  }
1860 
1861  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1862  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1863  LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1864  return false;
1865  }
1866 
1867  const SCEV *TripCountSCEV =
1868  SE.getAddExpr(BackedgeTakenCount,
1869  SE.getOne(BackedgeTakenCount->getType()));
1870 
1871  // We need to store the trip count in LR, a 32-bit register.
1872  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1873  LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1874  return false;
1875  }
1876 
1877  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1878  // point in generating a hardware loop if that's going to happen.
1879 
1880  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1881  if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1882  switch (Call->getIntrinsicID()) {
1883  default:
1884  break;
1885  case Intrinsic::start_loop_iterations:
1886  case Intrinsic::test_start_loop_iterations:
1887  case Intrinsic::loop_decrement:
1888  case Intrinsic::loop_decrement_reg:
1889  return true;
1890  }
1891  }
1892  return false;
1893  };
1894 
1895  // Scan the instructions to see if there's any that we know will turn into a
1896  // call or if this loop is already a low-overhead loop or will become a tail
1897  // predicated loop.
1898  bool IsTailPredLoop = false;
1899  auto ScanLoop = [&](Loop *L) {
1900  for (auto *BB : L->getBlocks()) {
1901  for (auto &I : *BB) {
1902  if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1903  isa<InlineAsm>(I)) {
1904  LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
1905  return false;
1906  }
1907  if (auto *II = dyn_cast<IntrinsicInst>(&I))
1908  IsTailPredLoop |=
1909  II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1910  II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1911  II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1912  II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1913  II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
1914  }
1915  }
1916  return true;
1917  };
1918 
1919  // Visit inner loops.
1920  for (auto Inner : *L)
1921  if (!ScanLoop(Inner))
1922  return false;
1923 
1924  if (!ScanLoop(L))
1925  return false;
1926 
1927  // TODO: Check whether the trip count calculation is expensive. If L is the
1928  // inner loop but we know it has a low trip count, calculating that trip
1929  // count (in the parent loop) may be detrimental.
1930 
1931  LLVMContext &C = L->getHeader()->getContext();
1932  HWLoopInfo.CounterInReg = true;
1933  HWLoopInfo.IsNestingLegal = false;
1934  HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
1935  HWLoopInfo.CountType = Type::getInt32Ty(C);
1936  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
1937  return true;
1938 }
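
A minimal sketch, assuming the usual analysis results are available, of how a transform pass typically consumes the hook above together with HardwareLoopInfo::isHardwareLoopCandidate; the function below is illustrative only and not part of this file.

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"

// Hypothetical caller: decide whether loop L should become a hardware loop.
static bool wantsHardwareLoop(llvm::Loop *L, llvm::ScalarEvolution &SE,
                              llvm::AssumptionCache &AC,
                              llvm::TargetLibraryInfo *TLI, llvm::LoopInfo &LI,
                              llvm::DominatorTree &DT,
                              const llvm::TargetTransformInfo &TTI) {
  llvm::HardwareLoopInfo HWLoopInfo(L);
  // Mirror the gate used elsewhere in this file: the target must consider the
  // loop profitable, and the loop itself must be a structural candidate.
  return TTI.isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo) &&
         HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT);
}
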
1939 
1940 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
1941  // We don't allow icmp's, and because we only look at single block loops,
1942  // we simply count the icmps, i.e. there should only be 1 for the backedge.
1943  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
1944  return false;
1945 
1946  if (isa<FCmpInst>(&I))
1947  return false;
1948 
1949  // We could allow extending/narrowing FP loads/stores, but codegen is
1950  // too inefficient so reject this for now.
1951  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
1952  return false;
1953 
1954  // Extends have to be extending-loads
1955  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
1956  if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
1957  return false;
1958 
1959  // Truncs have to be narrowing-stores
1960  if (isa<TruncInst>(&I) )
1961  if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
1962  return false;
1963 
1964  return true;
1965 }
1966 
1967 // To set up a tail-predicated loop, we need to know the total number of
1968 // elements processed by that loop. Thus, we need to determine the element
1969 // size and:
1970 // 1) it should be uniform for all operations in the vector loop, so we
1971 // e.g. don't want any widening/narrowing operations.
1972 // 2) it should be smaller than i64s because we don't have vector operations
1973 // that work on i64s.
1974 // 3) we don't want elements to be reversed or shuffled, to make sure the
1975 // tail-predication masks/predicates the right lanes.
1976 //
1977 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
1978  const DataLayout &DL,
1979  const LoopAccessInfo *LAI) {
1980  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
1981 
1982  // If there are live-out values, it is probably a reduction. We can predicate
1983  // most reduction operations freely under MVE using a combination of
1984  // prefer-predicated-reduction-select and inloop reductions. We limit this to
1985  // floating point and integer reductions, but don't check for operators
1986  // specifically here. If the value ends up not being a reduction (and so the
1987  // vectorizer cannot tailfold the loop), we should fall back to standard
1988  // vectorization automatically.
1989  SmallVector<Instruction *, 8>
1990  LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
1991  bool ReductionsDisabled =
1992  EnableTailPredication == TailPredication::EnabledNoReductions ||
1993  EnableTailPredication == TailPredication::ForceEnabledNoReductions;
1994 
1995  for (auto *I : LiveOuts) {
1996  if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
1997  !I->getType()->isHalfTy()) {
1998  LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
1999  "live-out value\n");
2000  return false;
2001  }
2002  if (ReductionsDisabled) {
2003  LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2004  return false;
2005  }
2006  }
2007 
2008  // Next, check that all instructions can be tail-predicated.
2009  PredicatedScalarEvolution PSE = LAI->getPSE();
2010  SmallVector<Instruction *, 16> LoadStores;
2011  int ICmpCount = 0;
2012 
2013  for (BasicBlock *BB : L->blocks()) {
2014  for (Instruction &I : BB->instructionsWithoutDebug()) {
2015  if (isa<PHINode>(&I))
2016  continue;
2017  if (!canTailPredicateInstruction(I, ICmpCount)) {
2018  LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2019  return false;
2020  }
2021 
2022  Type *T = I.getType();
2023  if (T->isPointerTy())
2024  T = T->getPointerElementType();
2025 
2026  if (T->getScalarSizeInBits() > 32) {
2027  LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2028  return false;
2029  }
2030  if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2031  Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
2032  int64_t NextStride = getPtrStride(PSE, Ptr, L);
2033  if (NextStride == 1) {
2034  // TODO: for now only allow consecutive strides of 1. We could support
2035  // other strides as long as it is uniform, but let's keep it simple
2036  // for now.
2037  continue;
2038  } else if (NextStride == -1 ||
2039  (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2040  (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2041  LLVM_DEBUG(dbgs()
2042  << "Consecutive strides of 2 found, vld2/vstr2 can't "
2043  "be tail-predicated\n.");
2044  return false;
2045  // TODO: don't tail predicate if there is a reversed load?
2046  } else if (EnableMaskedGatherScatters) {
2047  // Gather/scatters do allow loading from arbitrary strides, at
2048  // least if they are loop invariant.
2049  // TODO: Loop variant strides should in theory work, too, but
2050  // this requires further testing.
2051  const SCEV *PtrScev =
2052  replaceSymbolicStrideSCEV(PSE, ValueToValueMap(), Ptr);
2053  if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2054  const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2055  if (PSE.getSE()->isLoopInvariant(Step, L))
2056  continue;
2057  }
2058  }
2059  LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2060  "tail-predicate\n.");
2061  return false;
2062  }
2063  }
2064  }
2065 
2066  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2067  return true;
2068 }
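
For illustration only (not part of this file), a C-level loop shape that the checks above accept: a single element size no wider than 32 bits, no widening, narrowing, reversing or shuffling, and unit-stride loads and stores.

// All accesses are consecutive (stride 1) 32-bit loads/stores of a uniform
// element size, so canTailPredicateLoop lets the vectorizer tail-fold this.
void add_arrays(int *__restrict a, const int *__restrict b, int n) {
  for (int i = 0; i < n; ++i)
    a[i] += b[i];
}
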
2069 
2070 bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2071  ScalarEvolution &SE,
2072  AssumptionCache &AC,
2073  TargetLibraryInfo *TLI,
2074  DominatorTree *DT,
2075  const LoopAccessInfo *LAI) {
2076  if (!EnableTailPredication) {
2077  LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2078  return false;
2079  }
2080 
2081  // Creating a predicated vector loop is the first step for generating a
2082  // tail-predicated hardware loop, for which we need the MVE masked
2083  // load/stores instructions:
2084  if (!ST->hasMVEIntegerOps())
2085  return false;
2086 
2087  // For now, restrict this to single block loops.
2088  if (L->getNumBlocks() > 1) {
2089  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2090  "loop.\n");
2091  return false;
2092  }
2093 
2094  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2095 
2096  HardwareLoopInfo HWLoopInfo(L);
2097  if (!HWLoopInfo.canAnalyze(*LI)) {
2098  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2099  "analyzable.\n");
2100  return false;
2101  }
2102 
2103  // This checks if we have the low-overhead branch architecture
2104  // extension, and if we will create a hardware-loop:
2105  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2106  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2107  "profitable.\n");
2108  return false;
2109  }
2110 
2111  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2112  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2113  "a candidate.\n");
2114  return false;
2115  }
2116 
2117  return canTailPredicateLoop(L, LI, SE, DL, LAI);
2118 }
2119 
2120 bool ARMTTIImpl::emitGetActiveLaneMask() const {
2121  if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2122  return false;
2123 
2124  // Intrinsic @llvm.get.active.lane.mask is supported.
2125  // It is used in the MVETailPredication pass, which requires the number of
2126  // elements processed by this vector loop to setup the tail-predicated
2127  // loop.
2128  return true;
2129 }
2130 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2131  TTI::UnrollingPreferences &UP) {
2132  // Enable Upper bound unrolling universally, not dependent upon the conditions
2133  // below.
2134  UP.UpperBound = true;
2135 
2136  // Only currently enable these preferences for M-Class cores.
2137  if (!ST->isMClass())
2138  return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
2139 
2140  // Disable loop unrolling for Oz and Os.
2141  UP.OptSizeThreshold = 0;
2142  UP.PartialOptSizeThreshold = 0;
2143  if (L->getHeader()->getParent()->hasOptSize())
2144  return;
2145 
2146  SmallVector<BasicBlock*, 4> ExitingBlocks;
2147  L->getExitingBlocks(ExitingBlocks);
2148  LLVM_DEBUG(dbgs() << "Loop has:\n"
2149  << "Blocks: " << L->getNumBlocks() << "\n"
2150  << "Exit blocks: " << ExitingBlocks.size() << "\n");
2151 
2152  // Only allow another exit other than the latch. This acts as an early exit
2153  // as it mirrors the profitability calculation of the runtime unroller.
2154  if (ExitingBlocks.size() > 2)
2155  return;
2156 
2157  // Limit the CFG of the loop body for targets with a branch predictor.
2158  // Allowing 4 blocks permits if-then-else diamonds in the body.
2159  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2160  return;
2161 
2162  // Don't unroll vectorized loops, including the remainder loop
2163  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2164  return;
2165 
2166  // Scan the loop: don't unroll loops with calls as this could prevent
2167  // inlining.
2168  InstructionCost Cost = 0;
2169  for (auto *BB : L->getBlocks()) {
2170  for (auto &I : *BB) {
2171  // Don't unroll vectorised loop. MVE does not benefit from it as much as
2172  // scalar code.
2173  if (I.getType()->isVectorTy())
2174  return;
2175 
2176  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2177  if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2178  if (!isLoweredToCall(F))
2179  continue;
2180  }
2181  return;
2182  }
2183 
2184  SmallVector<const Value*, 4> Operands(I.operand_values());
2185  Cost +=
2186  getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2187  }
2188  }
2189 
2190  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2191 
2192  UP.Partial = true;
2193  UP.Runtime = true;
2194  UP.UnrollRemainder = true;
2195  UP.DefaultUnrollRuntimeCount = 4;
2196  UP.UnrollAndJam = true;
2197  UP.UnrollAndJamInnerLoopThreshold = 60;
2198 
2199  // Force unrolling small loops can be very useful because of the branch
2200  // taken cost of the backedge.
2201  if (Cost < 12)
2202  UP.Force = true;
2203 }
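
An illustrative (assumed, not from this file) scalar loop of the kind the heuristics above reward on M-class cores: a tiny single-block body with no calls and no vector types, so partial and runtime unrolling are enabled and, with a body cost under 12, unrolling is forced.

// The backedge branch dominates the cost of this small body, which is exactly
// the situation the Force heuristic above is aimed at.
int sum_bytes(const unsigned char *p, int n) {
  int s = 0;
  for (int i = 0; i < n; ++i)
    s += p[i];
  return s;
}
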
2204 
2205 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2206  TTI::PeelingPreferences &PP) {
2207  BaseT::getPeelingPreferences(L, SE, PP);
2208 }
2209 
2210 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2211  TTI::ReductionFlags Flags) const {
2212  if (!ST->hasMVEIntegerOps())
2213  return false;
2214 
2215  unsigned ScalarBits = Ty->getScalarSizeInBits();
2216  switch (Opcode) {
2217  case Instruction::Add:
2218  return ScalarBits <= 64;
2219  default:
2220  return false;
2221  }
2222 }
2223 
2224 bool ARMTTIImpl::preferPredicatedReductionSelect(
2225  unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2226  if (!ST->hasMVEIntegerOps())
2227  return false;
2228  return true;
2229 }
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:233
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
ARMSubtarget.h
llvm::InstructionCost
Definition: InstructionCost.h:26
llvm::EngineKind::Kind
Kind
Definition: ExecutionEngine.h:524
llvm::TargetLoweringBase::getMaxStoresPerMemmove
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
Definition: TargetLowering.h:1596
llvm::TargetTransformInfo::CastContextHint::Masked
@ Masked
The cast is used with a masked load/store.
ValueTypes.h
llvm::TargetTransformInfo::UnrollingPreferences::PartialOptSizeThreshold
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
Definition: TargetTransformInfo.h:452
llvm::ScalarEvolution::hasLoopInvariantBackedgeTakenCount
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
Definition: ScalarEvolution.cpp:12217
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition: TargetTransformInfo.h:848
llvm::BasicTTIImplBase< ARMTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:38
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:480
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:211
llvm::SPF_SMAX
@ SPF_SMAX
Unsigned minimum.
Definition: ValueTracking.h:661
llvm::ISD::VECTOR_SHUFFLE
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:557
llvm::ARM_AM::isThumbImmShiftedVal
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
Definition: ARMAddressingModes.h:235
llvm::MVT::v4f16
@ v4f16
Definition: MachineValueType.h:126
llvm::TargetTransformInfo::TCC_Expensive
@ TCC_Expensive
The cost of a 'div' instruction on x86.
Definition: TargetTransformInfo.h:264
llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:184
llvm
Definition: AllocatorList.h:23
llvm::TargetTransformInfo::ReductionFlags
Flags describing the kind of vector reduction.
Definition: TargetTransformInfo.h:1334
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::SystemZISD::TM
@ TM
Definition: SystemZISelLowering.h:65
llvm::ARMSubtarget::hasMVEFloatOps
bool hasMVEFloatOps() const
Definition: ARMSubtarget.h:616
llvm::HardwareLoopInfo::LoopDecrement
Value * LoopDecrement
Definition: TargetTransformInfo.h:101
llvm::ARMTTIImpl::getAddressComputationCost
int getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr)
Definition: ARMTargetTransformInfo.cpp:980
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:112
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:623
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:447
InstCombiner.h
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:722
llvm::InstCombiner::getDominatorTree
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:369
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
IntrinsicInst.h
ceil
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g ceil
Definition: README-FPStack.txt:54
DisableLowOverheadLoops
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:426
llvm::Function
Definition: Function.h:61
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
llvm::ISD::UDIV
@ UDIV
Definition: ISDOpcodes.h:236
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
llvm::SelectPatternResult::Flavor
SelectPatternFlavor Flavor
Definition: ValueTracking.h:681
llvm::TargetTransformInfoImplCRTPBase< ARMTTIImpl >::getUserCost
InstructionCost getUserCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
Definition: TargetTransformInfoImpl.h:908
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:674
llvm::BasicTTIImplBase< ARMTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:923
llvm::TargetTransformInfo::AMK_PostIndexed
@ AMK_PostIndexed
Definition: TargetTransformInfo.h:635
llvm::ARMTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: ARMTargetTransformInfo.cpp:1138
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:687
MVEMaxSupportedInterleaveFactor
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor
llvm::ARMTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: ARMTargetTransformInfo.cpp:85
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:317
llvm::ARMTTIImpl::getIntImmCodeSizeCost
int getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty)
Definition: ARMTargetTransformInfo.cpp:286
llvm::PredicatedScalarEvolution
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
Definition: ScalarEvolution.h:2138
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:131
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::InstCombiner::Builder
BuilderTy & Builder
Definition: InstCombiner.h:56
llvm::ARMSubtarget::hasFPARMv8Base
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:643
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1643
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:683
llvm::CmpInst::makeCmpResultType
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1034
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::IRBuilder< TargetFolder, IRBuilderCallbackInserter >
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:148
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:443
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJamInnerLoopThreshold
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
Definition: TargetTransformInfo.h:499
Local.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
llvm::TargetTransformInfo::UnrollingPreferences::UnrollRemainder
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
Definition: TargetTransformInfo.h:492
llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:775
llvm::ARMTTIImpl::getIntImmCost
int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:251
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:190
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:214
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:140
llvm::CostTblEntry
Cost Table Entry.
Definition: CostTable.h:24
llvm::ARMSubtarget::hasV6T2Ops
bool hasV6T2Ops() const
Definition: ARMSubtarget.h:602
EnableMaskedLoadStores
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
llvm::MemOp
Definition: TargetLowering.h:108
APInt.h
llvm::ARMTTIImpl::getMemcpyCost
int getMemcpyCost(const Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1128
llvm::ARMTTIImpl::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1490
llvm::HardwareLoopInfo::isHardwareLoopCandidate
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
Definition: TargetTransformInfo.cpp:101
llvm::ARMTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:382
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:476
llvm::findDefsUsedOutsideOfLoop
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:132
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1581
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:529
llvm::tgtok::Bits
@ Bits
Definition: TGLexer.h:50
llvm::Instruction::isShift
bool isShift() const
Definition: Instruction.h:167
llvm::SPF_UMAX
@ SPF_UMAX
Signed maximum.
Definition: ValueTracking.h:662
llvm::Optional
Definition: APInt.h:33
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::ARMSubtarget::hasMVEIntegerOps
bool hasMVEIntegerOps() const
Definition: ARMSubtarget.h:615
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:419
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:410
llvm::CallBase::getNumArgOperands
unsigned getNumArgOperands() const
Definition: InstrTypes.h:1339
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::matchSelectPattern
SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
Definition: ValueTracking.cpp:6084
llvm::SelectPatternFlavor
SelectPatternFlavor
Specific patterns of select instructions we can match.
Definition: ValueTracking.h:657
llvm::MVT::v2f64
@ v2f64
Definition: MachineValueType.h:158
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:527
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::TargetLoweringBase::getOperationAction
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
Definition: TargetLowering.h:1033
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:197
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:185
llvm::APIntOps::umin
const APInt & umin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2185
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1198
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:846
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::ARMTTIImpl::isLegalMaskedLoad
bool isLegalMaskedLoad(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1018
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1330
llvm::ARMTTIImpl::isHardwareLoopProfitable
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: ARMTargetTransformInfo.cpp:1845
KnownBits.h
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::HardwareLoopInfo::IsNestingLegal
bool IsNestingLegal
Definition: TargetTransformInfo.h:103
floor
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g floor
Definition: README-FPStack.txt:54
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:124
MachineValueType.h
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:247
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:205
Instruction.h
llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition: DerivedTypes.h:570
llvm::ARMSubtarget::hasLOB
bool hasLOB() const
Definition: ARMSubtarget.h:651
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:77
llvm::InstCombiner::replaceOperand
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:437
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1251
llvm::MVT::i1
@ i1
Definition: MachineValueType.h:40
llvm::APInt::isNonNegative
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:369
llvm::MVT::v8f16
@ v8f16
Definition: MachineValueType.h:127
llvm::APInt::isNegative
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:364
llvm::ARM_AM::getSOImmVal
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
Definition: ARMAddressingModes.h:163
SubtargetFeature.h
TargetMachine.h
llvm::ScalarEvolution::getOne
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
Definition: ScalarEvolution.h:596
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:669
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:719
EnableTailPredication
cl::opt< TailPredication::Mode > EnableTailPredication
llvm::BasicTTIImplBase< ARMTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:975
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:845
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1079
llvm::User
Definition: User.h:44
llvm::TargetLoweringBase::getMaxStoresPerMemset
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
Definition: TargetLowering.h:1557
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
llvm::getKnownAlignment
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:223
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:488
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:53
llvm::ARMSubtarget::hasBranchPredictor
bool hasBranchPredictor() const
Definition: ARMSubtarget.h:705
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
llvm::EVT::getVectorNumElements
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:301
llvm::getPtrStride
int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of its element size.
Definition: LoopAccessAnalysis.cpp:1017
AllowWLSLoops
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:725
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
llvm::APInt::getLimitedValue
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition: APInt.h:487
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:235
llvm::MVT::v16i1
@ v16i1
Definition: MachineValueType.h:65
llvm::ISD::UDIVREM
@ UDIVREM
Definition: ISDOpcodes.h:249
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:119
llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
Definition: PatternMatch.h:2215
llvm::ARMTargetLowering::useSoftFloat
bool useSoftFloat() const override
Definition: ARMISelLowering.cpp:1559
llvm::MVT::v8i1
@ v8i1
Definition: MachineValueType.h:64
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJam
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
Definition: TargetTransformInfo.h:494
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:145
llvm::EVT::isInteger
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:139
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:147
llvm::HardwareLoopInfo::PerformEntryTest
bool PerformEntryTest
Definition: TargetTransformInfo.h:107
canTailPredicateLoop
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:1977
llvm::ARMTTIImpl::isLoweredToCall
bool isLoweredToCall(const Function *F)
Definition: ARMTargetTransformInfo.cpp:1697
llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:729
llvm::InstCombiner::eraseInstFromFunction
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1631
llvm::TypeConversionCostTblEntry
Type Conversion Cost Table.
Definition: CostTable.h:44
llvm::LoopBase::getExitingBlocks
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
Definition: LoopInfoImpl.h:34
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:885
LoopUtils.h
llvm::HardwareLoopInfo::CounterInReg
bool CounterInReg
Definition: TargetTransformInfo.h:105
llvm::ARMSubtarget::hasV6Ops
bool hasV6Ops() const
Definition: ARMSubtarget.h:599
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:622
llvm::ARMTTIImpl::emitGetActiveLaneMask
bool emitGetActiveLaneMask() const
Definition: ARMTargetTransformInfo.cpp:2120
PatternMatch.h
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:644
llvm::ARMSubtarget::hasFP64
bool hasFP64() const
Definition: ARMSubtarget.h:678
llvm::MVT::v1i64
@ v1i64
Definition: MachineValueType.h:108
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
llvm::isVREVMask
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
Definition: ARMTargetTransformInfo.h:296
llvm::ARM_AM::getT2SOImmVal
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
Definition: ARMAddressingModes.h:320
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::BasicTTIImplBase< ARMTTIImpl >::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:928
llvm::BasicTTIImplBase< ARMTTIImpl >::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:764
llvm::TargetTransformInfoImplBase::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const
Definition: TargetTransformInfoImpl.h:545
llvm::None
const NoneType None
Definition: None.h:23
llvm::ARMTTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: ARMTargetTransformInfo.cpp:1449
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:86
llvm::ARMSubtarget::getMVEVectorCostFactor
unsigned getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const
Definition: ARMSubtarget.h:916
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:96
llvm::MVT::v4i8
@ v4i8
Definition: MachineValueType.h:75
llvm::SPF_SMIN
@ SPF_SMIN
Definition: ValueTracking.h:659
Type.h
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:116
llvm::BasicTTIImplBase< ARMTTIImpl >::getAddressComputationCost
unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
Definition: BasicTTIImpl.h:1899
llvm::APInt::isAllOnesValue
bool isAllOnesValue() const
Determine if all bits are set.
Definition: APInt.h:401
llvm::Instruction::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:277
llvm::maxnum
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1297
LoopInfo.h
llvm::ARMTTIImpl::getNumMemOps
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
Definition: ARMTargetTransformInfo.cpp:1062
llvm::ARMTTIImpl::isProfitableLSRChainElement
bool isProfitableLSRChainElement(Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1001
llvm::ARMTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: ARMTargetTransformInfo.cpp:2130
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:78
llvm::ARMTTIImpl::preferPredicateOverEpilogue
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2070
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool LookThroughBitCast, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:117
llvm::PatternMatch::m_Xor
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1130
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:202
llvm::APInt::getOneBitSet
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:593
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:847
llvm::MVT::v2i8
@ v2i8
Definition: MachineValueType.h:74
llvm::MVT::v4i64
@ v4i64
Definition: MachineValueType.h:110
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:391
llvm::ARMTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1385
llvm::TargetTransformInfo::CastContextHint::Normal
@ Normal
The cast is used with a normal load/store.
BasicBlock.h
llvm::cl::opt< bool >
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:78
llvm::ARMTTIImpl::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1614
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:535
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::MVT::v16i8
@ v16i8
Definition: MachineValueType.h:77
llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:333
llvm::ARMSubtarget::isThumb1Only
bool isThumb1Only() const
Definition: ARMSubtarget.h:802
llvm::ScalarEvolution::getUnsignedRangeMax
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
Definition: ScalarEvolution.h:861
llvm::InstCombiner::getAssumptionCache
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:367
llvm::MVT::v16i16
@ v16i16
Definition: MachineValueType.h:88
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
llvm::MVT::v2i64
@ v2i64
Definition: MachineValueType.h:109
llvm::InstCombiner::getDataLayout
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:370
llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition: ISDOpcodes.h:776
llvm::CostTableLookup
const CostTblEntry * CostTableLookup(ArrayRef< CostTblEntry > Tbl, int ISD, MVT Ty)
Find in cost table, TypeTy must be comparable to CompareTy by ==.
Definition: CostTable.h:31
llvm::MVT::v16f32
@ v16f32
Definition: MachineValueType.h:148
llvm::Instruction::user_back
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:91
llvm::TruncInst
This class represents a truncation of integer types.
Definition: Instructions.h:4691
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, bool IsPairwise, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction operations.
Definition: BasicTTIImpl.h:1939
llvm::TargetTransformInfo::OK_UniformConstantValue
@ OK_UniformConstantValue
Definition: TargetTransformInfo.h:898
llvm::ARMSubtarget::isMClass
bool isMClass() const
Definition: ARMSubtarget.h:805
llvm::ConvertCostTableLookup
const TypeConversionCostTblEntry * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntry > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table, TypeTy must be comparable to CompareTy by ==.
Definition: CostTable.h:54
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::DenseMap< const Value *, Value * >
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:423
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:903
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:440
llvm::ARMTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1633
llvm::LoopAccessInfo
Drive the analysis of memory accesses in the loop.
Definition: LoopAccessAnalysis.h:519
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
llvm::MVT::v4f32
@ v4f32
Definition: MachineValueType.h:145
llvm::MVT::i8
@ i8
Definition: MachineValueType.h:41
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
llvm::MVT::Other
@ Other
Definition: MachineValueType.h:39
llvm::ARMSubtarget::hasSlowLoadDSubregister
bool hasSlowLoadDSubregister() const
Definition: ARMSubtarget.h:692
llvm::SPF_ABS
@ SPF_ABS
Floating point maxnum.
Definition: ValueTracking.h:665
memcpy
<%struct.s * > cast struct s *S to sbyte *< sbyte * > sbyte uint cast struct s *agg result to sbyte *< sbyte * > sbyte uint cast struct s *memtmp to sbyte *< sbyte * > sbyte uint ret void llc ends up issuing two memcpy or custom lower memcpy(of small size) to be ldmia/stmia. I think option 2 is better but the current register allocator cannot allocate a chunk of registers at a time. A feasible temporary solution is to use specific physical registers at the lowering time for small(<
llvm::TargetLoweringBase::getMaxStoresPerMemcpy
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
Definition: TargetLowering.h:1567
llvm::TargetTransformInfoImplBase::isLoweredToCall
bool isLoweredToCall(const Function *F) const
Definition: TargetTransformInfoImpl.h:115
llvm::ARMTTIImpl::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1432
llvm::ARMSubtarget::hasFullFP16
bool hasFullFP16() const
Definition: ARMSubtarget.h:722
llvm::ARMTTIImpl::preferInLoopReduction
bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2210
llvm::IRBuilderBase::CreateIntrinsic
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with args, mangled using Types.
Definition: IRBuilder.cpp:798
llvm::BasicTTIImplBase< ARMTTIImpl >::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:1072
llvm::IRBuilderBase::getTrue
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:458
llvm::PatternMatch::m_Constant
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:142
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:895
isSSATMinMaxPattern
static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
Definition: ARMTargetTransformInfo.cpp:296
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:649
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:70
llvm::ARMTTIImpl::getPreferredAddressingMode
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
Definition: ARMTargetTransformInfo.cpp:104
llvm::APIntOps::smin
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2175
llvm::ArrayRef< int >
llvm::LoopInfo
Definition: LoopInfo.h:1080
llvm::EVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:149
ARMAddressingModes.h
llvm::ARMSubtarget::hasNEON
bool hasNEON() const
Definition: ARMSubtarget.h:644
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
DataLayout.h
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:44
llvm::MVT::v2i32
@ v2i32
Definition: MachineValueType.h:94
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:41
llvm::ARMTTIImpl::isLegalMaskedGather
bool isLegalMaskedGather(Type *Ty, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1038
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:215
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::ISD::SREM
@ SREM
Definition: ISDOpcodes.h:237
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
if
if(llvm_vc STREQUAL "") set(fake_version_inc "$
Definition: CMakeLists.txt:14
llvm::MVT::v2f32
@ v2f32
Definition: MachineValueType.h:143
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:952
llvm::ARMTTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:836
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:459
trunc
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g trunc
Definition: README-FPStack.txt:63
llvm::SPF_FMINNUM
@ SPF_FMINNUM
Unsigned maximum.
Definition: ValueTracking.h:663
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1731
llvm::TargetTransformInfo::AddressingModeKind
AddressingModeKind
Definition: TargetTransformInfo.h:633
llvm::MVT::v4i32
@ v4i32
Definition: MachineValueType.h:96
llvm::BasicTTIImplBase< ARMTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:491
llvm::SPF_FMAXNUM
@ SPF_FMAXNUM
Floating point minnum.
Definition: ValueTracking.h:664
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
canTailPredicateInstruction
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
Definition: ARMTargetTransformInfo.cpp:1940
llvm::MVT::v8i64
@ v8i64
Definition: MachineValueType.h:111
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:624
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:706
llvm::ScalarEvolution::isLoopInvariant
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
Definition: ScalarEvolution.cpp:12485
llvm::BasicTTIImplBase< ARMTTIImpl >::getCallInstrCost
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency)
Compute a cost of the given call instruction.
Definition: BasicTTIImpl.h:1889
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:416
llvm::MVT::v16i32
@ v16i32
Definition: MachineValueType.h:99
llvm::MemOp::Set
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
Definition: TargetLowering.h:141
llvm::ARMTTIImpl::getIntImmCostInst
int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: ARMTargetTransformInfo.cpp:327
llvm::BasicTTIImplBase::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: BasicTTIImpl.h:429
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:163
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:335
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:165
llvm::APIntOps::umax
const APInt & umax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be unsigned.
Definition: APInt.h:2190
llvm::minnum
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE minNum semantics.
Definition: APFloat.h:1286
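A short sketch of the minNum behaviour, including the quiet-NaN case; the values are illustrative:
    #include "llvm/ADT/APFloat.h"
    using namespace llvm;
    APFloat A(1.0f), B(2.0f);
    APFloat M = minnum(A, B);                                  // 1.0
    APFloat N = minnum(APFloat::getQNaN(A.getSemantics()), B); // 2.0: the numeric operand wins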
llvm::MemOp::Copy
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
Definition: TargetLowering.h:126
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:148
CostTable.h
llvm::EVT::getScalarType
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:288
llvm::TargetTransformInfo::UnrollingPreferences::UpperBound
bool UpperBound
Allow using trip count upper bound to unroll loops.
Definition: TargetTransformInfo.h:490
llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:208
llvm::replaceSymbolicStrideSCEV
const SCEV * replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const ValueToValueMap &PtrToStride, Value *Ptr, Value *OrigPtr=nullptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one,...
Definition: LoopAccessAnalysis.cpp:143
llvm::MVT::v8i16
@ v8i16
Definition: MachineValueType.h:87
ISDOpcodes.h
llvm::TypeSize
Definition: TypeSize.h:417
Casting.h
llvm::Value::hasNUses
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:151
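A typical guard built on this predicate, sketched with an illustrative variable name (Inner):
    // Only fold through Inner when it has exactly one use, so the transform
    // does not duplicate work and Inner is expected to become dead afterwards.
    if (Inner->hasNUses(1)) {
      // ... perform the single-use fold ...
    }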
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::MVT::i32
@ i32
Definition: MachineValueType.h:43
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:207
llvm::ISD::SDIV
@ SDIV
Definition: ISDOpcodes.h:235
powi
This is blocked on not handling X*X*X -> powi(X, 3) (see note above). The issue is that we end up getting t
llvm::log2
static double log2(double V)
Definition: AMDGPULibCalls.cpp:841
llvm::InstCombiner::SimplifyDemandedBits
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, unsigned Depth=0)=0
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:184
llvm::MVT::v8i32
@ v8i32
Definition: MachineValueType.h:98
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition: ISDOpcodes.h:730
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:232
llvm::TargetLoweringBase::isOperationLegalOrCustomOrPromote
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
Definition: TargetLowering.h:1128
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::HardwareLoopInfo
Attributes of a target dependent hardware loop.
Definition: TargetTransformInfo.h:93
llvm::TargetTransformInfoImplBase::isConstantStridedAccessLessThan
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
Definition: TargetTransformInfoImpl.h:818
llvm::TargetLoweringBase::getTypeLegalizationCost
std::pair< int, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: TargetLoweringBase.cpp:1811
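A sketch of how this estimate usually feeds a cost model; DL, Ty and the per-operation base cost of 2 are assumptions for illustration:
    // LT.first is a multiplier for how many legal-typed operations the type
    // expands to after legalization; LT.second is the legalized MVT.
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
    InstructionCost Cost = LT.first * 2; // 2 is an illustrative per-operation base cost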
llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:823
llvm::BasicTTIImplBase< ARMTTIImpl >::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1026
llvm::ARMTTIImpl::maybeLoweredToCall
bool maybeLoweredToCall(Instruction &I)
Definition: ARMTargetTransformInfo.cpp:1758
Instructions.h
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:146
llvm::ARMTTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:396
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:647
SmallVector.h
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:802
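A hedged example of matching against a value already in hand rather than capturing a new one; isDoubleOf is a hypothetical helper:
    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;
    // True when V computes X + X for this specific X (m_Value would instead
    // capture an arbitrary operand).
    static bool isDoubleOf(Value *V, Value *X) {
      return match(V, m_Add(m_Specific(X), m_Specific(X)));
    }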
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:234
llvm::ISD::UREM
@ UREM
Definition: ISDOpcodes.h:238
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:51
llvm::TailPredication::EnabledNoReductions
@ EnabledNoReductions
Definition: ARMTargetTransformInfo.h:44
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1341
llvm::BasicTTIImplBase< ARMTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: BasicTTIImpl.h:741
llvm::BasicTTIImplBase< ARMTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1224
llvm::ARMTTIImpl::isLegalMaskedStore
bool isLegalMaskedStore(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.h:176
llvm::SPF_UMIN
@ SPF_UMIN
Unsigned minimum.
Definition: ValueTracking.h:660
llvm::getBooleanLoopAttribute
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopUtils.cpp:296
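A minimal sketch of the query; L is assumed to be a Loop* and the metadata name shown is the standard marker left by the loop vectorizer:
    // Skip further work if the loop already carries the "vectorized" marker.
    if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
      return;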
llvm::ARMTargetLowering::getNumInterleavedAccesses
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
Definition: ARMISelLowering.cpp:19198
simplifyNeonVld1
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
Definition: ARMTargetTransformInfo.cpp:66
llvm::APInt::getActiveBits
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1605
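A tiny illustrative example (values chosen for illustration only):
    #include "llvm/ADT/APInt.h"
    using namespace llvm;
    APInt V(32, 255);
    unsigned Bits = V.getActiveBits(); // 8: the value fits in 8 bits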
llvm::ARMTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: ARMTargetTransformInfo.cpp:2205
llvm::BasicTTIImplBase< ARMTTIImpl >::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:983
ARMTargetTransformInfo.h
llvm::HardwareLoopInfo::canAnalyze
bool canAnalyze(LoopInfo &LI)
Definition: TargetTransformInfo.cpp:47
llvm::IRBuilderBase::CreateVectorSplat
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
Definition: IRBuilder.cpp:1028
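A short sketch, assuming an IRBuilder<> named Builder is already in scope:
    // Build a <4 x i32> vector whose four lanes all hold the value 42.
    Value *Splat = Builder.CreateVectorSplat(4, Builder.getInt32(42));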
DerivedTypes.h
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:379
EnableMaskedGatherScatters
cl::opt< bool > EnableMaskedGatherScatters
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2314
llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Get a value with low bits set.
Definition: APInt.h:667
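A small illustrative example:
    #include "llvm/ADT/APInt.h"
    #include <cassert>
    using namespace llvm;
    // A 32-bit mask with the low 16 bits set: 0x0000FFFF.
    APInt Mask = APInt::getLowBitsSet(32, 16);
    assert(Mask.getZExtValue() == 0xFFFFu);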
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:42
llvm::TargetTransformInfo::UnrollingPreferences::OptSizeThreshold
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Definition: TargetTransformInfo.h:445
llvm::BasicTTIImplBase< ARMTTIImpl >::getScalarizationOverhead
unsigned getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:603
BB
Common register allocation / spilling example: a mul/str/ldr/sxth/mla sequence that can be simplified by merging the mul and mov, which also increases the likelihood that the store becomes dead (the README's assembly listing and "Successors according to LLVM BB" dump are flattened in this index).
Definition: README.txt:39
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:171
llvm::ARMTTIImpl::preferPredicatedReductionSelect
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2224
llvm::ISD::SDIVREM
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:248
llvm::ARMTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: ARMTargetTransformInfo.cpp:800
llvm::ARMSubtarget::isThumb2
bool isThumb2() const
Definition: ARMSubtarget.h:803
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:411
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1382
llvm::LoopAccessInfo::getPSE
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
Definition: LoopAccessAnalysis.h:591
llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:716
llvm::MVT::v8i8
@ v8i8
Definition: MachineValueType.h:76
llvm::ARMTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Op1Info=TTI::OK_AnyValue, TTI::OperandValueKind Op2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: ARMTargetTransformInfo.cpp:1243
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
Definition: ScalarEvolution.cpp:7013
llvm::MVT::v8f32
@ v8f32
Definition: MachineValueType.h:147
llvm::PredicatedScalarEvolution::getSE
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
Definition: ScalarEvolution.h:2170
llvm::ARMTTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: ARMTargetTransformInfo.cpp:120
llvm::BinaryOperator::Create
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), Instruction *InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Definition: Instructions.cpp:2549
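A hedged sketch; A and B are assumed to be i32 Value* already available:
    #include "llvm/IR/Instructions.h"
    // Creates "%sum = add i32 %a, %b" as a free-standing instruction; it is
    // not inserted into a block because no insertion point is given.
    BinaryOperator *Sum = BinaryOperator::Create(Instruction::Add, A, B, "sum");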
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:84
llvm::MVT::v16i64
@ v16i64
Definition: MachineValueType.h:112
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1272
llvm::BasicTTIImplBase< ARMTTIImpl >::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2069
llvm::HardwareLoopInfo::CountType
IntegerType * CountType
Definition: TargetTransformInfo.h:100
llvm::EVT::isFixedLengthVector
bool isFixedLengthVector() const
Definition: ValueTypes.h:159
llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:804
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:52
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:212
llvm::EVT::isFloatingPoint
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:134
llvm::TargetTransformInfo::AMK_None
@ AMK_None
Definition: TargetTransformInfo.h:636
llvm::TargetTransformInfo::AMK_PreIndexed
@ AMK_PreIndexed
Definition: TargetTransformInfo.h:634
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2180
llvm::ARMSubtarget::hasVFP2Base
bool hasVFP2Base() const
Definition: ARMSubtarget.h:640
llvm::TargetLoweringBase::LibCall
@ LibCall
Definition: TargetLowering.h:196
llvm::ARMSubtarget::isThumb
bool isThumb() const
Definition: ARMSubtarget.h:800
llvm::ARMTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, bool IsPairwiseForm, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1590
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:122
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:281
llvm::MVT::v4i1
@ v4i1
Definition: MachineValueType.h:63
llvm::ARMTargetLowering::isLegalInterleavedAccessType
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
Definition: ARMISelLowering.cpp:19203
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:498
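A minimal sketch; DL is an available DataLayout, Ctx an LLVMContext, and the 8-byte result assumes a typical ARM data layout:
    #include <cassert>
    // Alloc size includes the alignment padding between successive objects.
    TypeSize Sz = DL.getTypeAllocSize(Type::getInt64Ty(Ctx));
    assert(Sz.getFixedSize() == 8);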
llvm::TailPredication::ForceEnabledNoReductions
@ ForceEnabledNoReductions
Definition: ARMTargetTransformInfo.h:46