LLVM  14.0.0git
ARMTargetTransformInfo.cpp
Go to the documentation of this file.
1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
10 #include "ARMSubtarget.h"
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/SmallVector.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
18 #include "llvm/IR/BasicBlock.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DerivedTypes.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/Intrinsics.h"
24 #include "llvm/IR/IntrinsicInst.h"
25 #include "llvm/IR/IntrinsicsARM.h"
26 #include "llvm/IR/PatternMatch.h"
27 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Casting.h"
30 #include "llvm/Support/KnownBits.h"
36 #include <algorithm>
37 #include <cassert>
38 #include <cstdint>
39 #include <utility>
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "armtti"
44 
46  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
47  cl::desc("Enable the generation of masked loads and stores"));
48 
50  "disable-arm-loloops", cl::Hidden, cl::init(false),
51  cl::desc("Disable the generation of low-overhead loops"));
52 
53 static cl::opt<bool>
54  AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
55  cl::desc("Enable the generation of WLS loops"));
56 
58 
60 
62 
63 /// Convert a vector load intrinsic into a simple llvm load instruction.
64 /// This is beneficial when the underlying object being addressed comes
65 /// from a constant, since we get constant-folding for free.
66 static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
68  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
69 
70  if (!IntrAlign)
71  return nullptr;
72 
73  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
74  ? MemAlign
75  : IntrAlign->getLimitedValue();
76 
77  if (!isPowerOf2_32(Alignment))
78  return nullptr;
79 
80  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
81  PointerType::get(II.getType(), 0));
82  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
83 }
84 
86  const Function *Callee) const {
87  const TargetMachine &TM = getTLI()->getTargetMachine();
88  const FeatureBitset &CallerBits =
89  TM.getSubtargetImpl(*Caller)->getFeatureBits();
90  const FeatureBitset &CalleeBits =
91  TM.getSubtargetImpl(*Callee)->getFeatureBits();
92 
93  // To inline a callee, all features not in the allowed list must match exactly.
94  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
95  (CalleeBits & ~InlineFeaturesAllowed);
96  // For features in the allowed list, the callee's features must be a subset of
97  // the callers'.
98  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
99  (CalleeBits & InlineFeaturesAllowed);
100  return MatchExact && MatchSubset;
101 }
102 
105  ScalarEvolution *SE) const {
106  if (ST->hasMVEIntegerOps())
107  return TTI::AMK_PostIndexed;
108 
109  if (L->getHeader()->getParent()->hasOptSize())
110  return TTI::AMK_None;
111 
112  if (ST->isMClass() && ST->isThumb2() &&
113  L->getNumBlocks() == 1)
114  return TTI::AMK_PreIndexed;
115 
116  return TTI::AMK_None;
117 }
118 
121  using namespace PatternMatch;
122  Intrinsic::ID IID = II.getIntrinsicID();
123  switch (IID) {
124  default:
125  break;
126  case Intrinsic::arm_neon_vld1: {
127  Align MemAlign =
129  &IC.getAssumptionCache(), &IC.getDominatorTree());
130  if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
131  return IC.replaceInstUsesWith(II, V);
132  }
133  break;
134  }
135 
136  case Intrinsic::arm_neon_vld2:
137  case Intrinsic::arm_neon_vld3:
138  case Intrinsic::arm_neon_vld4:
139  case Intrinsic::arm_neon_vld2lane:
140  case Intrinsic::arm_neon_vld3lane:
141  case Intrinsic::arm_neon_vld4lane:
142  case Intrinsic::arm_neon_vst1:
143  case Intrinsic::arm_neon_vst2:
144  case Intrinsic::arm_neon_vst3:
145  case Intrinsic::arm_neon_vst4:
146  case Intrinsic::arm_neon_vst2lane:
147  case Intrinsic::arm_neon_vst3lane:
148  case Intrinsic::arm_neon_vst4lane: {
149  Align MemAlign =
151  &IC.getAssumptionCache(), &IC.getDominatorTree());
152  unsigned AlignArg = II.getNumArgOperands() - 1;
153  Value *AlignArgOp = II.getArgOperand(AlignArg);
154  MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
155  if (Align && *Align < MemAlign) {
156  return IC.replaceOperand(
157  II, AlignArg,
159  false));
160  }
161  break;
162  }
163 
164  case Intrinsic::arm_mve_pred_i2v: {
165  Value *Arg = II.getArgOperand(0);
166  Value *ArgArg;
167  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
168  PatternMatch::m_Value(ArgArg))) &&
169  II.getType() == ArgArg->getType()) {
170  return IC.replaceInstUsesWith(II, ArgArg);
171  }
172  Constant *XorMask;
173  if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
174  PatternMatch::m_Value(ArgArg)),
175  PatternMatch::m_Constant(XorMask))) &&
176  II.getType() == ArgArg->getType()) {
177  if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
178  if (CI->getValue().trunc(16).isAllOnesValue()) {
179  auto TrueVector = IC.Builder.CreateVectorSplat(
180  cast<FixedVectorType>(II.getType())->getNumElements(),
181  IC.Builder.getTrue());
182  return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
183  }
184  }
185  }
186  KnownBits ScalarKnown(32);
187  if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
188  ScalarKnown, 0)) {
189  return &II;
190  }
191  break;
192  }
193  case Intrinsic::arm_mve_pred_v2i: {
194  Value *Arg = II.getArgOperand(0);
195  Value *ArgArg;
196  if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
197  PatternMatch::m_Value(ArgArg)))) {
198  return IC.replaceInstUsesWith(II, ArgArg);
199  }
200  if (!II.getMetadata(LLVMContext::MD_range)) {
201  Type *IntTy32 = Type::getInt32Ty(II.getContext());
202  Metadata *M[] = {
204  ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
205  II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
206  return &II;
207  }
208  break;
209  }
210  case Intrinsic::arm_mve_vadc:
211  case Intrinsic::arm_mve_vadc_predicated: {
212  unsigned CarryOp =
213  (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
214  assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
215  "Bad type for intrinsic!");
216 
217  KnownBits CarryKnown(32);
218  if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
219  CarryKnown)) {
220  return &II;
221  }
222  break;
223  }
224  case Intrinsic::arm_mve_vmldava: {
225  Instruction *I = cast<Instruction>(&II);
226  if (I->hasOneUse()) {
227  auto *User = cast<Instruction>(*I->user_begin());
228  Value *OpZ;
229  if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
230  match(I->getOperand(3), m_Zero())) {
231  Value *OpX = I->getOperand(4);
232  Value *OpY = I->getOperand(5);
233  Type *OpTy = OpX->getType();
234 
236  Value *V =
237  IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
238  {I->getOperand(0), I->getOperand(1),
239  I->getOperand(2), OpZ, OpX, OpY});
240 
241  IC.replaceInstUsesWith(*User, V);
242  return IC.eraseInstFromFunction(*User);
243  }
244  }
245  return None;
246  }
247  }
248  return None;
249 }
250 
252  InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
253  APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
254  std::function<void(Instruction *, unsigned, APInt, APInt &)>
255  SimplifyAndSetOp) const {
256 
257  // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
258  // opcode specifying a Top/Bottom instruction, which can change between
259  // instructions.
260  auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) {
261  unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
262  unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
263 
264  // The only odd/even lanes of operand 0 will only be demanded depending
265  // on whether this is a top/bottom instruction.
266  APInt DemandedElts =
267  APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
268  : APInt::getHighBitsSet(2, 1));
269  SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
270  // The other lanes will be defined from the inserted elements.
271  UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
272  : APInt::getHighBitsSet(2, 1));
273  return None;
274  };
275 
276  switch (II.getIntrinsicID()) {
277  default:
278  break;
279  case Intrinsic::arm_mve_vcvt_narrow:
280  SimplifyNarrowInstrTopBottom(2);
281  break;
282  case Intrinsic::arm_mve_vqmovn:
283  SimplifyNarrowInstrTopBottom(4);
284  break;
285  case Intrinsic::arm_mve_vshrn:
286  SimplifyNarrowInstrTopBottom(7);
287  break;
288  }
289 
290  return None;
291 }
292 
295  assert(Ty->isIntegerTy());
296 
297  unsigned Bits = Ty->getPrimitiveSizeInBits();
298  if (Bits == 0 || Imm.getActiveBits() >= 64)
299  return 4;
300 
301  int64_t SImmVal = Imm.getSExtValue();
302  uint64_t ZImmVal = Imm.getZExtValue();
303  if (!ST->isThumb()) {
304  if ((SImmVal >= 0 && SImmVal < 65536) ||
305  (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
306  (ARM_AM::getSOImmVal(~ZImmVal) != -1))
307  return 1;
308  return ST->hasV6T2Ops() ? 2 : 3;
309  }
310  if (ST->isThumb2()) {
311  if ((SImmVal >= 0 && SImmVal < 65536) ||
312  (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
313  (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
314  return 1;
315  return ST->hasV6T2Ops() ? 2 : 3;
316  }
317  // Thumb1, any i8 imm cost 1.
318  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
319  return 1;
320  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
321  return 2;
322  // Load from constantpool.
323  return 3;
324 }
325 
326 // Constants smaller than 256 fit in the immediate field of
327 // Thumb1 instructions so we return a zero cost and 1 otherwise.
329  const APInt &Imm, Type *Ty) {
330  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
331  return 0;
332 
333  return 1;
334 }
335 
336 // Checks whether Inst is part of a min(max()) or max(min()) pattern
337 // that will match to an SSAT instruction
338 static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
339  Value *LHS, *RHS;
340  ConstantInt *C;
341  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
342 
343  if (InstSPF == SPF_SMAX &&
345  C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
346 
347  auto isSSatMin = [&](Value *MinInst) {
348  if (isa<SelectInst>(MinInst)) {
349  Value *MinLHS, *MinRHS;
350  ConstantInt *MinC;
351  SelectPatternFlavor MinSPF =
352  matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
353  if (MinSPF == SPF_SMIN &&
355  MinC->getValue() == ((-Imm) - 1))
356  return true;
357  }
358  return false;
359  };
360 
361  if (isSSatMin(Inst->getOperand(1)) ||
362  (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
363  isSSatMin(*(++Inst->user_begin())))))
364  return true;
365  }
366  return false;
367 }
368 
369 InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
370  const APInt &Imm, Type *Ty,
372  Instruction *Inst) {
373  // Division by a constant can be turned into multiplication, but only if we
374  // know it's constant. So it's not so much that the immediate is cheap (it's
375  // not), but that the alternative is worse.
376  // FIXME: this is probably unneeded with GlobalISel.
377  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
378  Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
379  Idx == 1)
380  return 0;
381 
382  // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
383  // splitting any large offsets.
384  if (Opcode == Instruction::GetElementPtr && Idx != 0)
385  return 0;
386 
387  if (Opcode == Instruction::And) {
388  // UXTB/UXTH
389  if (Imm == 255 || Imm == 65535)
390  return 0;
391  // Conversion to BIC is free, and means we can use ~Imm instead.
392  return std::min(getIntImmCost(Imm, Ty, CostKind),
393  getIntImmCost(~Imm, Ty, CostKind));
394  }
395 
396  if (Opcode == Instruction::Add)
397  // Conversion to SUB is free, and means we can use -Imm instead.
398  return std::min(getIntImmCost(Imm, Ty, CostKind),
399  getIntImmCost(-Imm, Ty, CostKind));
400 
401  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
402  Ty->getIntegerBitWidth() == 32) {
403  int64_t NegImm = -Imm.getSExtValue();
404  if (ST->isThumb2() && NegImm < 1<<12)
405  // icmp X, #-C -> cmn X, #C
406  return 0;
407  if (ST->isThumb() && NegImm < 1<<8)
408  // icmp X, #-C -> adds X, #C
409  return 0;
410  }
411 
412  // xor a, -1 can always be folded to MVN
413  if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
414  return 0;
415 
416  // Ensures negative constant of min(max()) or max(min()) patterns that
417  // match to SSAT instructions don't get hoisted
418  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
419  Ty->getIntegerBitWidth() <= 32) {
420  if (isSSATMinMaxPattern(Inst, Imm) ||
421  (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
422  isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
423  return 0;
424  }
425 
426  return getIntImmCost(Imm, Ty, CostKind);
427 }
428 
431  const Instruction *I) {
433  (ST->hasNEON() || ST->hasMVEIntegerOps())) {
434  // FIXME: The vectorizer is highly sensistive to the cost of these
435  // instructions, which suggests that it may be using the costs incorrectly.
436  // But, for now, just make them free to avoid performance regressions for
437  // vector targets.
438  return 0;
439  }
440  return BaseT::getCFInstrCost(Opcode, CostKind, I);
441 }
442 
444  Type *Src,
447  const Instruction *I) {
448  int ISD = TLI->InstructionOpcodeToISD(Opcode);
449  assert(ISD && "Invalid opcode");
450 
451  // TODO: Allow non-throughput costs that aren't binary.
452  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
454  return Cost == 0 ? 0 : 1;
455  return Cost;
456  };
457  auto IsLegalFPType = [this](EVT VT) {
458  EVT EltVT = VT.getScalarType();
459  return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
460  (EltVT == MVT::f64 && ST->hasFP64()) ||
461  (EltVT == MVT::f16 && ST->hasFullFP16());
462  };
463 
464  EVT SrcTy = TLI->getValueType(DL, Src);
465  EVT DstTy = TLI->getValueType(DL, Dst);
466 
467  if (!SrcTy.isSimple() || !DstTy.isSimple())
468  return AdjustCost(
469  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
470 
471  // Extending masked load/Truncating masked stores is expensive because we
472  // currently don't split them. This means that we'll likely end up
473  // loading/storing each element individually (hence the high cost).
474  if ((ST->hasMVEIntegerOps() &&
475  (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
476  Opcode == Instruction::SExt)) ||
477  (ST->hasMVEFloatOps() &&
478  (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
479  IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
480  if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
481  return 2 * DstTy.getVectorNumElements() *
483 
484  // The extend of other kinds of load is free
485  if (CCH == TTI::CastContextHint::Normal ||
487  static const TypeConversionCostTblEntry LoadConversionTbl[] = {
500  };
501  if (const auto *Entry = ConvertCostTableLookup(
502  LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
503  return AdjustCost(Entry->Cost);
504 
505  static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
512  // The following extend from a legal type to an illegal type, so need to
513  // split the load. This introduced an extra load operation, but the
514  // extend is still "free".
521  };
522  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
523  if (const auto *Entry =
524  ConvertCostTableLookup(MVELoadConversionTbl, ISD,
525  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
526  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
527  }
528 
529  static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
530  // FPExtends are similar but also require the VCVT instructions.
533  };
534  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
535  if (const auto *Entry =
536  ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
537  DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
538  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
539  }
540 
541  // The truncate of a store is free. This is the mirror of extends above.
542  static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
550  };
551  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
552  if (const auto *Entry =
553  ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
554  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
555  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
556  }
557 
558  static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
561  };
562  if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
563  if (const auto *Entry =
564  ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
565  SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
566  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
567  }
568  }
569 
570  // NEON vector operations that can extend their inputs.
571  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
572  I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
573  static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
574  // vaddl
575  { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
576  { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
577  // vsubl
578  { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
579  { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
580  // vmull
581  { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
582  { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
583  // vshll
584  { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
585  { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
586  };
587 
588  auto *User = cast<Instruction>(*I->user_begin());
589  int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
590  if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
591  DstTy.getSimpleVT(),
592  SrcTy.getSimpleVT())) {
593  return AdjustCost(Entry->Cost);
594  }
595  }
596 
597  // Single to/from double precision conversions.
598  if (Src->isVectorTy() && ST->hasNEON() &&
599  ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
600  DstTy.getScalarType() == MVT::f32) ||
601  (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
602  DstTy.getScalarType() == MVT::f64))) {
603  static const CostTblEntry NEONFltDblTbl[] = {
604  // Vector fptrunc/fpext conversions.
607  {ISD::FP_EXTEND, MVT::v4f32, 4}};
608 
609  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
610  if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
611  return AdjustCost(LT.first * Entry->Cost);
612  }
613 
614  // Some arithmetic, load and store operations have specific instructions
615  // to cast up/down their types automatically at no extra cost.
616  // TODO: Get these tables to know at least what the related operations are.
617  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
624 
625  // The number of vmovl instructions for the extension.
644 
645  // Operations that we legalize using splitting.
648 
649  // Vector float <-> i32 conversions.
652 
673 
680 
681  // Vector double <-> i32 conversions.
684 
691 
698  };
699 
700  if (SrcTy.isVector() && ST->hasNEON()) {
701  if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
702  DstTy.getSimpleVT(),
703  SrcTy.getSimpleVT()))
704  return AdjustCost(Entry->Cost);
705  }
706 
707  // Scalar float to integer conversions.
708  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
729  };
730  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
731  if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
732  DstTy.getSimpleVT(),
733  SrcTy.getSimpleVT()))
734  return AdjustCost(Entry->Cost);
735  }
736 
737  // Scalar integer to float conversions.
738  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
759  };
760 
761  if (SrcTy.isInteger() && ST->hasNEON()) {
762  if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
763  ISD, DstTy.getSimpleVT(),
764  SrcTy.getSimpleVT()))
765  return AdjustCost(Entry->Cost);
766  }
767 
768  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
769  // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext
770  // are linearised so take more.
771  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
784  };
785 
786  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
787  if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
788  ISD, DstTy.getSimpleVT(),
789  SrcTy.getSimpleVT()))
790  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
791  }
792 
793  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
794  // As general rule, fp converts that were not matched above are scalarized
795  // and cost 1 vcvt for each lane, so long as the instruction is available.
796  // If not it will become a series of function calls.
797  const InstructionCost CallCost =
798  getCallInstrCost(nullptr, Dst, {Src}, CostKind);
799  int Lanes = 1;
800  if (SrcTy.isFixedLengthVector())
801  Lanes = SrcTy.getVectorNumElements();
802 
803  if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
804  return Lanes;
805  else
806  return Lanes * CallCost;
807  }
808 
809  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
810  SrcTy.isFixedLengthVector()) {
811  // Treat a truncate with larger than legal source (128bits for MVE) as
812  // expensive, 2 instructions per lane.
813  if ((SrcTy.getScalarType() == MVT::i8 ||
814  SrcTy.getScalarType() == MVT::i16 ||
815  SrcTy.getScalarType() == MVT::i32) &&
816  SrcTy.getSizeInBits() > 128 &&
817  SrcTy.getSizeInBits() > DstTy.getSizeInBits())
818  return SrcTy.getVectorNumElements() * 2;
819  }
820 
821  // Scalar integer conversion costs.
822  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
823  // i16 -> i64 requires two dependent operations.
825 
826  // Truncates on i64 are assumed to be free.
829  { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
831  };
832 
833  if (SrcTy.isInteger()) {
834  if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
835  DstTy.getSimpleVT(),
836  SrcTy.getSimpleVT()))
837  return AdjustCost(Entry->Cost);
838  }
839 
840  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
842  : 1;
843  return AdjustCost(
844  BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
845 }
846 
848  unsigned Index) {
849  // Penalize inserting into an D-subregister. We end up with a three times
850  // lower estimated throughput on swift.
851  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
852  ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
853  return 3;
854 
855  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
856  Opcode == Instruction::ExtractElement)) {
857  // Cross-class copies are expensive on many microarchitectures,
858  // so assume they are expensive by default.
859  if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
860  return 3;
861 
862  // Even if it's not a cross class copy, this likely leads to mixing
863  // of NEON and VFP code and should be therefore penalized.
864  if (ValTy->isVectorTy() &&
865  ValTy->getScalarSizeInBits() <= 32)
866  return std::max<InstructionCost>(
867  BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
868  }
869 
870  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
871  Opcode == Instruction::ExtractElement)) {
872  // Integer cross-lane moves are more expensive than float, which can
873  // sometimes just be vmovs. Integer involve being passes to GPR registers,
874  // causing more of a delay.
875  std::pair<InstructionCost, MVT> LT =
876  getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
877  return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
878  }
879 
880  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
881 }
882 
884  Type *CondTy,
885  CmpInst::Predicate VecPred,
887  const Instruction *I) {
888  int ISD = TLI->InstructionOpcodeToISD(Opcode);
889 
890  // Thumb scalar code size cost for select.
891  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
892  ST->isThumb() && !ValTy->isVectorTy()) {
893  // Assume expensive structs.
894  if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
895  return TTI::TCC_Expensive;
896 
897  // Select costs can vary because they:
898  // - may require one or more conditional mov (including an IT),
899  // - can't operate directly on immediates,
900  // - require live flags, which we can't copy around easily.
901  InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
902 
903  // Possible IT instruction for Thumb2, or more for Thumb1.
904  ++Cost;
905 
906  // i1 values may need rematerialising by using mov immediates and/or
907  // flag setting instructions.
908  if (ValTy->isIntegerTy(1))
909  ++Cost;
910 
911  return Cost;
912  }
913 
914  // If this is a vector min/max/abs, use the cost of that intrinsic directly
915  // instead. Hopefully when min/max intrinsics are more prevalent this code
916  // will not be needed.
917  const Instruction *Sel = I;
918  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
919  Sel->hasOneUse())
920  Sel = cast<Instruction>(Sel->user_back());
921  if (Sel && ValTy->isVectorTy() &&
922  (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
923  const Value *LHS, *RHS;
924  SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
925  unsigned IID = 0;
926  switch (SPF) {
927  case SPF_ABS:
928  IID = Intrinsic::abs;
929  break;
930  case SPF_SMIN:
931  IID = Intrinsic::smin;
932  break;
933  case SPF_SMAX:
934  IID = Intrinsic::smax;
935  break;
936  case SPF_UMIN:
937  IID = Intrinsic::umin;
938  break;
939  case SPF_UMAX:
940  IID = Intrinsic::umax;
941  break;
942  case SPF_FMINNUM:
943  IID = Intrinsic::minnum;
944  break;
945  case SPF_FMAXNUM:
946  IID = Intrinsic::maxnum;
947  break;
948  default:
949  break;
950  }
951  if (IID) {
952  // The ICmp is free, the select gets the cost of the min/max/etc
953  if (Sel != I)
954  return 0;
955  IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
956  return getIntrinsicInstrCost(CostAttrs, CostKind);
957  }
958  }
959 
960  // On NEON a vector select gets lowered to vbsl.
961  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
962  // Lowering of some vector selects is currently far from perfect.
963  static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
964  { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
965  { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
967  };
968 
969  EVT SelCondTy = TLI->getValueType(DL, CondTy);
970  EVT SelValTy = TLI->getValueType(DL, ValTy);
971  if (SelCondTy.isSimple() && SelValTy.isSimple()) {
972  if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
973  SelCondTy.getSimpleVT(),
974  SelValTy.getSimpleVT()))
975  return Entry->Cost;
976  }
977 
978  std::pair<InstructionCost, MVT> LT =
979  TLI->getTypeLegalizationCost(DL, ValTy);
980  return LT.first;
981  }
982 
983  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
984  (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
985  cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
986  FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
987  FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
988  if (!VecCondTy)
989  VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
990 
991  // If we don't have mve.fp any fp operations will need to be scalarized.
992  if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
993  // One scalaization insert, one scalarization extract and the cost of the
994  // fcmps.
995  return BaseT::getScalarizationOverhead(VecValTy, false, true) +
996  BaseT::getScalarizationOverhead(VecCondTy, true, false) +
997  VecValTy->getNumElements() *
998  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
999  VecCondTy->getScalarType(), VecPred, CostKind,
1000  I);
1001  }
1002 
1003  std::pair<InstructionCost, MVT> LT =
1004  TLI->getTypeLegalizationCost(DL, ValTy);
1005  int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1006  // There are two types - the input that specifies the type of the compare
1007  // and the output vXi1 type. Because we don't know how the output will be
1008  // split, we may need an expensive shuffle to get two in sync. This has the
1009  // effect of making larger than legal compares (v8i32 for example)
1010  // expensive.
1011  if (LT.second.getVectorNumElements() > 2) {
1012  if (LT.first > 1)
1013  return LT.first * BaseCost +
1014  BaseT::getScalarizationOverhead(VecCondTy, true, false);
1015  return BaseCost;
1016  }
1017  }
1018 
1019  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1020  // for "multiple beats" potentially needed by MVE instructions.
1021  int BaseCost = 1;
1022  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1023  BaseCost = ST->getMVEVectorCostFactor(CostKind);
1024 
1025  return BaseCost *
1026  BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1027 }
1028 
1030  ScalarEvolution *SE,
1031  const SCEV *Ptr) {
1032  // Address computations in vectorized code with non-consecutive addresses will
1033  // likely result in more instructions compared to scalar code where the
1034  // computation can more often be merged into the index mode. The resulting
1035  // extra micro-ops can significantly decrease throughput.
1036  unsigned NumVectorInstToHideOverhead = 10;
1037  int MaxMergeDistance = 64;
1038 
1039  if (ST->hasNEON()) {
1040  if (Ty->isVectorTy() && SE &&
1041  !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1042  return NumVectorInstToHideOverhead;
1043 
1044  // In many cases the address computation is not merged into the instruction
1045  // addressing mode.
1046  return 1;
1047  }
1048  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1049 }
1050 
1052  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1053  // If a VCTP is part of a chain, it's already profitable and shouldn't be
1054  // optimized, else LSR may block tail-predication.
1055  switch (II->getIntrinsicID()) {
1056  case Intrinsic::arm_mve_vctp8:
1057  case Intrinsic::arm_mve_vctp16:
1058  case Intrinsic::arm_mve_vctp32:
1059  case Intrinsic::arm_mve_vctp64:
1060  return true;
1061  default:
1062  break;
1063  }
1064  }
1065  return false;
1066 }
1067 
1068 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1070  return false;
1071 
1072  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1073  // Don't support v2i1 yet.
1074  if (VecTy->getNumElements() == 2)
1075  return false;
1076 
1077  // We don't support extending fp types.
1078  unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1079  if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1080  return false;
1081  }
1082 
1083  unsigned EltWidth = DataTy->getScalarSizeInBits();
1084  return (EltWidth == 32 && Alignment >= 4) ||
1085  (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1086 }
1087 
1090  return false;
1091 
1092  // This method is called in 2 places:
1093  // - from the vectorizer with a scalar type, in which case we need to get
1094  // this as good as we can with the limited info we have (and rely on the cost
1095  // model for the rest).
1096  // - from the masked intrinsic lowering pass with the actual vector type.
1097  // For MVE, we have a custom lowering pass that will already have custom
1098  // legalised any gathers that we can to MVE intrinsics, and want to expand all
1099  // the rest. The pass runs before the masked intrinsic lowering pass, so if we
1100  // are here, we know we want to expand.
1101  if (isa<VectorType>(Ty))
1102  return false;
1103 
1104  unsigned EltWidth = Ty->getScalarSizeInBits();
1105  return ((EltWidth == 32 && Alignment >= 4) ||
1106  (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1107 }
1108 
1109 /// Given a memcpy/memset/memmove instruction, return the number of memory
1110 /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1111 /// call is used.
1113  MemOp MOp;
1114  unsigned DstAddrSpace = ~0u;
1115  unsigned SrcAddrSpace = ~0u;
1116  const Function *F = I->getParent()->getParent();
1117 
1118  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1119  ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1120  // If 'size' is not a constant, a library call will be generated.
1121  if (!C)
1122  return -1;
1123 
1124  const unsigned Size = C->getValue().getZExtValue();
1125  const Align DstAlign = *MC->getDestAlign();
1126  const Align SrcAlign = *MC->getSourceAlign();
1127 
1128  MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1129  /*IsVolatile*/ false);
1130  DstAddrSpace = MC->getDestAddressSpace();
1131  SrcAddrSpace = MC->getSourceAddressSpace();
1132  }
1133  else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1134  ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1135  // If 'size' is not a constant, a library call will be generated.
1136  if (!C)
1137  return -1;
1138 
1139  const unsigned Size = C->getValue().getZExtValue();
1140  const Align DstAlign = *MS->getDestAlign();
1141 
1142  MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1143  /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1144  DstAddrSpace = MS->getDestAddressSpace();
1145  }
1146  else
1147  llvm_unreachable("Expected a memcpy/move or memset!");
1148 
1149  unsigned Limit, Factor = 2;
1150  switch(I->getIntrinsicID()) {
1151  case Intrinsic::memcpy:
1152  Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1153  break;
1154  case Intrinsic::memmove:
1155  Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1156  break;
1157  case Intrinsic::memset:
1158  Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1159  Factor = 1;
1160  break;
1161  default:
1162  llvm_unreachable("Expected a memcpy/move or memset!");
1163  }
1164 
1165  // MemOps will be poplulated with a list of data types that needs to be
1166  // loaded and stored. That's why we multiply the number of elements by 2 to
1167  // get the cost for this memcpy.
1168  std::vector<EVT> MemOps;
1169  if (getTLI()->findOptimalMemOpLowering(
1170  MemOps, Limit, MOp, DstAddrSpace,
1171  SrcAddrSpace, F->getAttributes()))
1172  return MemOps.size() * Factor;
1173 
1174  // If we can't find an optimal memop lowering, return the default cost
1175  return -1;
1176 }
1177 
1179  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1180 
1181  // To model the cost of a library call, we assume 1 for the call, and
1182  // 3 for the argument setup.
1183  if (NumOps == -1)
1184  return 4;
1185  return NumOps;
1186 }
1187 
1190  int Index, VectorType *SubTp) {
1192  if (ST->hasNEON()) {
1193  if (Kind == TTI::SK_Broadcast) {
1194  static const CostTblEntry NEONDupTbl[] = {
1195  // VDUP handles these cases.
1202 
1207 
1208  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1209  if (const auto *Entry =
1210  CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1211  return LT.first * Entry->Cost;
1212  }
1213  if (Kind == TTI::SK_Reverse) {
1214  static const CostTblEntry NEONShuffleTbl[] = {
1215  // Reverse shuffle cost one instruction if we are shuffling within a
1216  // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1223 
1228 
1229  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1230  if (const auto *Entry =
1231  CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1232  return LT.first * Entry->Cost;
1233  }
1234  if (Kind == TTI::SK_Select) {
1235  static const CostTblEntry NEONSelShuffleTbl[] = {
1236  // Select shuffle cost table for ARM. Cost is the number of
1237  // instructions
1238  // required to create the shuffled vector.
1239 
1244 
1248 
1250 
1252 
1253  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1254  if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1255  ISD::VECTOR_SHUFFLE, LT.second))
1256  return LT.first * Entry->Cost;
1257  }
1258  }
1259  if (ST->hasMVEIntegerOps()) {
1260  if (Kind == TTI::SK_Broadcast) {
1261  static const CostTblEntry MVEDupTbl[] = {
1262  // VDUP handles these cases.
1268 
1269  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1270  if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1271  LT.second))
1272  return LT.first * Entry->Cost *
1274  }
1275 
1276  if (!Mask.empty()) {
1277  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1278  if (Mask.size() <= LT.second.getVectorNumElements() &&
1279  (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1280  isVREVMask(Mask, LT.second, 64)))
1282  }
1283  }
1284 
1285  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1287  : 1;
1288  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1289 }
1290 
1292  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1294  TTI::OperandValueProperties Opd1PropInfo,
1296  const Instruction *CxtI) {
1297  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1298  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1299  // Make operations on i1 relatively expensive as this often involves
1300  // combining predicates. AND and XOR should be easier to handle with IT
1301  // blocks.
1302  switch (ISDOpcode) {
1303  default:
1304  break;
1305  case ISD::AND:
1306  case ISD::XOR:
1307  return 2;
1308  case ISD::OR:
1309  return 3;
1310  }
1311  }
1312 
1313  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1314 
1315  if (ST->hasNEON()) {
1316  const unsigned FunctionCallDivCost = 20;
1317  const unsigned ReciprocalDivCost = 10;
1318  static const CostTblEntry CostTbl[] = {
1319  // Division.
1320  // These costs are somewhat random. Choose a cost of 20 to indicate that
1321  // vectorizing devision (added function call) is going to be very expensive.
1322  // Double registers types.
1323  { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1324  { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1325  { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1326  { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1327  { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1328  { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1329  { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1330  { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1331  { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1332  { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1333  { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1334  { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1335  { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1336  { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1337  { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1338  { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1339  // Quad register types.
1340  { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1341  { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1342  { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1343  { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1344  { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1345  { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1346  { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1347  { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1348  { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1349  { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1350  { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1351  { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1352  { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1353  { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1354  { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1355  { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1356  // Multiplication.
1357  };
1358 
1359  if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1360  return LT.first * Entry->Cost;
1361 
1363  Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
1364 
1365  // This is somewhat of a hack. The problem that we are facing is that SROA
1366  // creates a sequence of shift, and, or instructions to construct values.
1367  // These sequences are recognized by the ISel and have zero-cost. Not so for
1368  // the vectorized code. Because we have support for v2i64 but not i64 those
1369  // sequences look particularly beneficial to vectorize.
1370  // To work around this we increase the cost of v2i64 operations to make them
1371  // seem less beneficial.
1372  if (LT.second == MVT::v2i64 &&
1374  Cost += 4;
1375 
1376  return Cost;
1377  }
1378 
1379  // If this operation is a shift on arm/thumb2, it might well be folded into
1380  // the following instruction, hence having a cost of 0.
1381  auto LooksLikeAFreeShift = [&]() {
1382  if (ST->isThumb1Only() || Ty->isVectorTy())
1383  return false;
1384 
1385  if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1386  return false;
1388  return false;
1389 
1390  // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1391  switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1392  case Instruction::Add:
1393  case Instruction::Sub:
1394  case Instruction::And:
1395  case Instruction::Xor:
1396  case Instruction::Or:
1397  case Instruction::ICmp:
1398  return true;
1399  default:
1400  return false;
1401  }
1402  };
1403  if (LooksLikeAFreeShift())
1404  return 0;
1405 
1406  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1407  // for "multiple beats" potentially needed by MVE instructions.
1408  int BaseCost = 1;
1409  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1410  BaseCost = ST->getMVEVectorCostFactor(CostKind);
1411 
1412  // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1413  // without treating floats as more expensive that scalars or increasing the
1414  // costs for custom operations. The results is also multiplied by the
1415  // MVEVectorCostFactor where appropriate.
1416  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1417  return LT.first * BaseCost;
1418 
1419  // Else this is expand, assume that we need to scalarize this op.
1420  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1421  unsigned Num = VTy->getNumElements();
1422  InstructionCost Cost =
1424  // Return the cost of multiple scalar invocation plus the cost of
1425  // inserting and extracting the values.
1426  SmallVector<Type *> Tys(Args.size(), Ty);
1427  return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
1428  }
1429 
1430  return BaseCost;
1431 }
1432 
1434  MaybeAlign Alignment,
1435  unsigned AddressSpace,
1437  const Instruction *I) {
1438  // TODO: Handle other cost kinds.
1440  return 1;
1441 
1442  // Type legalization can't handle structs
1443  if (TLI->getValueType(DL, Src, true) == MVT::Other)
1444  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1445  CostKind);
1446 
1447  if (ST->hasNEON() && Src->isVectorTy() &&
1448  (Alignment && *Alignment != Align(16)) &&
1449  cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1450  // Unaligned loads/stores are extremely inefficient.
1451  // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
1452  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
1453  return LT.first * 4;
1454  }
1455 
1456  // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1457  // Same for stores.
1458  if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1459  ((Opcode == Instruction::Load && I->hasOneUse() &&
1460  isa<FPExtInst>(*I->user_begin())) ||
1461  (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1462  FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1463  Type *DstTy =
1464  Opcode == Instruction::Load
1465  ? (*I->user_begin())->getType()
1466  : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1467  if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1468  DstTy->getScalarType()->isFloatTy())
1469  return ST->getMVEVectorCostFactor(CostKind);
1470  }
1471 
1472  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1474  : 1;
1475  return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1476  CostKind, I);
1477 }
1478 
1480 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1481  unsigned AddressSpace,
1483  if (ST->hasMVEIntegerOps()) {
1484  if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1485  return ST->getMVEVectorCostFactor(CostKind);
1486  if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1487  return ST->getMVEVectorCostFactor(CostKind);
1488  }
1489  if (!isa<FixedVectorType>(Src))
1490  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1491  CostKind);
1492  // Scalar cost, which is currently very high due to the efficiency of the
1493  // generated code.
1494  return cast<FixedVectorType>(Src)->getNumElements() * 8;
1495 }
1496 
1498  unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1499  Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1500  bool UseMaskForCond, bool UseMaskForGaps) {
1501  assert(Factor >= 2 && "Invalid interleave factor");
1502  assert(isa<VectorType>(VecTy) && "Expect a vector type");
1503 
1504  // vldN/vstN doesn't support vector types of i64/f64 element.
1505  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1506 
1507  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1508  !UseMaskForCond && !UseMaskForGaps) {
1509  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1510  auto *SubVecTy =
1511  FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1512 
1513  // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1514  // Accesses having vector types that are a multiple of 128 bits can be
1515  // matched to more than one vldN/vstN instruction.
1516  int BaseCost =
1518  if (NumElts % Factor == 0 &&
1519  TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1520  return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1521 
1522  // Some smaller than legal interleaved patterns are cheap as we can make
1523  // use of the vmovn or vrev patterns to interleave a standard load. This is
1524  // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1525  // promoted differently). The cost of 2 here is then a load and vrev or
1526  // vmovn.
1527  if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1528  VecTy->isIntOrIntVectorTy() &&
1529  DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1530  return 2 * BaseCost;
1531  }
1532 
1533  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1534  Alignment, AddressSpace, CostKind,
1535  UseMaskForCond, UseMaskForGaps);
1536 }
1537 
1539  unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1540  Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1541  using namespace PatternMatch;
1542  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1543  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1544  Alignment, CostKind, I);
1545 
1546  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1547  auto *VTy = cast<FixedVectorType>(DataTy);
1548 
1549  // TODO: Splitting, once we do that.
1550 
1551  unsigned NumElems = VTy->getNumElements();
1552  unsigned EltSize = VTy->getScalarSizeInBits();
1553  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
1554 
1555  // For now, it is assumed that for the MVE gather instructions the loads are
1556  // all effectively serialised. This means the cost is the scalar cost
1557  // multiplied by the number of elements being loaded. This is possibly very
1558  // conservative, but even so we still end up vectorising loops because the
1559  // cost per iteration for many loops is lower than for scalar loops.
1560  InstructionCost VectorCost =
1561  NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1562  // The scalarization cost should be a lot higher. We use the number of vector
1563  // elements plus the scalarization overhead.
1564  InstructionCost ScalarCost =
1565  NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1566  BaseT::getScalarizationOverhead(VTy, false, true);
1567 
1568  if (EltSize < 8 || Alignment < EltSize / 8)
1569  return ScalarCost;
1570 
1571  unsigned ExtSize = EltSize;
1572  // Check whether there's a single user that asks for an extended type
1573  if (I != nullptr) {
1574  // Dependent of the caller of this function, a gather instruction will
1575  // either have opcode Instruction::Load or be a call to the masked_gather
1576  // intrinsic
1577  if ((I->getOpcode() == Instruction::Load ||
1578  match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1579  I->hasOneUse()) {
1580  const User *Us = *I->users().begin();
1581  if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1582  // only allow valid type combinations
1583  unsigned TypeSize =
1584  cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1585  if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1586  (TypeSize == 16 && EltSize == 8)) &&
1587  TypeSize * NumElems == 128) {
1588  ExtSize = TypeSize;
1589  }
1590  }
1591  }
1592  // Check whether the input data needs to be truncated
1593  TruncInst *T;
1594  if ((I->getOpcode() == Instruction::Store ||
1595  match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1596  (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1597  // Only allow valid type combinations
1598  unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1599  if (((EltSize == 16 && TypeSize == 32) ||
1600  (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1601  TypeSize * NumElems == 128)
1602  ExtSize = TypeSize;
1603  }
1604  }
1605 
1606  if (ExtSize * NumElems != 128 || NumElems < 4)
1607  return ScalarCost;
1608 
1609  // Any (aligned) i32 gather will not need to be scalarised.
1610  if (ExtSize == 32)
1611  return VectorCost;
1612  // For smaller types, we need to ensure that the gep's inputs are correctly
1613  // extended from a small enough value. Other sizes (including i64) are
1614  // scalarized for now.
1615  if (ExtSize != 8 && ExtSize != 16)
1616  return ScalarCost;
1617 
1618  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1619  Ptr = BC->getOperand(0);
1620  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1621  if (GEP->getNumOperands() != 2)
1622  return ScalarCost;
1623  unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1624  // Scale needs to be correct (which is only relevant for i16s).
1625  if (Scale != 1 && Scale * 8 != ExtSize)
1626  return ScalarCost;
1627  // And we need to zext (not sext) the indexes from a small enough type.
1628  if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1629  if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1630  return VectorCost;
1631  }
1632  return ScalarCost;
1633  }
1634  return ScalarCost;
1635 }
1636 
1642  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1643 
1644  EVT ValVT = TLI->getValueType(DL, ValTy);
1645  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1646  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1647  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1648 
1649  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1650 
1651  static const CostTblEntry CostTblAdd[]{
1652  {ISD::ADD, MVT::v16i8, 1},
1653  {ISD::ADD, MVT::v8i16, 1},
1654  {ISD::ADD, MVT::v4i32, 1},
1655  };
1656  if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1657  return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1658 
1659  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1660 }
1661 
1663 ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1664  Type *ResTy, VectorType *ValTy,
1666  EVT ValVT = TLI->getValueType(DL, ValTy);
1667  EVT ResVT = TLI->getValueType(DL, ResTy);
1668 
1669  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1670  std::pair<InstructionCost, MVT> LT =
1671  TLI->getTypeLegalizationCost(DL, ValTy);
1672 
1673  // The legal cases are:
1674  // VADDV u/s 8/16/32
1675  // VMLAV u/s 8/16/32
1676  // VADDLV u/s 32
1677  // VMLALV u/s 16/32
1678  // Codegen currently cannot always handle larger than legal vectors very
1679  // well, especially for predicated reductions where the mask needs to be
1680  // split, so restrict to 128bit or smaller input types.
1681  unsigned RevVTSize = ResVT.getSizeInBits();
1682  if (ValVT.getSizeInBits() <= 128 &&
1683  ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1684  (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
1685  (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1686  return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1687  }
1688 
1689  return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1690  CostKind);
1691 }
1692 
1696  switch (ICA.getID()) {
1697  case Intrinsic::get_active_lane_mask:
1698  // Currently we make a somewhat optimistic assumption that
1699  // active_lane_mask's are always free. In reality it may be freely folded
1700  // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1701  // of add/icmp code. We may need to improve this in the future, but being
1702  // able to detect if it is free or not involves looking at a lot of other
1703  // code. We currently assume that the vectorizer inserted these, and knew
1704  // what it was doing in adding one.
1705  if (ST->hasMVEIntegerOps())
1706  return 0;
1707  break;
1708  case Intrinsic::sadd_sat:
1709  case Intrinsic::ssub_sat:
1710  case Intrinsic::uadd_sat:
1711  case Intrinsic::usub_sat: {
1712  if (!ST->hasMVEIntegerOps())
1713  break;
1714  Type *VT = ICA.getReturnType();
1715 
1716  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1717  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1718  LT.second == MVT::v16i8) {
1719  // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1720  // need to extend the type, as it uses shr(qadd(shl, shl)).
1721  unsigned Instrs =
1722  LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1723  return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1724  }
1725  break;
1726  }
1727  case Intrinsic::abs:
1728  case Intrinsic::smin:
1729  case Intrinsic::smax:
1730  case Intrinsic::umin:
1731  case Intrinsic::umax: {
1732  if (!ST->hasMVEIntegerOps())
1733  break;
1734  Type *VT = ICA.getReturnType();
1735 
1736  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1737  if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1738  LT.second == MVT::v16i8)
1739  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1740  break;
1741  }
1742  case Intrinsic::minnum:
1743  case Intrinsic::maxnum: {
1744  if (!ST->hasMVEFloatOps())
1745  break;
1746  Type *VT = ICA.getReturnType();
1747  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1748  if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1749  return LT.first * ST->getMVEVectorCostFactor(CostKind);
1750  break;
1751  }
1752  }
1753 
1755 }
1756 
1758  if (!F->isIntrinsic())
1760 
1761  // Assume all Arm-specific intrinsics map to an instruction.
1762  if (F->getName().startswith("llvm.arm"))
1763  return false;
1764 
1765  switch (F->getIntrinsicID()) {
1766  default: break;
1767  case Intrinsic::powi:
1768  case Intrinsic::sin:
1769  case Intrinsic::cos:
1770  case Intrinsic::pow:
1771  case Intrinsic::log:
1772  case Intrinsic::log10:
1773  case Intrinsic::log2:
1774  case Intrinsic::exp:
1775  case Intrinsic::exp2:
1776  return true;
1777  case Intrinsic::sqrt:
1778  case Intrinsic::fabs:
1779  case Intrinsic::copysign:
1780  case Intrinsic::floor:
1781  case Intrinsic::ceil:
1782  case Intrinsic::trunc:
1783  case Intrinsic::rint:
1784  case Intrinsic::nearbyint:
1785  case Intrinsic::round:
1786  case Intrinsic::canonicalize:
1787  case Intrinsic::lround:
1788  case Intrinsic::llround:
1789  case Intrinsic::lrint:
1790  case Intrinsic::llrint:
1791  if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1792  return true;
1793  if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1794  return true;
1795  // Some operations can be handled by vector instructions and assume
1796  // unsupported vectors will be expanded into supported scalar ones.
1797  // TODO Handle scalar operations properly.
1798  return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1799  case Intrinsic::masked_store:
1800  case Intrinsic::masked_load:
1801  case Intrinsic::masked_gather:
1802  case Intrinsic::masked_scatter:
1803  return !ST->hasMVEIntegerOps();
1804  case Intrinsic::sadd_with_overflow:
1805  case Intrinsic::uadd_with_overflow:
1806  case Intrinsic::ssub_with_overflow:
1807  case Intrinsic::usub_with_overflow:
1808  case Intrinsic::sadd_sat:
1809  case Intrinsic::uadd_sat:
1810  case Intrinsic::ssub_sat:
1811  case Intrinsic::usub_sat:
1812  return false;
1813  }
1814 
1815  return BaseT::isLoweredToCall(F);
1816 }
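
A minimal sketch (not part of the upstream file) of the floating-point checks used for the sqrt/fabs/floor group above; the Has* parameters stand for the corresponding ARMSubtarget queries.

// Sketch only: doubles need FP64, halves need full FP16, and targets with no
// FP unit at all (neither FPARMv8 base nor VFP2 base) call a library routine.
static bool fpIntrinsicIsLoweredToCall(bool IsDouble, bool IsHalf,
                                       bool HasFP64, bool HasFullFP16,
                                       bool HasFPARMv8Base, bool HasVFP2Base) {
  if (IsDouble && !HasFP64)
    return true;
  if (IsHalf && !HasFullFP16)
    return true;
  return !HasFPARMv8Base && !HasVFP2Base;
}
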
1817 
1818 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1819  unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1820  EVT VT = TLI->getValueType(DL, I.getType(), true);
1821  if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1822  return true;
1823 
1824  // Check if an intrinsic will be lowered to a call and assume that any
1825  // other CallInst will generate a bl.
1826  if (auto *Call = dyn_cast<CallInst>(&I)) {
1827  if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1828  switch(II->getIntrinsicID()) {
1829  case Intrinsic::memcpy:
1830  case Intrinsic::memset:
1831  case Intrinsic::memmove:
1832  return getNumMemOps(II) == -1;
1833  default:
1834  if (const Function *F = Call->getCalledFunction())
1835  return isLoweredToCall(F);
1836  }
1837  }
1838  return true;
1839  }
1840 
1841  // FPv5 provides conversions between integer, double-precision,
1842  // single-precision, and half-precision formats.
1843  switch (I.getOpcode()) {
1844  default:
1845  break;
1846  case Instruction::FPToSI:
1847  case Instruction::FPToUI:
1848  case Instruction::SIToFP:
1849  case Instruction::UIToFP:
1850  case Instruction::FPTrunc:
1851  case Instruction::FPExt:
1852  return !ST->hasFPARMv8Base();
1853  }
1854 
1855  // FIXME: Unfortunately the approach of checking the Operation Action does
1856  // not catch all cases of Legalization that use library calls. Our
1857  // Legalization step categorizes some transformations into library calls as
1858  // Custom, Expand or even Legal when doing type legalization. So for now
1859  // we have to special case for instance the SDIV of 64bit integers and the
1860  // use of floating point emulation.
1861  if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1862  switch (ISD) {
1863  default:
1864  break;
1865  case ISD::SDIV:
1866  case ISD::UDIV:
1867  case ISD::SREM:
1868  case ISD::UREM:
1869  case ISD::SDIVREM:
1870  case ISD::UDIVREM:
1871  return true;
1872  }
1873  }
1874 
1875  // Assume all other non-float operations are supported.
1876  if (!VT.isFloatingPoint())
1877  return false;
1878 
1879  // We'll need a library call to handle most floats when using soft.
1880  if (TLI->useSoftFloat()) {
1881  switch (I.getOpcode()) {
1882  default:
1883  return true;
1884  case Instruction::Alloca:
1885  case Instruction::Load:
1886  case Instruction::Store:
1887  case Instruction::Select:
1888  case Instruction::PHI:
1889  return false;
1890  }
1891  }
1892 
1893  // We'll need a libcall to perform double precision operations on a single
1894  // precision only FPU.
1895  if (I.getType()->isDoubleTy() && !ST->hasFP64())
1896  return true;
1897 
1898  // Likewise for half precision arithmetic.
1899  if (I.getType()->isHalfTy() && !ST->hasFullFP16())
1900  return true;
1901 
1902  return false;
1903 }
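
A minimal sketch (not part of the upstream file) of the wide-division special case above: 64-bit (or wider) sdiv/udiv/srem/urem on Arm is expanded to an EABI runtime call such as __aeabi_ldivmod or __aeabi_uldivmod, so it has to be treated as a call here.

// Sketch only: the SDIV/UDIV/SREM/UREM/SDIVREM/UDIVREM case above.
static bool wideIntDivisionIsLoweredToCall(unsigned Bits, bool IsDivOrRem) {
  return IsDivOrRem && Bits >= 64;
}
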
1904 
1905 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1906  AssumptionCache &AC,
1907  TargetLibraryInfo *LibInfo,
1908  HardwareLoopInfo &HWLoopInfo) {
1909  // Low-overhead branches are only supported in the 'low-overhead branch'
1910  // extension of v8.1-m.
1911  if (!ST->hasLOB() || DisableLowOverheadLoops) {
1912  LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1913  return false;
1914  }
1915 
1916  if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1917  LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1918  return false;
1919  }
1920 
1921  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1922  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1923  LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1924  return false;
1925  }
1926 
1927  const SCEV *TripCountSCEV =
1928  SE.getAddExpr(BackedgeTakenCount,
1929  SE.getOne(BackedgeTakenCount->getType()));
1930 
1931  // We need to store the trip count in LR, a 32-bit register.
1932  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1933  LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1934  return false;
1935  }
1936 
1937  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1938  // point in generating a hardware loop if that's going to happen.
1939 
1940  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
1941  if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
1942  switch (Call->getIntrinsicID()) {
1943  default:
1944  break;
1945  case Intrinsic::start_loop_iterations:
1946  case Intrinsic::test_start_loop_iterations:
1947  case Intrinsic::loop_decrement:
1948  case Intrinsic::loop_decrement_reg:
1949  return true;
1950  }
1951  }
1952  return false;
1953  };
1954 
1955  // Scan the instructions to see if there's any that we know will turn into a
1956  // call or if this loop is already a low-overhead loop or will become a tail
1957  // predicated loop.
1958  bool IsTailPredLoop = false;
1959  auto ScanLoop = [&](Loop *L) {
1960  for (auto *BB : L->getBlocks()) {
1961  for (auto &I : *BB) {
1962  if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1963  isa<InlineAsm>(I)) {
1964  LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
1965  return false;
1966  }
1967  if (auto *II = dyn_cast<IntrinsicInst>(&I))
1968  IsTailPredLoop |=
1969  II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1970  II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1971  II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1972  II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1973  II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
1974  }
1975  }
1976  return true;
1977  };
1978 
1979  // Visit inner loops.
1980  for (auto Inner : *L)
1981  if (!ScanLoop(Inner))
1982  return false;
1983 
1984  if (!ScanLoop(L))
1985  return false;
1986 
1987  // TODO: Check whether the trip count calculation is expensive. If L is the
1988  // inner loop but we know it has a low trip count, calculating that trip
1989  // count (in the parent loop) may be detrimental.
1990 
1991  LLVMContext &C = L->getHeader()->getContext();
1992  HWLoopInfo.CounterInReg = true;
1993  HWLoopInfo.IsNestingLegal = false;
1994  HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
1995  HWLoopInfo.CountType = Type::getInt32Ty(C);
1996  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
1997  return true;
1998 }
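
A minimal sketch (not part of the upstream file) of the two scalar decisions taken at the end of the query above; TripCountTypeBits stands for the bit width of the trip count SCEV's type, and the flags mirror AllowWLSLoops and IsTailPredLoop.

// Sketch only: the trip count lives in LR, a 32-bit register, and a WLS-style
// entry test is only requested when the loop will not be tail-predicated.
static bool tripCountFitsInLR(unsigned TripCountTypeBits) {
  return TripCountTypeBits <= 32;
}
static bool shouldPerformEntryTest(bool AllowWLS, bool IsTailPredLoop) {
  return AllowWLS && !IsTailPredLoop; // becomes HWLoopInfo.PerformEntryTest
}
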
1999 
2000 static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2001  // We don't allow icmp's, and because we only look at single block loops,
2002  // we simply count the icmps, i.e. there should only be 1 for the backedge.
2003  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2004  return false;
2005  // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2006  // not currently canonical, but soon will be. Code without them uses icmp, and
2007  // so is not tail predicated as per the condition above. In order to get the
2008  // same performance we treat min and max the same as an icmp for tailpred
2009  // purposes for the moment (we often rely on non-tailpred and higher VF's to
2010  // pick more optimal instructions like VQDMULH. They need to be recognized
2011  // directly by the vectorizer).
2012  if (auto *II = dyn_cast<IntrinsicInst>(&I))
2013  if ((II->getIntrinsicID() == Intrinsic::smin ||
2014  II->getIntrinsicID() == Intrinsic::smax ||
2015  II->getIntrinsicID() == Intrinsic::umin ||
2016  II->getIntrinsicID() == Intrinsic::umax) &&
2017  ++ICmpCount > 1)
2018  return false;
2019 
2020  if (isa<FCmpInst>(&I))
2021  return false;
2022 
2023  // We could allow extending/narrowing FP loads/stores, but codegen is
2024  // too inefficient so reject this for now.
2025  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2026  return false;
2027 
2028  // Extends have to be extending-loads
2029  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I))
2030  if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2031  return false;
2032 
2033  // Truncs have to be narrowing-stores
2034  if (isa<TruncInst>(&I))
2035  if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2036  return false;
2037 
2038  return true;
2039 }
2040 
2041 // To set up a tail-predicated loop, we need to know the total number of
2042 // elements processed by that loop. Thus, we need to determine the element
2043 // size and:
2044 // 1) it should be uniform for all operations in the vector loop, so we
2045 // e.g. don't want any widening/narrowing operations.
2046 // 2) it should be smaller than i64s because we don't have vector operations
2047 // that work on i64s.
2048 // 3) we don't want elements to be reversed or shuffled, to make sure the
2049 // tail-predication masks/predicates the right lanes.
2050 //
2051 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2052  const DataLayout &DL,
2053  const LoopAccessInfo *LAI) {
2054  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2055 
2056  // If there are live-out values, it is probably a reduction. We can predicate
2057  // most reduction operations freely under MVE using a combination of
2058  // prefer-predicated-reduction-select and inloop reductions. We limit this to
2059  // floating point and integer reductions, but don't check for operators
2060  // specifically here. If the value ends up not being a reduction (and so the
2061  // vectorizer cannot tailfold the loop), we should fall back to standard
2062  // vectorization automatically.
2063  SmallVector<Instruction *, 8> LiveOuts =
2064  llvm::findDefsUsedOutsideOfLoop(L);
2065  bool ReductionsDisabled =
2066  EnableTailPredication == TailPredication::EnabledNoReductions ||
2067  EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2068 
2069  for (auto *I : LiveOuts) {
2070  if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2071  !I->getType()->isHalfTy()) {
2072  LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2073  "live-out value\n");
2074  return false;
2075  }
2076  if (ReductionsDisabled) {
2077  LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2078  return false;
2079  }
2080  }
2081 
2082  // Next, check that all instructions can be tail-predicated.
2083  PredicatedScalarEvolution PSE = LAI->getPSE();
2084  SmallVector<Instruction *, 16> LoadStores;
2085  int ICmpCount = 0;
2086 
2087  for (BasicBlock *BB : L->blocks()) {
2088  for (Instruction &I : BB->instructionsWithoutDebug()) {
2089  if (isa<PHINode>(&I))
2090  continue;
2091  if (!canTailPredicateInstruction(I, ICmpCount)) {
2092  LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2093  return false;
2094  }
2095 
2096  Type *T = I.getType();
2097  if (T->isPointerTy())
2098  T = T->getPointerElementType();
2099 
2100  if (T->getScalarSizeInBits() > 32) {
2101  LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2102  return false;
2103  }
2104  if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2105  Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
2106  Type *AccessTy = getLoadStoreType(&I);
2107  int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
2108  if (NextStride == 1) {
2109  // TODO: for now only allow consecutive strides of 1. We could support
2110  // other strides as long as it is uniform, but let's keep it simple
2111  // for now.
2112  continue;
2113  } else if (NextStride == -1 ||
2114  (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2115  (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2116  LLVM_DEBUG(dbgs()
2117  << "Consecutive strides of 2 found, vld2/vstr2 can't "
2118  "be tail-predicated\n.");
2119  return false;
2120  // TODO: don't tail predicate if there is a reversed load?
2121  } else if (EnableMaskedGatherScatters) {
2122  // Gather/scatters do allow loading from arbitrary strides, at
2123  // least if they are loop invariant.
2124  // TODO: Loop variant strides should in theory work, too, but
2125  // this requires further testing.
2126  const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2127  if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2128  const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2129  if (PSE.getSE()->isLoopInvariant(Step, L))
2130  continue;
2131  }
2132  }
2133  LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2134  "tail-predicate\n.");
2135  return false;
2136  }
2137  }
2138  }
2139 
2140  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2141  return true;
2142 }
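
A minimal sketch (not part of the upstream file) of the per-access stride rules applied above; MaxInterleave stands for MVEMaxSupportedInterleaveFactor, Gathers for EnableMaskedGatherScatters and InvariantStride for the SCEV step check.

#include <cstdint>

// Sketch only: stride 1 is always fine, reversed and vld2/vld4-style strides
// defeat tail-predication, and anything else needs a gather/scatter with a
// loop-invariant stride.
static bool strideOkForTailPredication(int64_t Stride, unsigned MaxInterleave,
                                       bool Gathers, bool InvariantStride) {
  if (Stride == 1)
    return true;
  if (Stride == -1 || (Stride == 2 && MaxInterleave >= 2) ||
      (Stride == 4 && MaxInterleave >= 4))
    return false;
  return Gathers && InvariantStride;
}
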
2143 
2144 bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2145  ScalarEvolution &SE,
2146  AssumptionCache &AC,
2147  TargetLibraryInfo *TLI,
2148  DominatorTree *DT,
2149  const LoopAccessInfo *LAI) {
2150  if (!EnableTailPredication) {
2151  LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2152  return false;
2153  }
2154 
2155  // Creating a predicated vector loop is the first step for generating a
2156  // tail-predicated hardware loop, for which we need the MVE masked
2157  // load/stores instructions:
2158  if (!ST->hasMVEIntegerOps())
2159  return false;
2160 
2161  // For now, restrict this to single block loops.
2162  if (L->getNumBlocks() > 1) {
2163  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2164  "loop.\n");
2165  return false;
2166  }
2167 
2168  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2169 
2170  HardwareLoopInfo HWLoopInfo(L);
2171  if (!HWLoopInfo.canAnalyze(*LI)) {
2172  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2173  "analyzable.\n");
2174  return false;
2175  }
2176 
2177  // This checks if we have the low-overhead branch architecture
2178  // extension, and if we will create a hardware-loop:
2179  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2180  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2181  "profitable.\n");
2182  return false;
2183  }
2184 
2185  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2186  LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2187  "a candidate.\n");
2188  return false;
2189  }
2190 
2191  return canTailPredicateLoop(L, LI, SE, DL, LAI);
2192 }
2193 
2194 bool ARMTTIImpl::emitGetActiveLaneMask() const {
2195  if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2196  return false;
2197 
2198  // Intrinsic @llvm.get.active.lane.mask is supported.
2199  // It is used in the MVETailPredication pass, which requires the number of
2200  // elements processed by this vector loop to setup the tail-predicated
2201  // loop.
2202  return true;
2203 }
2204 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2205  TTI::UnrollingPreferences &UP,
2206  OptimizationRemarkEmitter *ORE) {
2207  // Enable Upper bound unrolling universally, not dependent upon the conditions
2208  // below.
2209  UP.UpperBound = true;
2210 
2211  // Only currently enable these preferences for M-Class cores.
2212  if (!ST->isMClass())
2213  return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2214 
2215  // Disable loop unrolling for Oz and Os.
2216  UP.OptSizeThreshold = 0;
2217  UP.PartialOptSizeThreshold = 0;
2218  if (L->getHeader()->getParent()->hasOptSize())
2219  return;
2220 
2221  SmallVector<BasicBlock*, 4> ExitingBlocks;
2222  L->getExitingBlocks(ExitingBlocks);
2223  LLVM_DEBUG(dbgs() << "Loop has:\n"
2224  << "Blocks: " << L->getNumBlocks() << "\n"
2225  << "Exit blocks: " << ExitingBlocks.size() << "\n");
2226 
2227  // Only allow one exit other than the latch. This acts as an early exit
2228  // as it mirrors the profitability calculation of the runtime unroller.
2229  if (ExitingBlocks.size() > 2)
2230  return;
2231 
2232  // Limit the CFG of the loop body for targets with a branch predictor.
2233  // Allowing 4 blocks permits if-then-else diamonds in the body.
2234  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2235  return;
2236 
2237  // Don't unroll vectorized loops, including the remainder loop
2238  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2239  return;
2240 
2241  // Scan the loop: don't unroll loops with calls as this could prevent
2242  // inlining.
2243  InstructionCost Cost = 0;
2244  for (auto *BB : L->getBlocks()) {
2245  for (auto &I : *BB) {
2246  // Don't unroll vectorised loop. MVE does not benefit from it as much as
2247  // scalar code.
2248  if (I.getType()->isVectorTy())
2249  return;
2250 
2251  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2252  if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2253  if (!isLoweredToCall(F))
2254  continue;
2255  }
2256  return;
2257  }
2258 
2259  SmallVector<const Value*, 4> Operands(I.operand_values());
2260  Cost +=
2261  getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
2262  }
2263  }
2264 
2265  // On v6m cores, there are very few registers available. We can easily end up
2266  // spilling and reloading more registers in an unrolled loop. Look at the
2267  // number of LCSSA phis as a rough measure of how many registers will need to
2268  // be live out of the loop, reducing the default unroll count if more than 1
2269  // value is needed. In the long run, all of this should be being learnt by a
2270  // machine.
2271  unsigned UnrollCount = 4;
2272  if (ST->isThumb1Only()) {
2273  unsigned ExitingValues = 0;
2274  SmallVector<BasicBlock *, 4> ExitBlocks;
2275  L->getExitBlocks(ExitBlocks);
2276  for (auto *Exit : ExitBlocks) {
2277  // Count the number of LCSSA phis. Exclude values coming from GEP's as
2278  // only the last is expected to be needed for address operands.
2279  unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2280  return PH.getNumOperands() != 1 ||
2281  !isa<GetElementPtrInst>(PH.getOperand(0));
2282  });
2283  ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2284  }
2285  if (ExitingValues)
2286  UnrollCount /= ExitingValues;
2287  if (UnrollCount <= 1)
2288  return;
2289  }
2290 
2291  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2292  LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2293 
2294  UP.Partial = true;
2295  UP.Runtime = true;
2296  UP.UnrollRemainder = true;
2297  UP.DefaultUnrollRuntimeCount = UnrollCount;
2298  UP.UnrollAndJam = true;
2299  UP.UnrollAndJamInnerLoopThreshold = 60;
2300 
2301  // Forcing the unrolling of small loops can be very useful because of the
2302  // branch-taken cost of the backedge.
2303  if (Cost < 12)
2304  UP.Force = true;
2305 }
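
A minimal sketch (not part of the upstream file) of the Thumb1-only unroll count heuristic above; ExitingValues is the largest count of non-GEP LCSSA phis over the exit blocks, used as a rough proxy for values live out of the loop.

// Sketch only: start from the default runtime unroll count of 4 and shrink it
// as more values stay live across the loop exit; a result <= 1 disables
// unrolling entirely.
static unsigned thumb1RuntimeUnrollCount(unsigned ExitingValues) {
  unsigned Count = 4;
  if (ExitingValues)
    Count /= ExitingValues;
  return Count;
}
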
2306 
2307 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2308  TTI::PeelingPreferences &PP) {
2309  BaseT::getPeelingPreferences(L, SE, PP);
2310 }
2311 
2312 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2313  TTI::ReductionFlags Flags) const {
2314  if (!ST->hasMVEIntegerOps())
2315  return false;
2316 
2317  unsigned ScalarBits = Ty->getScalarSizeInBits();
2318  switch (Opcode) {
2319  case Instruction::Add:
2320  return ScalarBits <= 64;
2321  default:
2322  return false;
2323  }
2324 }
2325 
2326 bool ARMTTIImpl::preferPredicatedReductionSelect(
2327  unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2328  if (!ST->hasMVEIntegerOps())
2329  return false;
2330  return true;
2331 }
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:240
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
ARMSubtarget.h
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::TargetLoweringBase::getMaxStoresPerMemmove
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
Definition: TargetLowering.h:1628
llvm::TargetTransformInfo::CastContextHint::Masked
@ Masked
The cast is used with a masked load/store.
ValueTypes.h
llvm::TargetTransformInfo::UnrollingPreferences::PartialOptSizeThreshold
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
Definition: TargetTransformInfo.h:457
llvm::ScalarEvolution::hasLoopInvariantBackedgeTakenCount
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
Definition: ScalarEvolution.cpp:12351
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition: TargetTransformInfo.h:862
llvm::BasicTTIImplBase< ARMTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:38
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:485
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:211
llvm::SPF_SMAX
@ SPF_SMAX
Unsigned minimum.
Definition: ValueTracking.h:662
llvm::ISD::VECTOR_SHUFFLE
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:563
llvm::ARM_AM::isThumbImmShiftedVal
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
Definition: ARMAddressingModes.h:235
llvm::MVT::v4f16
@ v4f16
Definition: MachineValueType.h:136
llvm::TargetTransformInfo::TCC_Expensive
@ TCC_Expensive
The cost of a 'div' instruction on x86.
Definition: TargetTransformInfo.h:264
llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:184
llvm
---------------------— PointerInfo ------------------------------------—
Definition: AllocatorList.h:23
llvm::TargetTransformInfo::ReductionFlags
Flags describing the kind of vector reduction.
Definition: TargetTransformInfo.h:1338
llvm::ConvertCostTableLookup
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType >> Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
llvm::CostTblEntryT
Cost Table Entry.
Definition: CostTable.h:25
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::LoopBase::getExitBlocks
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
Definition: LoopInfoImpl.h:62
llvm::ARMSubtarget::hasMVEFloatOps
bool hasMVEFloatOps() const
Definition: ARMSubtarget.h:628
llvm::HardwareLoopInfo::LoopDecrement
Value * LoopDecrement
Definition: TargetTransformInfo.h:103
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:112
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:633
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:435
InstCombiner.h
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:720
llvm::InstCombiner::getDominatorTree
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:368
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
IntrinsicInst.h
ceil
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g ceil
Definition: README-FPStack.txt:54
DisableLowOverheadLoops
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:426
llvm::Function
Definition: Function.h:61
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
llvm::ISD::UDIV
@ UDIV
Definition: ISDOpcodes.h:243
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
llvm::SelectPatternResult::Flavor
SelectPatternFlavor Flavor
Definition: ValueTracking.h:682
llvm::TargetTransformInfoImplCRTPBase< ARMTTIImpl >::getUserCost
InstructionCost getUserCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
Definition: TargetTransformInfoImpl.h:929
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:659
llvm::BasicTTIImplBase< ARMTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1055
llvm::TargetTransformInfo::AMK_PostIndexed
@ AMK_PostIndexed
Definition: TargetTransformInfo.h:641
llvm::ARMTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: ARMTargetTransformInfo.cpp:1188
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:691
MVEMaxSupportedInterleaveFactor
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor
llvm::ARMTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: ARMTargetTransformInfo.cpp:85
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:319
llvm::PredicatedScalarEvolution
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
Definition: ScalarEvolution.h:2097
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:133
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::InstCombiner::Builder
BuilderTy & Builder
Definition: InstCombiner.h:56
llvm::ARMSubtarget::hasFPARMv8Base
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:655
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1467
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:750
llvm::CmpInst::makeCmpResultType
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1031
llvm::IRBuilder< TargetFolder, IRBuilderCallbackInserter >
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:150
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:461
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJamInnerLoopThreshold
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
Definition: TargetTransformInfo.h:504
Local.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
llvm::TargetTransformInfo::UnrollingPreferences::UnrollRemainder
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
Definition: TargetTransformInfo.h:497
llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:785
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:189
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:214
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
llvm::ARMSubtarget::hasV6T2Ops
bool hasV6T2Ops() const
Definition: ARMSubtarget.h:614
EnableMaskedLoadStores
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
llvm::MemOp
Definition: TargetLowering.h:112
APInt.h
llvm::ARMTTIImpl::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1538
llvm::getLoadStoreType
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Definition: Instructions.h:5346
llvm::HardwareLoopInfo::isHardwareLoopCandidate
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
Definition: TargetTransformInfo.cpp:100
llvm::ARMTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:429
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:481
llvm::findDefsUsedOutsideOfLoop
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:131
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::getPtrStride
int64_t getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
Definition: LoopAccessAnalysis.cpp:1029
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1403
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:535
llvm::tgtok::Bits
@ Bits
Definition: TGLexer.h:50
llvm::Instruction::isShift
bool isShift() const
Definition: Instruction.h:167
llvm::BasicTTIImplBase< ARMTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:833
llvm::SPF_UMAX
@ SPF_UMAX
Signed maximum.
Definition: ValueTracking.h:663
llvm::Optional
Definition: APInt.h:33
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::ARMSubtarget::hasMVEIntegerOps
bool hasMVEIntegerOps() const
Definition: ARMSubtarget.h:627
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:419
llvm::TargetLoweringBase::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: TargetLoweringBase.cpp:1842
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:398
llvm::CallBase::getNumArgOperands
unsigned getNumArgOperands() const
Definition: InstrTypes.h:1336
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::matchSelectPattern
SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
Definition: ValueTracking.cpp:6196
llvm::SelectPatternFlavor
SelectPatternFlavor
Specific patterns of select instructions we can match.
Definition: ValueTracking.h:658
llvm::MVT::v2f64
@ v2f64
Definition: MachineValueType.h:172
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:525
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::count_if
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1641
llvm::TargetLoweringBase::getOperationAction
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
Definition: TargetLowering.h:1047
llvm::ARMTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: ARMTargetTransformInfo.cpp:2204
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:203
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:185
llvm::APIntOps::umin
const APInt & umin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be unsigned.
Definition: APInt.h:2121
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1208
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:860
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::ARMTTIImpl::isLegalMaskedLoad
bool isLegalMaskedLoad(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1068
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1336
llvm::ARMTTIImpl::isHardwareLoopProfitable
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: ARMTargetTransformInfo.cpp:1905
KnownBits.h
llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(Optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1160
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::HardwareLoopInfo::IsNestingLegal
bool IsNestingLegal
Definition: TargetTransformInfo.h:105
floor
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g floor
Definition: README-FPStack.txt:54
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:130
MachineValueType.h
UnrollCount
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:206
Instruction.h
llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition: DerivedTypes.h:568
llvm::ARMSubtarget::hasLOB
bool hasLOB() const
Definition: ARMSubtarget.h:663
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::InstCombiner::replaceOperand
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:436
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1335
llvm::MVT::i1
@ i1
Definition: MachineValueType.h:43
llvm::APInt::isNonNegative
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:319
llvm::MVT::v8f16
@ v8f16
Definition: MachineValueType.h:137
llvm::APInt::isNegative
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:314
llvm::BasicTTIImplBase::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:487
llvm::ARM_AM::getSOImmVal
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
Definition: ARMAddressingModes.h:163
SubtargetFeature.h
TargetMachine.h
llvm::TypeConversionCostTblEntryT
Type Conversion Cost Table.
Definition: CostTable.h:55
llvm::ScalarEvolution::getOne
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
Definition: ScalarEvolution.h:629
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:679
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:729
EnableTailPredication
cl::opt< TailPredication::Mode > EnableTailPredication
llvm::BasicTTIImplBase< ARMTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:1108
llvm::ARMTTIImpl::getIntImmCostInst
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: ARMTargetTransformInfo.cpp:369
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:859
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1062
llvm::User
Definition: User.h:44
llvm::TargetLoweringBase::getMaxStoresPerMemset
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
Definition: TargetLowering.h:1589
llvm::BasicTTIImplBase< ARMTTIImpl >::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
Definition: BasicTTIImpl.h:2007
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
llvm::getKnownAlignment
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:224
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetTransformInfo::UnrollingPreferences::Force
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
Definition: TargetTransformInfo.h:493
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:56
llvm::ARMSubtarget::hasBranchPredictor
bool hasBranchPredictor() const
Definition: ARMSubtarget.h:717
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
llvm::EVT::getVectorNumElements
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:309
llvm::BasicTTIImplBase< ARMTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2119
AllowWLSLoops
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:735
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
llvm::APInt::getLimitedValue
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition: APInt.h:449
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:237
llvm::ARMTTIImpl::getIntImmCodeSizeCost
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty)
Definition: ARMTargetTransformInfo.cpp:328
llvm::MVT::v16i1
@ v16i1
Definition: MachineValueType.h:68
llvm::ISD::UDIVREM
@ UDIVREM
Definition: ISDOpcodes.h:256
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
Definition: PatternMatch.h:2228
llvm::ARMTargetLowering::useSoftFloat
bool useSoftFloat() const override
Definition: ARMISelLowering.cpp:1579
llvm::MVT::v8i1
@ v8i1
Definition: MachineValueType.h:67
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJam
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
Definition: TargetTransformInfo.h:499
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:145
llvm::EVT::isInteger
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:145
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:153
llvm::HardwareLoopInfo::PerformEntryTest
bool PerformEntryTest
Definition: TargetTransformInfo.h:109
canTailPredicateLoop
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2051
llvm::ARMTTIImpl::isLoweredToCall
bool isLoweredToCall(const Function *F)
Definition: ARMTargetTransformInfo.cpp:1757
llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:739
llvm::InstCombiner::eraseInstFromFunction
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1453
llvm::APInt::getHighBitsSet
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:281
llvm::LoopBase::getExitingBlocks
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
Definition: LoopInfoImpl.h:34
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:900
LoopUtils.h
llvm::HardwareLoopInfo::CounterInReg
bool CounterInReg
Definition: TargetTransformInfo.h:107
llvm::ARMSubtarget::hasV6Ops
bool hasV6Ops() const
Definition: ARMSubtarget.h:611
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:632
llvm::ARMTTIImpl::emitGetActiveLaneMask
bool emitGetActiveLaneMask() const
Definition: ARMTargetTransformInfo.cpp:2194
PatternMatch.h
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:648
llvm::ARMSubtarget::hasFP64
bool hasFP64() const
Definition: ARMSubtarget.h:690
llvm::MVT::v1i64
@ v1i64
Definition: MachineValueType.h:117
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
llvm::isVREVMask
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
Definition: ARMTargetTransformInfo.h:309
llvm::ARM_AM::getT2SOImmVal
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
Definition: ARMAddressingModes.h:320
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
llvm::BasicTTIImplBase< ARMTTIImpl >::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1060
llvm::BasicTTIImplBase< ARMTTIImpl >::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:890
llvm::None
const NoneType None
Definition: None.h:23
llvm::ARMTTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: ARMTargetTransformInfo.cpp:1497
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:91
llvm::lltok::Kind
Kind
Definition: LLToken.h:18
llvm::ARMSubtarget::getMVEVectorCostFactor
unsigned getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const
Definition: ARMSubtarget.h:930
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
llvm::MVT::v4i8
@ v4i8
Definition: MachineValueType.h:78
llvm::SPF_SMIN
@ SPF_SMIN
Definition: ValueTracking.h:660
Type.h
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:118
llvm::APInt::isAllOnesValue
bool isAllOnesValue() const
NOTE: This is soft-deprecated. Please use isAllOnes() instead.
Definition: APInt.h:360
llvm::Instruction::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:282
llvm::maxnum
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1309
LoopInfo.h
llvm::ARMTTIImpl::getNumMemOps
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
Definition: ARMTargetTransformInfo.cpp:1112
llvm::ARMTTIImpl::isProfitableLSRChainElement
bool isProfitableLSRChainElement(Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1051
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:78
llvm::ARMTTIImpl::preferPredicateOverEpilogue
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI)
Definition: ARMTargetTransformInfo.cpp:2144
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool LookThroughBitCast, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:118
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4066
llvm::PatternMatch::m_Xor
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1135
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:201
llvm::APInt::getOneBitSet
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:224
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:861
llvm::MVT::v2i8
@ v2i8
Definition: MachineValueType.h:77
llvm::MVT::v4i64
@ v4i64
Definition: MachineValueType.h:120
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::ARMTTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:1433
llvm::TargetTransformInfo::CastContextHint::Normal
@ Normal
The cast is used with a normal load/store.
BasicBlock.h
llvm::cl::opt< bool >
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:78
llvm::ARMTTIImpl::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1663
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:535
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::MVT::v16i8
@ v16i8
Definition: MachineValueType.h:80
llvm::CostTableLookup
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType >> Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:341
llvm::ARMSubtarget::isThumb1Only
bool isThumb1Only() const
Definition: ARMSubtarget.h:814
llvm::ScalarEvolution::getUnsignedRangeMax
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
Definition: ScalarEvolution.h:927
llvm::ARMTTIImpl::getMemcpyCost
InstructionCost getMemcpyCost(const Instruction *I)
Definition: ARMTargetTransformInfo.cpp:1178
llvm::InstCombiner::getAssumptionCache
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:366
llvm::MVT::v16i16
@ v16i16
Definition: MachineValueType.h:93
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
llvm::MVT::v2i64
@ v2i64
Definition: MachineValueType.h:118
uint64_t
llvm::InstCombiner::getDataLayout
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:369
llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition: ISDOpcodes.h:786
llvm::ARMTTIImpl::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr)
Definition: ARMTargetTransformInfo.cpp:1029
llvm::MVT::v16f32
@ v16f32
Definition: MachineValueType.h:162
llvm::Instruction::user_back
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:91
llvm::TruncInst
This class represents a truncation of integer types.
Definition: Instructions.h:4755
llvm::TargetTransformInfo::OK_UniformConstantValue
@ OK_UniformConstantValue
Definition: TargetTransformInfo.h:881
llvm::ARMSubtarget::isMClass
bool isMClass() const
Definition: ARMSubtarget.h:817
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:428
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:886
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
llvm::ARMTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1694
llvm::LoopAccessInfo
Drive the analysis of memory accesses in the loop.
Definition: LoopAccessAnalysis.h:515
llvm::ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic
Optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
Definition: ARMTargetTransformInfo.cpp:251
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:141
llvm::MVT::v4f32
@ v4f32
Definition: MachineValueType.h:157
llvm::MVT::i8
@ i8
Definition: MachineValueType.h:44
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:79
llvm::MVT::Other
@ Other
Definition: MachineValueType.h:42
llvm::ARMSubtarget::hasSlowLoadDSubregister
bool hasSlowLoadDSubregister() const
Definition: ARMSubtarget.h:704
llvm::SPF_ABS
@ SPF_ABS
Floating point maxnum.
Definition: ValueTracking.h:666
memcpy
<%struct.s * > cast struct s *S to sbyte *< sbyte * > sbyte uint cast struct s *agg result to sbyte *< sbyte * > sbyte uint cast struct s *memtmp to sbyte *< sbyte * > sbyte uint ret void llc ends up issuing two memcpy or custom lower memcpy(of small size) to be ldmia/stmia. I think option 2 is better but the current register allocator cannot allocate a chunk of registers at a time. A feasible temporary solution is to use specific physical registers at the lowering time for small(<
llvm::TargetLoweringBase::getMaxStoresPerMemcpy
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
Definition: TargetLowering.h:1599
llvm::TargetTransformInfoImplBase::isLoweredToCall
bool isLoweredToCall(const Function *F) const
Definition: TargetTransformInfoImpl.h:119
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::ARMTTIImpl::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1480
llvm::ARMSubtarget::hasFullFP16
bool hasFullFP16() const
Definition: ARMSubtarget.h:734
llvm::ARMTTIImpl::preferInLoopReduction
bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2312
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:83
llvm::IRBuilderBase::CreateIntrinsic
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with args, mangled using Types.
Definition: IRBuilder.cpp:840
llvm::BasicTTIImplBase< ARMTTIImpl >::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: BasicTTIImpl.h:1176
llvm::IRBuilderBase::getTrue
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:458
llvm::PatternMatch::m_Constant
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:142
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:878
isSSATMinMaxPattern
static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
Definition: ARMTargetTransformInfo.cpp:338
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
llvm::ARMTTIImpl::getPreferredAddressingMode
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
Definition: ARMTargetTransformInfo.cpp:104
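As a hedged usage sketch (the helper name preferPostIndexed is an assumption, not part of LLVM), a loop transformation with TTI and ScalarEvolution available would consult the generic wrapper like so:
  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  using namespace llvm;
  // Returns true when the target asks for post-indexed addressing in L.
  static bool preferPostIndexed(const Loop *L, const TargetTransformInfo &TTI,
                                ScalarEvolution &SE) {
    return TTI.getPreferredAddressingMode(L, &SE) ==
           TargetTransformInfo::AMK_PostIndexed;
  }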
llvm::APIntOps::smin
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2111
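A small worked example of the signed/unsigned APInt orderings (values chosen purely for illustration):
  #include "llvm/ADT/APInt.h"
  using namespace llvm;
  static void apintOrderingDemo() {
    APInt A(8, 0xF8);                  // -8 as signed, 248 as unsigned
    APInt B(8, 0x05);                  //  5 under either interpretation
    APInt SMin = APIntOps::smin(A, B); // 0xF8: -8 is the signed minimum
    APInt UMax = APIntOps::umax(A, B); // 0xF8: 248 is the unsigned maximum
    (void)SMin;
    (void)UMax;
  }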
llvm::ArrayRef< int >
llvm::LoopInfo
Definition: LoopInfo.h:1083
llvm::EVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:155
ARMAddressingModes.h
llvm::ARMSubtarget::hasNEON
bool hasNEON() const
Definition: ARMSubtarget.h:656
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
DataLayout.h
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:47
llvm::MVT::v2i32
@ v2i32
Definition: MachineValueType.h:101
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:41
llvm::ARMTTIImpl::isLegalMaskedGather
bool isLegalMaskedGather(Type *Ty, Align Alignment)
Definition: ARMTargetTransformInfo.cpp:1088
llvm::BasicTTIImplBase< ARMTTIImpl >::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:671
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:215
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::ISD::SREM
@ SREM
Definition: ISDOpcodes.h:244
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
llvm::MVT::v2f32
@ v2f32
Definition: MachineValueType.h:155
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:990
llvm::ARMTTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:883
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:464
llvm::SPF_FMINNUM
@ SPF_FMINNUM
Floating point minnum.
Definition: ValueTracking.h:664
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1762
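The cost hooks listed on this page follow a common pattern: translate the IR opcode to its ISD counterpart, then consult per-ISD cost tables. A minimal sketch (the helper irOpcodeToISD is hypothetical):
  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/CodeGen/TargetLowering.h"
  #include <cassert>
  using namespace llvm;
  // E.g. Instruction::Mul maps to ISD::MUL.
  static int irOpcodeToISD(const TargetLoweringBase &TLI, unsigned IROpcode) {
    int ISDOpcode = TLI.InstructionOpcodeToISD(IROpcode);
    assert(ISDOpcode && "expected a mappable opcode");
    return ISDOpcode;
  }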
llvm::TargetTransformInfo::AddressingModeKind
AddressingModeKind
Definition: TargetTransformInfo.h:639
llvm::MVT::v4i32
@ v4i32
Definition: MachineValueType.h:103
llvm::BasicTTIImplBase< ARMTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:559
llvm::SPF_FMAXNUM
@ SPF_FMAXNUM
Floating point maxnum.
Definition: ValueTracking.h:665
canTailPredicateInstruction
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
Definition: ARMTargetTransformInfo.cpp:2000
llvm::MVT::v8i64
@ v8i64
Definition: MachineValueType.h:121
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:634
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:671
llvm::ScalarEvolution::isLoopInvariant
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
Definition: ScalarEvolution.cpp:12619
llvm::BasicTTIImplBase< ARMTTIImpl >::getCallInstrCost
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency)
Compute a cost of the given call instruction.
Definition: BasicTTIImpl.h:1996
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:415
llvm::MVT::v16i32
@ v16i32
Definition: MachineValueType.h:108
llvm::MemOp::Set
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
Definition: TargetLowering.h:145
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:162
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:339
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:165
llvm::APIntOps::umax
const APInt & umax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be unsigned.
Definition: APInt.h:2126
llvm::minnum
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE minNum semantics.
Definition: APFloat.h:1298
llvm::MemOp::Copy
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
Definition: TargetLowering.h:130
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:147
CostTable.h
llvm::EVT::getScalarType
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:296
llvm::TargetTransformInfo::UnrollingPreferences::UpperBound
bool UpperBound
Allow using trip count upper bound to unroll loops.
Definition: TargetTransformInfo.h:495
llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:207
llvm::MVT::v8i16
@ v8i16
Definition: MachineValueType.h:92
llvm::ARMTTIImpl::getIntImmCost
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:293
ISDOpcodes.h
llvm::TypeSize
Definition: TypeSize.h:417
Casting.h
llvm::BasicTTIImplBase< ARMTTIImpl >::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:1160
llvm::Value::hasNUses
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::MVT::i32
@ i32
Definition: MachineValueType.h:46
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:219
llvm::ISD::SDIV
@ SDIV
Definition: ISDOpcodes.h:242
llvm::log2
static double log2(double V)
Definition: AMDGPULibCalls.cpp:842
llvm::APInt::getSplat
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:595
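For example (illustrative values only):
  #include "llvm/ADT/APInt.h"
  using namespace llvm;
  // Tile an 8-bit pattern across 32 bits: 0x01 becomes 0x01010101.
  static APInt makeByteSplat() { return APInt::getSplat(32, APInt(8, 0x01)); }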
llvm::InstCombiner::SimplifyDemandedBits
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, unsigned Depth=0)=0
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::MVT::v8i32
@ v8i32
Definition: MachineValueType.h:107
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition: ISDOpcodes.h:740
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::TargetLoweringBase::isOperationLegalOrCustomOrPromote
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lowering or using promotion.
Definition: TargetLowering.h:1142
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::HardwareLoopInfo
Attributes of a target dependent hardware loop.
Definition: TargetTransformInfo.h:95
llvm::TargetTransformInfoImplBase::isConstantStridedAccessLessThan
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
Definition: TargetTransformInfoImpl.h:839
llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:833
llvm::BasicTTIImplBase< ARMTTIImpl >::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1167
llvm::ARMTTIImpl::maybeLoweredToCall
bool maybeLoweredToCall(Instruction &I)
Definition: ARMTargetTransformInfo.cpp:1818
Instructions.h
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:148
llvm::ARMTTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: ARMTargetTransformInfo.cpp:443
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:657
SmallVector.h
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:802
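A hedged sketch of how these PatternMatch helpers compose (the function names are illustrative and not taken from this file):
  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;
  // Matches "X + X" for one particular X.
  static bool isSelfAdd(Value *V, Value *X) {
    return match(V, m_Add(m_Specific(X), m_Specific(X)));
  }
  // Matches "<anything> + <constant>", capturing the first operand in X.
  static bool isAddOfConstant(Value *V, Value *&X) {
    return match(V, m_Add(m_Value(X), m_Constant()));
  }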
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::ISD::UREM
@ UREM
Definition: ISDOpcodes.h:245
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:54
llvm::TailPredication::EnabledNoReductions
@ EnabledNoReductions
Definition: ARMTargetTransformInfo.h:44
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1338
llvm::BasicTTIImplBase< ARMTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: BasicTTIImpl.h:866
llvm::BasicTTIImplBase< ARMTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1332
llvm::ARMTTIImpl::isLegalMaskedStore
bool isLegalMaskedStore(Type *DataTy, Align Alignment)
Definition: ARMTargetTransformInfo.h:188
llvm::SPF_UMIN
@ SPF_UMIN
Unsigned minimum.
Definition: ValueTracking.h:661
llvm::getBooleanLoopAttribute
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1087
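For instance (a sketch; llvm.loop.distribute.enable is one of the standard boolean loop attributes):
  #include "llvm/Analysis/LoopInfo.h"
  using namespace llvm;
  // True when loop distribution is explicitly enabled via metadata on L.
  static bool distributeRequested(const Loop *L) {
    return getBooleanLoopAttribute(L, "llvm.loop.distribute.enable");
  }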
llvm::ARMTargetLowering::getNumInterleavedAccesses
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given type.
Definition: ARMISelLowering.cpp:20602
simplifyNeonVld1
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
Definition: ARMTargetTransformInfo.cpp:66
llvm::APInt::getActiveBits
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1427
llvm::ARMTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: ARMTargetTransformInfo.cpp:2307
llvm::BasicTTIImplBase< ARMTTIImpl >::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1116
ARMTargetTransformInfo.h
llvm::HardwareLoopInfo::canAnalyze
bool canAnalyze(LoopInfo &LI)
Definition: TargetTransformInfo.cpp:47
llvm::IRBuilderBase::CreateVectorSplat
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted NumElts times.
Definition: IRBuilder.cpp:1098
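A minimal usage sketch (the helper name splat4 is assumed):
  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;
  // Broadcast a scalar into a 4-element vector at the builder's insertion point.
  static Value *splat4(IRBuilderBase &Builder, Value *Scalar) {
    return Builder.CreateVectorSplat(4, Scalar, "splat");
  }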
DerivedTypes.h
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:379
llvm::getLoadStorePointerOperand
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Definition: Instructions.h:5301
EnableMaskedGatherScatters
cl::opt< bool > EnableMaskedGatherScatters
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2419
llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:291
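For example:
  #include "llvm/ADT/APInt.h"
  using namespace llvm;
  // A 32-bit value whose low 8 bits are set: 0x000000FF.
  static APInt lowByteMask() { return APInt::getLowBitsSet(32, 8); }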
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:45
llvm::TargetTransformInfo::UnrollingPreferences::OptSizeThreshold
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Definition: TargetTransformInfo.h:450
llvm::ARMTTIImpl::preferPredicatedReductionSelect
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
Definition: ARMTargetTransformInfo.cpp:2326
llvm::ISD::SDIVREM
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
llvm::ARMTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: ARMTargetTransformInfo.cpp:847
llvm::ARMSubtarget::isThumb2
bool isThumb2() const
Definition: ARMSubtarget.h:815
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:414
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1409
llvm::LoopAccessInfo::getPSE
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
Definition: LoopAccessAnalysis.h:587
llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:726
llvm::MVT::v8i8
@ v8i8
Definition: MachineValueType.h:79
llvm::ARMTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Op1Info=TTI::OK_AnyValue, TTI::OperandValueKind Op2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: ARMTargetTransformInfo.cpp:1291
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
Definition: ScalarEvolution.cpp:7284
llvm::MVT::v8f32
@ v8f32
Definition: MachineValueType.h:161
llvm::PredicatedScalarEvolution::getSE
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
Definition: ScalarEvolution.h:2129
llvm::ARMTTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: ARMTargetTransformInfo.cpp:120
llvm::BinaryOperator::Create
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), Instruction *InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Definition: Instructions.cpp:2662
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:89
llvm::MVT::v16i64
@ v16i64
Definition: MachineValueType.h:122
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1284
llvm::BasicTTIImplBase< ARMTTIImpl >::getExtendedAddReductionCost
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2187
llvm::HardwareLoopInfo::CountType
IntegerType * CountType
Definition: TargetTransformInfo.h:102
llvm::EVT::isFixedLengthVector
bool isFixedLengthVector() const
Definition: ValueTypes.h:165
llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:814
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:55
llvm::ARMTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: ARMTargetTransformInfo.cpp:1638
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:212
llvm::EVT::isFloatingPoint
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:140
llvm::TargetTransformInfo::AMK_None
@ AMK_None
Definition: TargetTransformInfo.h:642
llvm::TargetTransformInfo::AMK_PreIndexed
@ AMK_PreIndexed
Definition: TargetTransformInfo.h:640
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2116
llvm::ARMSubtarget::hasVFP2Base
bool hasVFP2Base() const
Definition: ARMSubtarget.h:652
llvm::TargetLoweringBase::LibCall
@ LibCall
Definition: TargetLowering.h:200
llvm::ARMSubtarget::isThumb
bool isThumb() const
Definition: ARMSubtarget.h:812
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:128
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:289
llvm::MVT::v4i1
@ v4i1
Definition: MachineValueType.h:66
llvm::ARMTargetLowering::isLegalInterleavedAccessType
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
Definition: ARMISelLowering.cpp:20607
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment padding.
Definition: DataLayout.h:498
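An illustrative use (the helper name elementStride is an assumption):
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Type.h"
  using namespace llvm;
  // Stride in bytes between consecutive array elements of type EltTy.
  static uint64_t elementStride(const DataLayout &DL, Type *EltTy) {
    return DL.getTypeAllocSize(EltTy).getFixedSize();
  }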
llvm::TailPredication::ForceEnabledNoReductions
@ ForceEnabledNoReductions
Definition: ARMTargetTransformInfo.h:46