ARMTargetTransformInfo.cpp (LLVM 10.0.0svn)
//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
  "enable-arm-maskedldst", cl::Hidden, cl::init(false),
  cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
  "disable-arm-loloops", cl::Hidden, cl::init(false),
  cl::desc("Disable the generation of low-overhead loops"));

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the whitelist must match exactly.
  bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
                    (CalleeBits & ~InlineFeatureWhitelist);
  // For features in the whitelist, the callee's features must be a subset of
  // the callers'.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
                     (CalleeBits & InlineFeatureWhitelist);
  return MatchExact && MatchSubset;
}

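// Estimate the cost of materializing an integer immediate, based on whether
// it fits an ARM, Thumb2 or Thumb1 immediate encoding or needs extra
// instructions (or a constant-pool load on Thumb1).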
int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1, any i8 imm cost 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constantpool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of
// Thumb1 instructions so we return a zero cost and 1 otherwise.
int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                      const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

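// Context-sensitive immediate cost: constant divisors, UXTB/UXTH masks,
// CMN/ADDS-foldable compare immediates and all-ones XOR masks are free, while
// ADD and AND immediates may use the cheaper of the original or the
// negated/inverted constant. Everything else falls back to the plain
// materialization cost.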
int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(-Imm, Ty));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1<<12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1<<8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN
  if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
    return 0;

  return getIntImmCost(Imm, Ty);
}

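// Cost of a cast (trunc/extend/fp conversion). Costs are looked up in
// per-subtarget conversion tables (NEON, MVE, scalar); anything not covered
// falls back to the base implementation, scaled by the MVE cost factor for
// MVE vector types.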
int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Single to/from double precision conversions.
  static const CostTblEntry NEONFltDblTbl[] = {
    // Vector fptrunc/fpext conversions.
    { ISD::FP_ROUND,  MVT::v2f64, 2 },
    { ISD::FP_EXTEND, MVT::v2f32, 2 },
    { ISD::FP_EXTEND, MVT::v4f32, 4 }
  };

  if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
                                             ISD == ISD::FP_EXTEND)) {
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return BaseT::getCastInstrCost(Opcode, Dst, Src);

  // The extend of a load is free
  if (I && isa<LoadInst>(I->getOperand(0))) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
      // ... (entries elided in this listing)
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return Entry->Cost;

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
      // ... (entries elided in this listing)
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost;
    }
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    // ... (entries elided in this listing)

    // The number of vmovl instructions for the extension.

    // Operations that we legalize using splitting.

    // Vector float <-> i32 conversions.

    // Vector double <-> i32 conversions.
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    // ... (entries elided in this listing)
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    // ... (entries elided in this listing)
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sexts
  // are linearised so take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    // ... (entries elided in this listing)
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost * ST->getMVEVectorCostFactor();
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    // ... (entries elided in this listing)

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
    // ... (entries elided in this listing)
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost;
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src);
}

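// Cost of inserting or extracting a single vector element, with penalties for
// cross-class (NEON <-> integer) moves and for MVE, where mixing scalar and
// vector code is discouraged.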
int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (ValTy->getVectorElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross class copy, this likely leads to mixing
    // of NEON and VFP code and should therefore be penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
  }

  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
                                 Opcode == Instruction::ExtractElement)) {
    // We say MVE moves cost at least the MVEVectorCostFactor, even though
    // they are scalar instructions. This helps prevent mixing scalar and
    // vector, to prevent vectorising where we end up just scalarising the
    // result anyway.
    return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
                    ST->getMVEVectorCostFactor()) *
           ValTy->getVectorNumElements() / 2;
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}

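// Cost of compare/select instructions. NEON vector selects that lower poorly
// get hard-coded costs; MVE vector types are scaled by the MVE cost factor.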
int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                   const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      // ... (entries elided in this listing)
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
    return LT.first;
  }

  int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}

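// On NEON, address computations for vector accesses whose stride is not a
// small constant are given a high cost, since they rarely fold into the
// addressing mode; scalar address computations cost 1.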
int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                          const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (ST->hasNEON()) {
    if (Ty->isVectorTy() && SE &&
        !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
      return NumVectorInstToHideOverhead;

    // In many cases the address computation is not merged into the instruction
    // addressing mode.
    return 1;
  }
  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}

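// A masked load is considered legal for 8-bit elements, 16-bit elements with
// at least 2-byte alignment, or 32-bit elements with at least 4-byte
// alignment; v2i1 and extending FP vector types are rejected.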
bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
    return false;

  if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
    // Don't support v2i1 yet.
    if (VecTy->getNumElements() == 2)
      return false;

    // We don't support extending fp types.
    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
    if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
      return false;
  }

  unsigned EltWidth = DataTy->getScalarSizeInBits();
  return (EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
         (EltWidth == 16 && (!Alignment || Alignment >= 2)) ||
         (EltWidth == 8);
}

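// Model the cost of a memcpy: a library call (1 for the call plus 3 for the
// argument setup) when the size is unknown or no inline lowering is found,
// otherwise one load plus one store per memory operation chosen by the
// lowering.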
int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
  const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
  assert(MI && "MemcpyInst expected");
  ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());

  // To model the cost of a library call, we assume 1 for the call, and
  // 3 for the argument setup.
  const unsigned LibCallCost = 4;

  // If 'size' is not a constant, a library call will be generated.
  if (!C)
    return LibCallCost;

  const unsigned Size = C->getValue().getZExtValue();
  const unsigned DstAlign = MI->getDestAlignment();
  const unsigned SrcAlign = MI->getSourceAlignment();
  const Function *F = I->getParent()->getParent();
  const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
  std::vector<EVT> MemOps;

  // MemOps will be populated with a list of data types that need to be
  // loaded and stored. That's why we multiply the number of elements by 2 to
  // get the cost for this memcpy.
  if (getTLI()->findOptimalMemOpLowering(
          MemOps, Limit, Size, DstAlign, SrcAlign, false /*IsMemset*/,
          false /*ZeroMemset*/, false /*MemcpyStrSrc*/, false /*AllowOverlap*/,
          MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
          F->getAttributes()))
    return MemOps.size() * 2;

  // If we can't find an optimal memop lowering, return the default cost
  return LibCallCost;
}

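// Shuffle costs: broadcast, reverse and select shuffles that map onto single
// NEON/MVE instructions are looked up in tables; everything else goes to the
// base implementation, scaled by the MVE cost factor for MVE vectors.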
int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  if (ST->hasNEON()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry NEONDupTbl[] = {
        // VDUP handles these cases.
        // ... (entries elided in this listing)
      };

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

      if (const auto *Entry =
              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Reverse) {
      static const CostTblEntry NEONShuffleTbl[] = {
        // Reverse shuffle cost one instruction if we are shuffling within a
        // double word (vrev) or two if we shuffle a quad word (vrev, vext).
        // ... (entries elided in this listing)
      };

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

      if (const auto *Entry =
              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Select) {
      static const CostTblEntry NEONSelShuffleTbl[] = {
        // Select shuffle cost table for ARM. Cost is the number of
        // instructions required to create the shuffled vector.
        // ... (entries elided in this listing)
      };

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                              ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
  }
  if (ST->hasMVEIntegerOps()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry MVEDupTbl[] = {
        // VDUP handles these cases.
        // ... (entries elided in this listing)
      };

      std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                              LT.second))
        return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
    }
  }
  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

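// Arithmetic costs. NEON vector division/remainder is priced as a library
// call (or a cheaper reciprocal sequence for small integer types); other
// operations use the legalized type cost, scaled by the MVE cost factor for
// MVE vectors or scalarized when the operation must be expanded.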
int ARMTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo,
    ArrayRef<const Value *> Args) {
  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  const unsigned FunctionCallDivCost = 20;
  const unsigned ReciprocalDivCost = 10;
  static const CostTblEntry CostTbl[] = {
    // Division.
    // These costs are somewhat random. Choose a cost of 20 to indicate that
    // vectorizing division (added function call) is going to be very expensive.
    // Double registers types.
    { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
    { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
    { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
    { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
    { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
    { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
    { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
    { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
    { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
    { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
    { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
    // Quad register types.
    { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
    { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
    { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
    { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
    { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
    { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
    { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
    { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
    { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
    // Multiplication.
  };

  if (ST->hasNEON()) {
    if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
      return LT.first * Entry->Cost;

    int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                             Opd1PropInfo, Opd2PropInfo);

    // This is somewhat of a hack. The problem that we are facing is that SROA
    // creates a sequence of shift, and, or instructions to construct values.
    // These sequences are recognized by the ISel and have zero-cost. Not so for
    // the vectorized code. Because we have support for v2i64 but not i64 those
    // sequences look particularly beneficial to vectorize.
    // To work around this we increase the cost of v2i64 operations to make them
    // seem less beneficial.
    if (LT.second == MVT::v2i64 &&
        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
      Cost += 4;

    return Cost;
  }

  int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;

  // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
  // without treating floats as more expensive than scalars or increasing the
  // costs for custom operations. The results are also multiplied by the
  // MVEVectorCostFactor where appropriate.
  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
    return LT.first * BaseCost;

  // Else this is expand, assume that we need to scalarize this op.
  if (Ty->isVectorTy()) {
    unsigned Num = Ty->getVectorNumElements();
    unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
    // Return the cost of multiple scalar invocations plus the cost of
    // inserting and extracting the values.
    return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost;
  }

  return BaseCost;
}

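// Memory operation cost: NEON loads/stores of f64 vectors that are not
// 16-byte aligned are heavily penalized; MVE vector accesses are scaled by
// the MVE cost factor.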
int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace, const Instruction *I) {
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);

  if (ST->hasNEON() && Src->isVectorTy() && Alignment != 16 &&
      Src->getVectorElementType()->isDoubleTy()) {
    // Unaligned loads/stores are extremely inefficient.
    // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
    return LT.first * 4;
  }
  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor()
                     : 1;
  return BaseCost * LT.first;
}

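// Interleaved accesses that can be lowered to vldN/vstN (factor within the
// supported maximum, no i64/f64 elements, no masking) are costed per
// generated instruction; otherwise defer to the base implementation.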
int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace,
                                           bool UseMaskForCond,
                                           bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  assert(isa<VectorType>(VecTy) && "Expect a vector type");

  // vldN/vstN doesn't support vector types of i64/f64 element.
  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;

  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
      !UseMaskForCond && !UseMaskForGaps) {
    unsigned NumElts = VecTy->getVectorNumElements();
    auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one vldN/vstN instruction.
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(SubVecTy, DL))
      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace,
                                           UseMaskForCond, UseMaskForGaps);
}

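// Decide whether a called function will end up as a real call (bl) rather
// than being expanded inline: transcendental intrinsics always become calls,
// rounding/abs-style intrinsics become calls only when the FPU cannot handle
// the type, and masked memory intrinsics need MVE.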
bool ARMTTIImpl::isLoweredToCall(const Function *F) {
  if (!F->isIntrinsic())
    return BaseT::isLoweredToCall(F);

  // Assume all Arm-specific intrinsics map to an instruction.
  if (F->getName().startswith("llvm.arm"))
    return false;

  switch (F->getIntrinsicID()) {
  default: break;
  case Intrinsic::powi:
  case Intrinsic::sin:
  case Intrinsic::cos:
  case Intrinsic::pow:
  case Intrinsic::log:
  case Intrinsic::log10:
  case Intrinsic::log2:
  case Intrinsic::exp:
  case Intrinsic::exp2:
    return true;
  case Intrinsic::sqrt:
  case Intrinsic::fabs:
  case Intrinsic::copysign:
  case Intrinsic::floor:
  case Intrinsic::ceil:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::nearbyint:
  case Intrinsic::round:
  case Intrinsic::canonicalize:
  case Intrinsic::lround:
  case Intrinsic::llround:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
    if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
      return true;
    if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
      return true;
    // Some operations can be handled by vector instructions and assume
    // unsupported vectors will be expanded into supported scalar ones.
    // TODO Handle scalar operations properly.
    return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
  case Intrinsic::masked_store:
  case Intrinsic::masked_load:
  case Intrinsic::masked_gather:
  case Intrinsic::masked_scatter:
    return !ST->hasMVEIntegerOps();
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::sadd_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::usub_sat:
    return false;
  }

  return BaseT::isLoweredToCall(F);
}

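// Decide whether a loop can profitably use the v8.1-M low-overhead loop
// instructions: the trip count must fit in 32 bits and no instruction in the
// loop may turn into a call (which would clobber LR) or already be a
// hardware-loop intrinsic.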
bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          HardwareLoopInfo &HWLoopInfo) {
  // Low-overhead branches are only supported in the 'low-overhead branch'
  // extension of v8.1-m.
  if (!ST->hasLOB() || DisableLowOverheadLoops)
    return false;

  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
    return false;

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return false;

  const SCEV *TripCountSCEV =
    SE.getAddExpr(BackedgeTakenCount,
                  SE.getOne(BackedgeTakenCount->getType()));

  // We need to store the trip count in LR, a 32-bit register.
  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
    return false;

  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
  // point in generating a hardware loop if that's going to happen.
  auto MaybeCall = [this](Instruction &I) {
    const ARMTargetLowering *TLI = getTLI();
    unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
    EVT VT = TLI->getValueType(DL, I.getType(), true);
    if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
      return true;

    // Check if an intrinsic will be lowered to a call and assume that any
    // other CallInst will generate a bl.
    if (auto *Call = dyn_cast<CallInst>(&I)) {
      if (isa<IntrinsicInst>(Call)) {
        if (const Function *F = Call->getCalledFunction())
          return isLoweredToCall(F);
      }
      return true;
    }

    // FPv5 provides conversions between integer, double-precision,
    // single-precision, and half-precision formats.
    switch (I.getOpcode()) {
    default:
      break;
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::FPTrunc:
    case Instruction::FPExt:
      return !ST->hasFPARMv8Base();
    }

    // FIXME: Unfortunately the approach of checking the Operation Action does
    // not catch all cases of Legalization that use library calls. Our
    // Legalization step categorizes some transformations into library calls as
    // Custom, Expand or even Legal when doing type legalization. So for now
    // we have to special case for instance the SDIV of 64bit integers and the
    // use of floating point emulation.
    if (VT.isInteger() && VT.getSizeInBits() >= 64) {
      switch (ISD) {
      default:
        break;
      case ISD::SDIV:
      case ISD::UDIV:
      case ISD::SREM:
      case ISD::UREM:
      case ISD::SDIVREM:
      case ISD::UDIVREM:
        return true;
      }
    }

    // Assume all other non-float operations are supported.
    if (!VT.isFloatingPoint())
      return false;

    // We'll need a library call to handle most floats when using soft.
    if (TLI->useSoftFloat()) {
      switch (I.getOpcode()) {
      default:
        return true;
      case Instruction::Alloca:
      case Instruction::Load:
      case Instruction::Store:
      case Instruction::Select:
      case Instruction::PHI:
        return false;
      }
    }

    // We'll need a libcall to perform double precision operations on a single
    // precision only FPU.
    if (I.getType()->isDoubleTy() && !ST->hasFP64())
      return true;

    // Likewise for half precision arithmetic.
    if (I.getType()->isHalfTy() && !ST->hasFullFP16())
      return true;

    return false;
  };

  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
    if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
      switch (Call->getIntrinsicID()) {
      default:
        break;
      case Intrinsic::set_loop_iterations:
      case Intrinsic::test_set_loop_iterations:
      case Intrinsic::loop_decrement:
      case Intrinsic::loop_decrement_reg:
        return true;
      }
    }
    return false;
  };

  // Scan the instructions to see if there's any that we know will turn into a
  // call or if this loop is already a low-overhead loop.
  auto ScanLoop = [&](Loop *L) {
    for (auto *BB : L->getBlocks()) {
      for (auto &I : *BB) {
        if (MaybeCall(I) || IsHardwareLoopIntrinsic(I))
          return false;
      }
    }
    return true;
  };

  // Visit inner loops.
  for (auto Inner : *L)
    if (!ScanLoop(Inner))
      return false;

  if (!ScanLoop(L))
    return false;

  // TODO: Check whether the trip count calculation is expensive. If L is the
  // inner loop but we know it has a low trip count, calculating that trip
  // count (in the parent loop) may be detrimental.

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CounterInReg = true;
  HWLoopInfo.IsNestingLegal = false;
  HWLoopInfo.PerformEntryTest = true;
  HWLoopInfo.CountType = Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}

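// Unrolling preferences for Thumb-2 M-class cores: allow runtime and partial
// unrolling of small loops that contain no calls or vector code, and force
// unrolling of very small loops to save the backedge branch.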
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  // Only currently enable these preferences for M-Class cores.
  if (!ST->isMClass())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  // Only enable on Thumb-2 targets.
  if (!ST->isThumb2())
    return;

  SmallVector<BasicBlock*, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow another exit other than the latch. This acts as an early exit
  // as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  unsigned Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        ImmutableCallSite CS(&I);
        if (const Function *F = CS.getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }
      // Don't unroll vectorised loops. MVE does not benefit from it as much as
      // scalar code.
      if (I.getType()->isVectorTy())
        return;

      SmallVector<const Value*, 4> Operands(I.value_op_begin(),
                                            I.value_op_end());
      Cost += getUserCost(&I, Operands);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UpperBound = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;

  // Force unrolling small loops can be very useful because of the branch
  // taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

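// Use vector reduction intrinsics only for MVE integer add/icmp reductions
// whose operands fill exactly one 128-bit vector register.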
bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                       TTI::ReductionFlags Flags) const {
  assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  if (!ST->hasMVEIntegerOps())
    return false;

  switch (Opcode) {
  case Instruction::FAdd:
  case Instruction::FMul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Mul:
  case Instruction::FCmp:
    return false;
  case Instruction::ICmp:
  case Instruction::Add:
    return ScalarBits < 64 && ScalarBits * Ty->getVectorNumElements() == 128;
  default:
    llvm_unreachable("Unhandled reduction opcode");
  }
  return false;
}