LLVM 14.0.0git
AArch64TargetTransformInfo.cpp
1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
10 #include "AArch64ExpandImm.h"
13 #include "llvm/Analysis/LoopInfo.h"
16 #include "llvm/CodeGen/CostTable.h"
18 #include "llvm/IR/Intrinsics.h"
19 #include "llvm/IR/IntrinsicInst.h"
20 #include "llvm/IR/IntrinsicsAArch64.h"
21 #include "llvm/IR/PatternMatch.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Transforms/InstCombine/InstCombiner.h"
24 #include <algorithm>
25 using namespace llvm;
26 using namespace llvm::PatternMatch;
27 
28 #define DEBUG_TYPE "aarch64tti"
29 
30 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
31  cl::init(true), cl::Hidden);
32 
34  const Function *Callee) const {
35  const TargetMachine &TM = getTLI()->getTargetMachine();
36 
37  const FeatureBitset &CallerBits =
38  TM.getSubtargetImpl(*Caller)->getFeatureBits();
39  const FeatureBitset &CalleeBits =
40  TM.getSubtargetImpl(*Callee)->getFeatureBits();
41 
42  // Inline a callee if its target-features are a subset of the caller's
43  // target-features.
44  return (CallerBits & CalleeBits) == CalleeBits;
45 }
46 
47 /// Calculate the cost of materializing a 64-bit value. This helper
48 /// method might only calculate a fraction of a larger immediate. Therefore it
49 /// is valid to return a cost of ZERO.
50 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
51  // Check if the immediate can be encoded within an instruction.
52  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
53  return 0;
54 
55  if (Val < 0)
56  Val = ~Val;
57 
58  // Calculate how many moves we will need to materialize this constant.
59  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
60  AArch64_IMM::expandMOVImm(Val, 64, Insn);
61  return Insn.size();
62 }
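// For example, a value such as 0x0000123456789ABC is typically expanded to a
// MOVZ plus two MOVKs (one per remaining non-zero 16-bit chunk), so the cost
// returned above would be 3.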
63 
64 /// Calculate the cost of materializing the given constant.
65 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
66  TTI::TargetCostKind CostKind) {
67  assert(Ty->isIntegerTy());
68 
69  unsigned BitSize = Ty->getPrimitiveSizeInBits();
70  if (BitSize == 0)
71  return ~0U;
72 
73  // Sign-extend all constants to a multiple of 64-bit.
74  APInt ImmVal = Imm;
75  if (BitSize & 0x3f)
76  ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
77 
78  // Split the constant into 64-bit chunks and calculate the cost for each
79  // chunk.
80  InstructionCost Cost = 0;
81  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
82  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
83  int64_t Val = Tmp.getSExtValue();
84  Cost += getIntImmCost(Val);
85  }
86  // We need at least one instruction to materialize the constant.
87  return std::max<InstructionCost>(1, Cost);
88 }
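// For example, the cost of an i128 constant is the sum of the costs of its
// two 64-bit halves, with a floor of one instruction even if both halves are
// free to encode.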
89 
90 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
91  const APInt &Imm, Type *Ty,
92  TTI::TargetCostKind CostKind,
93  Instruction *Inst) {
94  assert(Ty->isIntegerTy());
95 
96  unsigned BitSize = Ty->getPrimitiveSizeInBits();
97  // There is no cost model for constants with a bit size of 0. Return TCC_Free
98  // here, so that constant hoisting will ignore this constant.
99  if (BitSize == 0)
100  return TTI::TCC_Free;
101 
102  unsigned ImmIdx = ~0U;
103  switch (Opcode) {
104  default:
105  return TTI::TCC_Free;
106  case Instruction::GetElementPtr:
107  // Always hoist the base address of a GetElementPtr.
108  if (Idx == 0)
109  return 2 * TTI::TCC_Basic;
110  return TTI::TCC_Free;
111  case Instruction::Store:
112  ImmIdx = 0;
113  break;
114  case Instruction::Add:
115  case Instruction::Sub:
116  case Instruction::Mul:
117  case Instruction::UDiv:
118  case Instruction::SDiv:
119  case Instruction::URem:
120  case Instruction::SRem:
121  case Instruction::And:
122  case Instruction::Or:
123  case Instruction::Xor:
124  case Instruction::ICmp:
125  ImmIdx = 1;
126  break;
127  // Always return TCC_Free for the shift value of a shift instruction.
128  case Instruction::Shl:
129  case Instruction::LShr:
130  case Instruction::AShr:
131  if (Idx == 1)
132  return TTI::TCC_Free;
133  break;
134  case Instruction::Trunc:
135  case Instruction::ZExt:
136  case Instruction::SExt:
137  case Instruction::IntToPtr:
138  case Instruction::PtrToInt:
139  case Instruction::BitCast:
140  case Instruction::PHI:
141  case Instruction::Call:
142  case Instruction::Select:
143  case Instruction::Ret:
144  case Instruction::Load:
145  break;
146  }
147 
148  if (Idx == ImmIdx) {
149  int NumConstants = (BitSize + 63) / 64;
150  InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
151  return (Cost <= NumConstants * TTI::TCC_Basic)
152  ? static_cast<int>(TTI::TCC_Free)
153  : Cost;
154  }
155  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
156 }
157 
158 InstructionCost
159 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
160  const APInt &Imm, Type *Ty,
161  TTI::TargetCostKind CostKind) {
162  assert(Ty->isIntegerTy());
163 
164  unsigned BitSize = Ty->getPrimitiveSizeInBits();
165  // There is no cost model for constants with a bit size of 0. Return TCC_Free
166  // here, so that constant hoisting will ignore this constant.
167  if (BitSize == 0)
168  return TTI::TCC_Free;
169 
170  // Most (all?) AArch64 intrinsics do not support folding immediates into the
171  // selected instruction, so we compute the materialization cost for the
172  // immediate directly.
173  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
174  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
175 
176  switch (IID) {
177  default:
178  return TTI::TCC_Free;
179  case Intrinsic::sadd_with_overflow:
180  case Intrinsic::uadd_with_overflow:
181  case Intrinsic::ssub_with_overflow:
182  case Intrinsic::usub_with_overflow:
183  case Intrinsic::smul_with_overflow:
184  case Intrinsic::umul_with_overflow:
185  if (Idx == 1) {
186  int NumConstants = (BitSize + 63) / 64;
187  InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
188  return (Cost <= NumConstants * TTI::TCC_Basic)
189  ? static_cast<int>(TTI::TCC_Free)
190  : Cost;
191  }
192  break;
193  case Intrinsic::experimental_stackmap:
194  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
195  return TTI::TCC_Free;
196  break;
197  case Intrinsic::experimental_patchpoint_void:
198  case Intrinsic::experimental_patchpoint_i64:
199  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
200  return TTI::TCC_Free;
201  break;
202  case Intrinsic::experimental_gc_statepoint:
203  if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
204  return TTI::TCC_Free;
205  break;
206  }
207  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
208 }
209 
210 TargetTransformInfo::PopcntSupportKind
211 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
212  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
213  if (TyWidth == 32 || TyWidth == 64)
214  return TTI::PSK_FastHardware;
215  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
216  return TTI::PSK_Software;
217 }
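// 32- and 64-bit popcounts are generally lowered through the NEON CNT
// instruction (move to a vector register, count bytes, then sum with ADDV),
// which is why they are reported as fast hardware support rather than a
// software fallback.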
218 
219 InstructionCost
220 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
221  TTI::TargetCostKind CostKind) {
222  auto *RetTy = ICA.getReturnType();
223  switch (ICA.getID()) {
224  case Intrinsic::umin:
225  case Intrinsic::umax:
226  case Intrinsic::smin:
227  case Intrinsic::smax: {
228  static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
229  MVT::v8i16, MVT::v2i32, MVT::v4i32};
230  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
231  // v2i64 types get converted to cmp+bif hence the cost of 2
232  if (LT.second == MVT::v2i64)
233  return LT.first * 2;
234  if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
235  return LT.first;
236  break;
237  }
238  case Intrinsic::sadd_sat:
239  case Intrinsic::ssub_sat:
240  case Intrinsic::uadd_sat:
241  case Intrinsic::usub_sat: {
242  static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
243  MVT::v8i16, MVT::v2i32, MVT::v4i32,
244  MVT::v2i64};
245  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
246  // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
247  // need to extend the type, as it uses shr(qadd(shl, shl)).
248  unsigned Instrs =
249  LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
250  if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
251  return LT.first * Instrs;
252  break;
253  }
254  case Intrinsic::abs: {
255  static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
256  MVT::v8i16, MVT::v2i32, MVT::v4i32,
257  MVT::v2i64};
258  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
259  if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
260  return LT.first;
261  break;
262  }
263  case Intrinsic::experimental_stepvector: {
264  InstructionCost Cost = 1; // Cost of the `index' instruction
265  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
266  // Legalisation of illegal vectors involves an `index' instruction plus
267  // (LT.first - 1) vector adds.
268  if (LT.first > 1) {
269  Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
270  InstructionCost AddCost =
271  getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
272  Cost += AddCost * (LT.first - 1);
273  }
274  return Cost;
275  }
276  case Intrinsic::bitreverse: {
277  static const CostTblEntry BitreverseTbl[] = {
278  {Intrinsic::bitreverse, MVT::i32, 1},
279  {Intrinsic::bitreverse, MVT::i64, 1},
280  {Intrinsic::bitreverse, MVT::v8i8, 1},
281  {Intrinsic::bitreverse, MVT::v16i8, 1},
282  {Intrinsic::bitreverse, MVT::v4i16, 2},
283  {Intrinsic::bitreverse, MVT::v8i16, 2},
284  {Intrinsic::bitreverse, MVT::v2i32, 2},
285  {Intrinsic::bitreverse, MVT::v4i32, 2},
286  {Intrinsic::bitreverse, MVT::v1i64, 2},
287  {Intrinsic::bitreverse, MVT::v2i64, 2},
288  };
289  const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
290  const auto *Entry =
291  CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
292  if (Entry) {
293  // The cost model uses the legal type (i32) that i8 and i16 are promoted
294  // to, plus 1 so that we match the actual lowering cost.
295  if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
296  TLI->getValueType(DL, RetTy, true) == MVT::i16)
297  return LegalisationCost.first * Entry->Cost + 1;
298 
299  return LegalisationCost.first * Entry->Cost;
300  }
301  break;
302  }
303  case Intrinsic::ctpop: {
304  static const CostTblEntry CtpopCostTbl[] = {
305  {ISD::CTPOP, MVT::v2i64, 4},
306  {ISD::CTPOP, MVT::v4i32, 3},
307  {ISD::CTPOP, MVT::v8i16, 2},
308  {ISD::CTPOP, MVT::v16i8, 1},
309  {ISD::CTPOP, MVT::i64, 4},
310  {ISD::CTPOP, MVT::v2i32, 3},
311  {ISD::CTPOP, MVT::v4i16, 2},
312  {ISD::CTPOP, MVT::v8i8, 1},
313  {ISD::CTPOP, MVT::i32, 5},
314  };
315  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
316  MVT MTy = LT.second;
317  if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
318  // Extra cost of +1 when illegal vector types are legalized by promoting
319  // the integer type.
320  int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
321  RetTy->getScalarSizeInBits()
322  ? 1
323  : 0;
324  return LT.first * Entry->Cost + ExtraCost;
325  }
326  break;
327  }
328  default:
329  break;
330  }
331  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
332 }
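// Worked example for the ctpop table above: a ctpop of <4 x i32> legalizes to
// MVT::v4i32 (LT.first == 1), matches the {ISD::CTPOP, MVT::v4i32, 3} entry,
// and needs no promotion, so the returned cost is 3.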
333 
334 /// The function will remove redundant reinterpret casts in the presence of
335 /// control flow.
336 static Optional<Instruction *> processPhiNode(InstCombiner &IC,
337  IntrinsicInst &II) {
338  SmallVector<Instruction *, 32> Worklist;
339  auto RequiredType = II.getType();
340 
341  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
342  assert(PN && "Expected Phi Node!");
343 
344  // Don't create a new Phi unless we can remove the old one.
345  if (!PN->hasOneUse())
346  return None;
347 
348  for (Value *IncValPhi : PN->incoming_values()) {
349  auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
350  if (!Reinterpret ||
351  Reinterpret->getIntrinsicID() !=
352  Intrinsic::aarch64_sve_convert_to_svbool ||
353  RequiredType != Reinterpret->getArgOperand(0)->getType())
354  return None;
355  }
356 
357  // Create the new Phi
358  LLVMContext &Ctx = PN->getContext();
359  IRBuilder<> Builder(Ctx);
360  Builder.SetInsertPoint(PN);
361  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
362  Worklist.push_back(PN);
363 
364  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
365  auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
366  NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
367  Worklist.push_back(Reinterpret);
368  }
369 
370  // Cleanup Phi Node and reinterprets
371  return IC.replaceInstUsesWith(II, NPN);
372 }
373 
374 static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
375  IntrinsicInst &II) {
376  // If the reinterpret instruction operand is a PHI Node
377  if (isa<PHINode>(II.getArgOperand(0)))
378  return processPhiNode(IC, II);
379 
380  SmallVector<Instruction *, 32> CandidatesForRemoval;
381  Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
382 
383  const auto *IVTy = cast<VectorType>(II.getType());
384 
385  // Walk the chain of conversions.
386  while (Cursor) {
387  // If the type of the cursor has fewer lanes than the final result, zeroing
388  // must take place, which breaks the equivalence chain.
389  const auto *CursorVTy = cast<VectorType>(Cursor->getType());
390  if (CursorVTy->getElementCount().getKnownMinValue() <
391  IVTy->getElementCount().getKnownMinValue())
392  break;
393 
394  // If the cursor has the same type as I, it is a viable replacement.
395  if (Cursor->getType() == IVTy)
396  EarliestReplacement = Cursor;
397 
398  auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
399 
400  // If this is not an SVE conversion intrinsic, this is the end of the chain.
401  if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
402  Intrinsic::aarch64_sve_convert_to_svbool ||
403  IntrinsicCursor->getIntrinsicID() ==
404  Intrinsic::aarch64_sve_convert_from_svbool))
405  break;
406 
407  CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
408  Cursor = IntrinsicCursor->getOperand(0);
409  }
410 
411  // If no viable replacement in the conversion chain was found, there is
412  // nothing to do.
413  if (!EarliestReplacement)
414  return None;
415 
416  return IC.replaceInstUsesWith(II, EarliestReplacement);
417 }
418 
419 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
420  IntrinsicInst &II) {
421  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
422  if (!Pg)
423  return None;
424 
425  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
426  return None;
427 
428  const auto PTruePattern =
429  cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
430  if (PTruePattern != AArch64SVEPredPattern::vl1)
431  return None;
432 
433  // The intrinsic is inserting into lane zero so use an insert instead.
434  auto *IdxTy = Type::getInt64Ty(II.getContext());
435  auto *Insert = InsertElementInst::Create(
436  II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
437  Insert->insertBefore(&II);
438  Insert->takeName(&II);
439 
440  return IC.replaceInstUsesWith(II, Insert);
441 }
442 
443 static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
444  IntrinsicInst &II) {
445  // Replace DupX with a regular IR splat.
446  IRBuilder<> Builder(II.getContext());
447  Builder.SetInsertPoint(&II);
448  auto *RetTy = cast<ScalableVectorType>(II.getType());
449  Value *Splat =
450  Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
451  Splat->takeName(&II);
452  return IC.replaceInstUsesWith(II, Splat);
453 }
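// For example, a call such as
//   %r = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 %x)
// is rewritten into the generic insertelement/shufflevector splat of %x,
// which later IR passes understand and can fold further.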
454 
455 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
456  IntrinsicInst &II) {
457  LLVMContext &Ctx = II.getContext();
458  IRBuilder<> Builder(Ctx);
459  Builder.SetInsertPoint(&II);
460 
461  // Check that the predicate is all active
462  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
463  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
464  return None;
465 
466  const auto PTruePattern =
467  cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
468  if (PTruePattern != AArch64SVEPredPattern::all)
469  return None;
470 
471  // Check that we have a compare of zero..
472  auto *SplatValue =
473  dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
474  if (!SplatValue || !SplatValue->isZero())
475  return None;
476 
477  // ..against a dupq
478  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
479  if (!DupQLane ||
480  DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
481  return None;
482 
483  // Where the dupq is a lane 0 replicate of a vector insert
484  if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
485  return None;
486 
487  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
488  if (!VecIns ||
489  VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
490  return None;
491 
492  // Where the vector insert is a fixed constant vector insert into undef at
493  // index zero
494  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
495  return None;
496 
497  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
498  return None;
499 
500  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
501  if (!ConstVec)
502  return None;
503 
504  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
505  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
506  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
507  return None;
508 
509  unsigned NumElts = VecTy->getNumElements();
510  unsigned PredicateBits = 0;
511 
512  // Expand intrinsic operands to a 16-bit byte level predicate
513  for (unsigned I = 0; I < NumElts; ++I) {
514  auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
515  if (!Arg)
516  return None;
517  if (!Arg->isZero())
518  PredicateBits |= 1 << (I * (16 / NumElts));
519  }
520 
521  // If all bits are zero bail early with an empty predicate
522  if (PredicateBits == 0) {
523  auto *PFalse = Constant::getNullValue(II.getType());
524  PFalse->takeName(&II);
525  return IC.replaceInstUsesWith(II, PFalse);
526  }
527 
528  // Calculate largest predicate type used (where byte predicate is largest)
529  unsigned Mask = 8;
530  for (unsigned I = 0; I < 16; ++I)
531  if ((PredicateBits & (1 << I)) != 0)
532  Mask |= (I % 8);
533 
534  unsigned PredSize = Mask & -Mask;
535  auto *PredType = ScalableVectorType::get(
536  Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
537 
538  // Ensure all relevant bits are set
539  for (unsigned I = 0; I < 16; I += PredSize)
540  if ((PredicateBits & (1 << I)) == 0)
541  return None;
542 
543  auto *PTruePat =
544  ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
545  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
546  {PredType}, {PTruePat});
547  auto *ConvertToSVBool = Builder.CreateIntrinsic(
548  Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
549  auto *ConvertFromSVBool =
550  Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
551  {II.getType()}, {ConvertToSVBool});
552 
553  ConvertFromSVBool->takeName(&II);
554  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
555 }
556 
557 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
558  IntrinsicInst &II) {
559  IRBuilder<> Builder(II.getContext());
560  Builder.SetInsertPoint(&II);
561  Value *Pg = II.getArgOperand(0);
562  Value *Vec = II.getArgOperand(1);
563  auto IntrinsicID = II.getIntrinsicID();
564  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
565 
566  // lastX(splat(X)) --> X
567  if (auto *SplatVal = getSplatValue(Vec))
568  return IC.replaceInstUsesWith(II, SplatVal);
569 
570  // If x and/or y is a splat value then:
571  // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
572  Value *LHS, *RHS;
573  if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
574  if (isSplatValue(LHS) || isSplatValue(RHS)) {
575  auto *OldBinOp = cast<BinaryOperator>(Vec);
576  auto OpC = OldBinOp->getOpcode();
577  auto *NewLHS =
578  Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
579  auto *NewRHS =
580  Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
581  auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
582  OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
583  return IC.replaceInstUsesWith(II, NewBinOp);
584  }
585  }
586 
587  auto *C = dyn_cast<Constant>(Pg);
588  if (IsAfter && C && C->isNullValue()) {
589  // The intrinsic is extracting lane 0 so use an extract instead.
590  auto *IdxTy = Type::getInt64Ty(II.getContext());
591  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
592  Extract->insertBefore(&II);
593  Extract->takeName(&II);
594  return IC.replaceInstUsesWith(II, Extract);
595  }
596 
597  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
598  if (!IntrPG)
599  return None;
600 
601  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
602  return None;
603 
604  const auto PTruePattern =
605  cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
606 
607  // Can the intrinsic's predicate be converted to a known constant index?
608  unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
609  if (!MinNumElts)
610  return None;
611 
612  unsigned Idx = MinNumElts - 1;
613  // Increment the index if extracting the element after the last active
614  // predicate element.
615  if (IsAfter)
616  ++Idx;
617 
618  // Ignore extracts whose index is larger than the known minimum vector
619  // length. NOTE: This is an artificial constraint where we prefer to
620  // maintain what the user asked for until an alternative is proven faster.
621  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
622  if (Idx >= PgVTy->getMinNumElements())
623  return None;
624 
625  // The intrinsic is extracting a fixed lane so use an extract instead.
626  auto *IdxTy = Type::getInt64Ty(II.getContext());
627  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
628  Extract->insertBefore(&II);
629  Extract->takeName(&II);
630  return IC.replaceInstUsesWith(II, Extract);
631 }
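// For example, with a ptrue of pattern vl4 on <vscale x 4 x i32>, lastb
// extracts lane 3 and becomes a plain extractelement; lasta would need lane 4,
// which is rejected above because it is not below the known minimum vector
// length.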
632 
633 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
634  IntrinsicInst &II) {
635  LLVMContext &Ctx = II.getContext();
636  IRBuilder<> Builder(Ctx);
637  Builder.SetInsertPoint(&II);
638  // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
639  // can work with RDFFR_PP for ptest elimination.
640  auto *AllPat =
641  ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
642  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
643  {II.getType()}, {AllPat});
644  auto *RDFFR =
645  Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
646  RDFFR->takeName(&II);
647  return IC.replaceInstUsesWith(II, RDFFR);
648 }
649 
650 static Optional<Instruction *>
651 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
652  const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
653 
654  if (Pattern == AArch64SVEPredPattern::all) {
655  LLVMContext &Ctx = II.getContext();
656  IRBuilder<> Builder(Ctx);
657  Builder.SetInsertPoint(&II);
658 
659  Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
660  auto *VScale = Builder.CreateVScale(StepVal);
661  VScale->takeName(&II);
662  return IC.replaceInstUsesWith(II, VScale);
663  }
664 
665  unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
666 
667  return MinNumElts && NumElts >= MinNumElts
668  ? Optional<Instruction *>(IC.replaceInstUsesWith(
669  II, ConstantInt::get(II.getType(), MinNumElts)))
670  : None;
671 }
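// For example, cntw(all) becomes vscale * 4, and cntb(vl16) becomes the
// constant 16, since a byte predicate is known to cover at least 16 elements.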
672 
673 static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
674  IntrinsicInst &II) {
675  IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
676  IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
677 
678  if (Op1 && Op2 &&
679  Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
680  Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
681  Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
682 
683  IRBuilder<> Builder(II.getContext());
684  Builder.SetInsertPoint(&II);
685 
686  Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
687  Type *Tys[] = {Op1->getArgOperand(0)->getType()};
688 
689  auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
690 
691  PTest->takeName(&II);
692  return IC.replaceInstUsesWith(II, PTest);
693  }
694 
695  return None;
696 }
697 
698 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
699  switch (Intrinsic) {
700  case Intrinsic::aarch64_sve_fmul:
701  return Instruction::BinaryOps::FMul;
702  case Intrinsic::aarch64_sve_fadd:
703  return Instruction::BinaryOps::FAdd;
704  case Intrinsic::aarch64_sve_fsub:
705  return Instruction::BinaryOps::FSub;
706  default:
707  return Instruction::BinaryOpsEnd;
708  }
709 }
710 
711 static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC,
712  IntrinsicInst &II) {
713  auto *OpPredicate = II.getOperand(0);
714  auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
715  if (BinOpCode == Instruction::BinaryOpsEnd ||
716  !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
717  m_ConstantInt<AArch64SVEPredPattern::all>())))
718  return None;
719  IRBuilder<> Builder(II.getContext());
720  Builder.SetInsertPoint(&II);
721  Builder.setFastMathFlags(II.getFastMathFlags());
722  auto BinOp =
723  Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
724  return IC.replaceInstUsesWith(II, BinOp);
725 }
726 
727 static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
728  IntrinsicInst &II) {
729  auto *OpPredicate = II.getOperand(0);
730  auto *OpMultiplicand = II.getOperand(1);
731  auto *OpMultiplier = II.getOperand(2);
732 
733  IRBuilder<> Builder(II.getContext());
734  Builder.SetInsertPoint(&II);
735 
736  // Return true if a given instruction is a unit splat value, false otherwise.
737  auto IsUnitSplat = [](auto *I) {
738  auto *SplatValue = getSplatValue(I);
739  if (!SplatValue)
740  return false;
741  return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
742  };
743 
744  // Return true if a given instruction is an aarch64_sve_dup intrinsic call
745  // with a unit splat value, false otherwise.
746  auto IsUnitDup = [](auto *I) {
747  auto *IntrI = dyn_cast<IntrinsicInst>(I);
748  if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
749  return false;
750 
751  auto *SplatValue = IntrI->getOperand(2);
752  return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
753  };
754 
755  // The OpMultiplier variable should always point to the dup (if any), so
756  // swap if necessary.
757  if (IsUnitDup(OpMultiplicand) || IsUnitSplat(OpMultiplicand))
758  std::swap(OpMultiplier, OpMultiplicand);
759 
760  if (IsUnitSplat(OpMultiplier)) {
761  // [f]mul pg (dupx 1) %n => %n
762  OpMultiplicand->takeName(&II);
763  return IC.replaceInstUsesWith(II, OpMultiplicand);
764  } else if (IsUnitDup(OpMultiplier)) {
765  // [f]mul pg (dup pg 1) %n => %n
766  auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
767  auto *DupPg = DupInst->getOperand(1);
768  // TODO: this is naive. The optimization is still valid if DupPg
769  // 'encompasses' OpPredicate, not only if they're the same predicate.
770  if (OpPredicate == DupPg) {
771  OpMultiplicand->takeName(&II);
772  return IC.replaceInstUsesWith(II, OpMultiplicand);
773  }
774  }
775 
776  return instCombineSVEVectorBinOp(IC, II);
777 }
778 
779 static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
780  IntrinsicInst &II) {
781  IRBuilder<> Builder(II.getContext());
782  Builder.SetInsertPoint(&II);
783  Value *UnpackArg = II.getArgOperand(0);
784  auto *RetTy = cast<ScalableVectorType>(II.getType());
785  bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
786  II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
787 
788  // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
789  // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
790  if (auto *ScalarArg = getSplatValue(UnpackArg)) {
791  ScalarArg =
792  Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
793  Value *NewVal =
794  Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
795  NewVal->takeName(&II);
796  return IC.replaceInstUsesWith(II, NewVal);
797  }
798 
799  return None;
800 }
801 static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
802  IntrinsicInst &II) {
803  auto *OpVal = II.getOperand(0);
804  auto *OpIndices = II.getOperand(1);
805  VectorType *VTy = cast<VectorType>(II.getType());
806 
807  // Check whether OpIndices is a constant splat value < minimal element count
808  // of result.
809  auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
810  if (!SplatValue ||
811  SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
812  return None;
813 
814  // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
815  // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
816  IRBuilder<> Builder(II.getContext());
817  Builder.SetInsertPoint(&II);
818  auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
819  auto *VectorSplat =
820  Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
821 
822  VectorSplat->takeName(&II);
823  return IC.replaceInstUsesWith(II, VectorSplat);
824 }
825 
826 static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC,
827  IntrinsicInst &II) {
828  // Try to remove sequences of tuple get/set.
829  Value *SetTuple, *SetIndex, *SetValue;
830  auto *GetTuple = II.getArgOperand(0);
831  auto *GetIndex = II.getArgOperand(1);
832  // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a
833  // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue).
834  // Make sure that the types of the current intrinsic and SetValue match
835  // in order to safely remove the sequence.
836  if (!match(GetTuple,
837  m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>(
838  m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) ||
839  SetValue->getType() != II.getType())
840  return None;
841  // Case where we get the same index right after setting it.
842  // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue
843  if (GetIndex == SetIndex)
844  return IC.replaceInstUsesWith(II, SetValue);
845  // If we are getting a different index than what was set in the tuple_set
846  // intrinsic, we can just set the input tuple to the one up in the chain.
847  // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex)
848  // --> tuple_get(SetTuple, GetIndex)
849  return IC.replaceOperand(II, 0, SetTuple);
850 }
851 
852 static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
853  IntrinsicInst &II) {
854  // zip1(uzp1(A, B), uzp2(A, B)) --> A
855  // zip2(uzp1(A, B), uzp2(A, B)) --> B
856  Value *A, *B;
857  if (match(II.getArgOperand(0),
858  m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
859  match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
860  m_Specific(A), m_Specific(B))))
861  return IC.replaceInstUsesWith(
862  II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
863 
864  return None;
865 }
866 
867 Optional<Instruction *>
868 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
869  IntrinsicInst &II) const {
870  Intrinsic::ID IID = II.getIntrinsicID();
871  switch (IID) {
872  default:
873  break;
874  case Intrinsic::aarch64_sve_convert_from_svbool:
875  return instCombineConvertFromSVBool(IC, II);
876  case Intrinsic::aarch64_sve_dup:
877  return instCombineSVEDup(IC, II);
878  case Intrinsic::aarch64_sve_dup_x:
879  return instCombineSVEDupX(IC, II);
880  case Intrinsic::aarch64_sve_cmpne:
881  case Intrinsic::aarch64_sve_cmpne_wide:
882  return instCombineSVECmpNE(IC, II);
883  case Intrinsic::aarch64_sve_rdffr:
884  return instCombineRDFFR(IC, II);
885  case Intrinsic::aarch64_sve_lasta:
886  case Intrinsic::aarch64_sve_lastb:
887  return instCombineSVELast(IC, II);
888  case Intrinsic::aarch64_sve_cntd:
889  return instCombineSVECntElts(IC, II, 2);
890  case Intrinsic::aarch64_sve_cntw:
891  return instCombineSVECntElts(IC, II, 4);
892  case Intrinsic::aarch64_sve_cnth:
893  return instCombineSVECntElts(IC, II, 8);
894  case Intrinsic::aarch64_sve_cntb:
895  return instCombineSVECntElts(IC, II, 16);
896  case Intrinsic::aarch64_sve_ptest_any:
897  case Intrinsic::aarch64_sve_ptest_first:
898  case Intrinsic::aarch64_sve_ptest_last:
899  return instCombineSVEPTest(IC, II);
900  case Intrinsic::aarch64_sve_mul:
901  case Intrinsic::aarch64_sve_fmul:
902  return instCombineSVEVectorMul(IC, II);
903  case Intrinsic::aarch64_sve_fadd:
904  case Intrinsic::aarch64_sve_fsub:
905  return instCombineSVEVectorBinOp(IC, II);
906  case Intrinsic::aarch64_sve_tbl:
907  return instCombineSVETBL(IC, II);
908  case Intrinsic::aarch64_sve_uunpkhi:
909  case Intrinsic::aarch64_sve_uunpklo:
910  case Intrinsic::aarch64_sve_sunpkhi:
911  case Intrinsic::aarch64_sve_sunpklo:
912  return instCombineSVEUnpack(IC, II);
913  case Intrinsic::aarch64_sve_tuple_get:
914  return instCombineSVETupleGet(IC, II);
915  case Intrinsic::aarch64_sve_zip1:
916  case Intrinsic::aarch64_sve_zip2:
917  return instCombineSVEZip(IC, II);
918  }
919 
920  return None;
921 }
922 
923 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
924  ArrayRef<const Value *> Args) {
925 
926  // A helper that returns a vector type from the given type. The number of
927  // elements in type Ty determines the vector width.
928  auto toVectorTy = [&](Type *ArgTy) {
929  return VectorType::get(ArgTy->getScalarType(),
930  cast<VectorType>(DstTy)->getElementCount());
931  };
932 
933  // Exit early if DstTy is not a vector type whose elements are at least
934  // 16-bits wide.
935  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
936  return false;
937 
938  // Determine if the operation has a widening variant. We consider both the
939  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
940  // instructions.
941  //
942  // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
943  // verify that their extending operands are eliminated during code
944  // generation.
945  switch (Opcode) {
946  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
947  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
948  break;
949  default:
950  return false;
951  }
952 
953  // To be a widening instruction (either the "wide" or "long" versions), the
954  // second operand must be a sign- or zero extend having a single user. We
955  // only consider extends having a single user because they may otherwise not
956  // be eliminated.
957  if (Args.size() != 2 ||
958  (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
959  !Args[1]->hasOneUse())
960  return false;
961  auto *Extend = cast<CastInst>(Args[1]);
962 
963  // Legalize the destination type and ensure it can be used in a widening
964  // operation.
965  auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
966  unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
967  if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
968  return false;
969 
970  // Legalize the source type and ensure it can be used in a widening
971  // operation.
972  auto *SrcTy = toVectorTy(Extend->getSrcTy());
973  auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
974  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
975  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
976  return false;
977 
978  // Get the total number of vector elements in the legalized types.
979  InstructionCost NumDstEls =
980  DstTyL.first * DstTyL.second.getVectorMinNumElements();
981  InstructionCost NumSrcEls =
982  SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
983 
984  // Return true if the legalized types have the same number of vector elements
985  // and the destination element type size is twice that of the source type.
986  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
987 }
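// For example, add <8 x i16> %a, (sext <8 x i8> %b to <8 x i16>) can be
// selected as SADDW/SADDL: both legalized types have eight elements and the
// destination elements are twice as wide as the source, so this returns true.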
988 
989 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
990  Type *Src,
991  TTI::CastContextHint CCH,
992  TTI::TargetCostKind CostKind,
993  const Instruction *I) {
994  int ISD = TLI->InstructionOpcodeToISD(Opcode);
995  assert(ISD && "Invalid opcode");
996 
997  // If the cast is observable, and it is used by a widening instruction (e.g.,
998  // uaddl, saddw, etc.), it may be free.
999  if (I && I->hasOneUse()) {
1000  auto *SingleUser = cast<Instruction>(*I->user_begin());
1001  SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
1002  if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
1003  // If the cast is the second operand, it is free. We will generate either
1004  // a "wide" or "long" version of the widening instruction.
1005  if (I == SingleUser->getOperand(1))
1006  return 0;
1007  // If the cast is not the second operand, it will be free if it looks the
1008  // same as the second operand. In this case, we will generate a "long"
1009  // version of the widening instruction.
1010  if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
1011  if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
1012  cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
1013  return 0;
1014  }
1015  }
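// For example, a sext from <8 x i8> to <8 x i16> whose only user is the
// second operand of an add is folded into a saddw/saddl, so the cast itself
// is reported as free here.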
1016 
1017  // TODO: Allow non-throughput costs that aren't binary.
1018  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1019  if (CostKind != TTI::TCK_RecipThroughput)
1020  return Cost == 0 ? 0 : 1;
1021  return Cost;
1022  };
1023 
1024  EVT SrcTy = TLI->getValueType(DL, Src);
1025  EVT DstTy = TLI->getValueType(DL, Dst);
1026 
1027  if (!SrcTy.isSimple() || !DstTy.isSimple())
1028  return AdjustCost(
1029  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1030 
1031  static const TypeConversionCostTblEntry
1032  ConversionTbl[] = {
1037 
1038  // Truncations on nxvmiN
1055 
1056  // The number of shll instructions for the extension.
1073 
1074  // LowerVectorINT_TO_FP:
1081 
1082  // Complex: to v2f32
1089 
1090  // Complex: to v4f32
1095 
1096  // Complex: to v8f32
1101 
1102  // Complex: to v16f32
1105 
1106  // Complex: to v2f64
1113 
1114 
1115  // LowerVectorFP_TO_INT
1122 
1123  // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
1130 
1131  // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
1136 
1137  // Complex, from nxv2f32.
1146 
1147  // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
1154 
1155  // Complex, from nxv2f64.
1164 
1165  // Complex, from nxv4f32.
1174 
1175  // Complex, from nxv8f64. Illegal -> illegal conversions not required.
1180 
1181  // Complex, from nxv4f64. Illegal -> illegal conversions not required.
1188 
1189  // Complex, from nxv8f32. Illegal -> illegal conversions not required.
1194 
1195  // Complex, from nxv8f16.
1204 
1205  // Complex, from nxv4f16.
1214 
1215  // Complex, from nxv2f16.
1224 
1225  // Truncate from nxvmf32 to nxvmf16.
1229 
1230  // Truncate from nxvmf64 to nxvmf16.
1234 
1235  // Truncate from nxvmf64 to nxvmf32.
1239 
1240  // Extend from nxvmf16 to nxvmf32.
1244 
1245  // Extend from nxvmf16 to nxvmf64.
1249 
1250  // Extend from nxvmf32 to nxvmf64.
1254 
1255  };
1256 
1257  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
1258  DstTy.getSimpleVT(),
1259  SrcTy.getSimpleVT()))
1260  return AdjustCost(Entry->Cost);
1261 
1262  return AdjustCost(
1263  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1264 }
1265 
1266 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
1267  Type *Dst,
1268  VectorType *VecTy,
1269  unsigned Index) {
1270 
1271  // Make sure we were given a valid extend opcode.
1272  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
1273  "Invalid opcode");
1274 
1275  // We are extending an element we extract from a vector, so the source type
1276  // of the extend is the element type of the vector.
1277  auto *Src = VecTy->getElementType();
1278 
1279  // Sign- and zero-extends are for integer types only.
1280  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
1281 
1282  // Get the cost for the extract. We compute the cost (if any) for the extend
1283  // below.
1284  InstructionCost Cost =
1285  getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
1286 
1287  // Legalize the types.
1288  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
1289  auto DstVT = TLI->getValueType(DL, Dst);
1290  auto SrcVT = TLI->getValueType(DL, Src);
1292 
1293  // If the resulting type is still a vector and the destination type is legal,
1294  // we may get the extension for free. If not, get the default cost for the
1295  // extend.
1296  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
1297  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1298  CostKind);
1299 
1300  // The destination type should be larger than the element type. If not, get
1301  // the default cost for the extend.
1302  if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
1303  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1304  CostKind);
1305 
1306  switch (Opcode) {
1307  default:
1308  llvm_unreachable("Opcode should be either SExt or ZExt");
1309 
1310  // For sign-extends, we only need a smov, which performs the extension
1311  // automatically.
1312  case Instruction::SExt:
1313  return Cost;
1314 
1315  // For zero-extends, the extend is performed automatically by a umov unless
1316  // the destination type is i64 and the element type is i8 or i16.
1317  case Instruction::ZExt:
1318  if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
1319  return Cost;
1320  }
1321 
1322  // If we are unable to perform the extend for free, get the default cost.
1323  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1324  CostKind);
1325 }
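// For example, zero-extending an extracted i32 lane to i64 costs only the
// extract, because the 32-bit UMOV into a W register already clears the
// upper bits.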
1326 
1327 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
1328  TTI::TargetCostKind CostKind,
1329  const Instruction *I) {
1330  if (CostKind != TTI::TCK_RecipThroughput)
1331  return Opcode == Instruction::PHI ? 0 : 1;
1332  assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
1333  // Branches are assumed to be predicted.
1334  return 0;
1335 }
1336 
1337 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1338  unsigned Index) {
1339  assert(Val->isVectorTy() && "This must be a vector type");
1340 
1341  if (Index != -1U) {
1342  // Legalize the type.
1343  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1344 
1345  // This type is legalized to a scalar type.
1346  if (!LT.second.isVector())
1347  return 0;
1348 
1349  // The type may be split. Normalize the index to the new type.
1350  unsigned Width = LT.second.getVectorNumElements();
1351  Index = Index % Width;
1352 
1353  // The element at index zero is already inside the vector.
1354  if (Index == 0)
1355  return 0;
1356  }
1357 
1358  // All other insert/extracts cost this much.
1359  return ST->getVectorInsertExtractBaseCost();
1360 }
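// For example, inserting into or extracting from lane 0 of a legal vector is
// treated as free, while any other lane costs the subtarget-tuned
// insert/extract base cost.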
1361 
1362 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
1363  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1364  TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
1365  TTI::OperandValueProperties Opd1PropInfo,
1366  TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1367  const Instruction *CxtI) {
1368  // TODO: Handle more cost kinds.
1369  if (CostKind != TTI::TCK_RecipThroughput)
1370  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1371  Opd2Info, Opd1PropInfo,
1372  Opd2PropInfo, Args, CxtI);
1373 
1374  // Legalize the type.
1375  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1376 
1377  // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
1378  // add in the widening overhead specified by the sub-target. Since the
1379  // extends feeding widening instructions are performed automatically, they
1380  // aren't present in the generated code and have a zero cost. By adding a
1381  // widening overhead here, we attach the total cost of the combined operation
1382  // to the widening instruction.
1383  InstructionCost Cost = 0;
1384  if (isWideningInstruction(Ty, Opcode, Args))
1385  Cost += ST->getWideningBaseCost();
1386 
1387  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1388 
1389  switch (ISD) {
1390  default:
1391  return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1392  Opd2Info,
1393  Opd1PropInfo, Opd2PropInfo);
1394  case ISD::SDIV:
1395  if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
1396  Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
1397  // On AArch64, scalar signed division by a power-of-two constant is
1398  // normally expanded to the sequence ADD + CMP + SELECT + SRA.
1399  // The OperandValue properties may not be the same as those of the
1400  // previous operation; conservatively assume OP_None.
1401  Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
1402  Opd1Info, Opd2Info,
1405  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
1406  Opd1Info, Opd2Info,
1409  Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
1410  Opd1Info, Opd2Info,
1413  Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
1414  Opd1Info, Opd2Info,
1417  return Cost;
1418  }
1419  LLVM_FALLTHROUGH;
1420  case ISD::UDIV:
1421  if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
1422  auto VT = TLI->getValueType(DL, Ty);
1423  if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
1424  // Vector signed division by a constant is expanded to the
1425  // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
1426  // to MULHS + SUB + SRL + ADD + SRL.
1427  InstructionCost MulCost = getArithmeticInstrCost(
1428  Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
1430  InstructionCost AddCost = getArithmeticInstrCost(
1431  Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1433  InstructionCost ShrCost = getArithmeticInstrCost(
1434  Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
1436  return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
1437  }
1438  }
1439 
1440  Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1441  Opd2Info,
1442  Opd1PropInfo, Opd2PropInfo);
1443  if (Ty->isVectorTy()) {
1444  // On AArch64, vector divisions are not supported natively and are
1445  // expanded into scalar divisions of each pair of elements.
1446  Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
1447  Opd1Info, Opd2Info, Opd1PropInfo,
1448  Opd2PropInfo);
1449  Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
1450  Opd1Info, Opd2Info, Opd1PropInfo,
1451  Opd2PropInfo);
1452  // TODO: if one of the arguments is scalar, then it's not necessary to
1453  // double the cost of handling the vector elements.
1454  Cost += Cost;
1455  }
1456  return Cost;
1457 
1458  case ISD::MUL:
1459  if (LT.second != MVT::v2i64)
1460  return (Cost + 1) * LT.first;
1461  // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
1462  // as elements are extracted from the vectors and the muls scalarized.
1463  // As getScalarizationOverhead is a bit too pessimistic, we estimate the
1464  // cost for a i64 vector directly here, which is:
1465  // - four i64 extracts,
1466  // - two i64 inserts, and
1467  // - two muls.
1468  // So, for a v2i64 with LT.First = 1 the cost is 8, and for a v4i64 with
1469  // LT.first = 2 the cost is 16.
1470  return LT.first * 8;
1471  case ISD::ADD:
1472  case ISD::XOR:
1473  case ISD::OR:
1474  case ISD::AND:
1475  // These nodes are marked as 'custom' for combining purposes only.
1476  // We know that they are legal. See LowerAdd in ISelLowering.
1477  return (Cost + 1) * LT.first;
1478 
1479  case ISD::FADD:
1480  case ISD::FSUB:
1481  case ISD::FMUL:
1482  case ISD::FDIV:
1483  case ISD::FNEG:
1484  // These nodes are marked as 'custom' just to lower them to SVE.
1485  // We know said lowering will incur no additional cost.
1486  if (!Ty->getScalarType()->isFP128Ty())
1487  return (Cost + 2) * LT.first;
1488 
1489  return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1490  Opd2Info,
1491  Opd1PropInfo, Opd2PropInfo);
1492  }
1493 }
1494 
1495 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
1496  ScalarEvolution *SE,
1497  const SCEV *Ptr) {
1498  // Address computations in vectorized code with non-consecutive addresses will
1499  // likely result in more instructions compared to scalar code where the
1500  // computation can more often be merged into the index mode. The resulting
1501  // extra micro-ops can significantly decrease throughput.
1502  unsigned NumVectorInstToHideOverhead = 10;
1503  int MaxMergeDistance = 64;
1504 
1505  if (Ty->isVectorTy() && SE &&
1506  !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1507  return NumVectorInstToHideOverhead;
1508 
1509  // In many cases the address computation is not merged into the instruction
1510  // addressing mode.
1511  return 1;
1512 }
1513 
1514 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1515  Type *CondTy,
1516  CmpInst::Predicate VecPred,
1517  TTI::TargetCostKind CostKind,
1518  const Instruction *I) {
1519  // TODO: Handle other cost kinds.
1520  if (CostKind != TTI::TCK_RecipThroughput)
1521  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1522  I);
1523 
1524  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1525  // We don't lower some vector selects well when they are wider than the
1526  // register width.
1527  if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
1528  // We would need this many instructions to hide the scalarization happening.
1529  const int AmortizationCost = 20;
1530 
1531  // If VecPred is not set, check if we can get a predicate from the context
1532  // instruction, if its type matches the requested ValTy.
1533  if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
1534  CmpInst::Predicate CurrentPred;
1535  if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
1536  m_Value())))
1537  VecPred = CurrentPred;
1538  }
1539  // Check if we have a compare/select chain that can be lowered using CMxx &
1540  // BFI pair.
1541  if (CmpInst::isIntPredicate(VecPred)) {
1542  static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
1543  MVT::v8i16, MVT::v2i32, MVT::v4i32,
1544  MVT::v2i64};
1545  auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
1546  if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
1547  return LT.first;
1548  }
1549 
1550  static const TypeConversionCostTblEntry
1551  VectorSelectTbl[] = {
1553  { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
1555  { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
1556  { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
1557  { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
1558  };
1559 
1560  EVT SelCondTy = TLI->getValueType(DL, CondTy);
1561  EVT SelValTy = TLI->getValueType(DL, ValTy);
1562  if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1563  if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
1564  SelCondTy.getSimpleVT(),
1565  SelValTy.getSimpleVT()))
1566  return Entry->Cost;
1567  }
1568  }
1569  // The base case handles scalable vectors fine for now, since it treats the
1570  // cost as 1 * legalization cost.
1571  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1572 }
1573 
1574 TTI::MemCmpExpansionOptions
1575 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
1576  TTI::MemCmpExpansionOptions Options;
1577  if (ST->requiresStrictAlign()) {
1578  // TODO: Add cost modeling for strict align. Misaligned loads expand to
1579  // a bunch of instructions when strict align is enabled.
1580  return Options;
1581  }
1582  Options.AllowOverlappingLoads = true;
1583  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
1584  Options.NumLoadsPerBlock = Options.MaxNumLoads;
1585  // TODO: Though vector loads usually perform well on AArch64, in some targets
1586  // they may wake up the FP unit, which raises the power consumption. Perhaps
1587  // they could be used with no holds barred (-O3).
1588  Options.LoadSizes = {8, 4, 2, 1};
1589  return Options;
1590 }
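// With these options, a 15-byte memcmp can typically be expanded into two
// overlapping 8-byte loads per buffer instead of a libcall, since overlapping
// loads are allowed and 8 is the largest load size.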
1591 
1592 InstructionCost
1593 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1594  Align Alignment, unsigned AddressSpace,
1595  TTI::TargetCostKind CostKind) {
1596  if (!isa<ScalableVectorType>(Src))
1597  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1598  CostKind);
1599  auto LT = TLI->getTypeLegalizationCost(DL, Src);
1600  if (!LT.first.isValid())
1601  return InstructionCost::getInvalid();
1602 
1603  // The code-generator is currently not able to handle scalable vectors
1604  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1605  // it. This change will be removed when code-generation for these types is
1606  // sufficiently reliable.
1607  if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
1608  return InstructionCost::getInvalid();
1609 
1610  return LT.first * 2;
1611 }
1612 
1613 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
1614  unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1615  Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1616  if (useNeonVector(DataTy))
1617  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1618  Alignment, CostKind, I);
1619  auto *VT = cast<VectorType>(DataTy);
1620  auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
1621  if (!LT.first.isValid())
1622  return InstructionCost::getInvalid();
1623 
1624  // The code-generator is currently not able to handle scalable vectors
1625  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1626  // it. This change will be removed when code-generation for these types is
1627  // sufficiently reliable.
1628  if (cast<VectorType>(DataTy)->getElementCount() ==
1629  ElementCount::getScalable(1))
1630  return InstructionCost::getInvalid();
1631 
1632  ElementCount LegalVF = LT.second.getVectorElementCount();
1633  InstructionCost MemOpCost =
1634  getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
1635  return LT.first * MemOpCost * getMaxNumElements(LegalVF);
1636 }
1637 
1638 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
1639  return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
1640 }
1641 
1642 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
1643  MaybeAlign Alignment,
1644  unsigned AddressSpace,
1645  TTI::TargetCostKind CostKind,
1646  const Instruction *I) {
1647  EVT VT = TLI->getValueType(DL, Ty, true);
1648  // Type legalization can't handle structs
1649  if (VT == MVT::Other)
1650  return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
1651  CostKind);
1652 
1653  auto LT = TLI->getTypeLegalizationCost(DL, Ty);
1654  if (!LT.first.isValid())
1655  return InstructionCost::getInvalid();
1656 
1657  // The code-generator is currently not able to handle scalable vectors
1658  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1659  // it. This change will be removed when code-generation for these types is
1660  // sufficiently reliable.
1661  if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
1662  if (VTy->getElementCount() == ElementCount::getScalable(1))
1663  return InstructionCost::getInvalid();
1664 
1665  // TODO: consider latency as well for TCK_SizeAndLatency.
1667  return LT.first;
1668 
1670  return 1;
1671 
1672  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
1673  LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
1674  // Unaligned stores are extremely inefficient. We don't split all
1675  // unaligned 128-bit stores because of the negative impact that has been
1676  // shown in practice on inlined block copy code.
1677  // We make such stores expensive so that we will only vectorize if there
1678  // are 6 other instructions getting vectorized.
1679  const int AmortizationCost = 6;
1680 
1681  return LT.first * 2 * AmortizationCost;
1682  }
1683 
1684  // Check truncating stores and extending loads.
1685  if (useNeonVector(Ty) &&
1686  Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
1687  // v4i8 types are lowered to a scalar load/store and sshll/xtn.
1688  if (VT == MVT::v4i8)
1689  return 2;
1690  // Otherwise we need to scalarize.
1691  return cast<FixedVectorType>(Ty)->getNumElements() * 2;
1692  }
1693 
1694  return LT.first;
1695 }
1696 
1697 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
1698  unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1699  Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1700  bool UseMaskForCond, bool UseMaskForGaps) {
1701  assert(Factor >= 2 && "Invalid interleave factor");
1702  auto *VecVTy = cast<FixedVectorType>(VecTy);
1703 
1704  if (!UseMaskForCond && !UseMaskForGaps &&
1705  Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1706  unsigned NumElts = VecVTy->getNumElements();
1707  auto *SubVecTy =
1708  FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1709 
1710  // ldN/stN only support legal vector types of size 64 or 128 in bits.
1711  // Accesses having vector types that are a multiple of 128 bits can be
1712  // matched to more than one ldN/stN instruction.
1713  if (NumElts % Factor == 0 &&
1714  TLI->isLegalInterleavedAccessType(SubVecTy, DL))
1715  return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1716  }
1717 
1718  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1719  Alignment, AddressSpace, CostKind,
1720  UseMaskForCond, UseMaskForGaps);
1721 }
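// For example, a factor-2 interleaved load of <8 x i32> uses a sub-vector
// type of <4 x i32>, a legal 128-bit type, so the cost is one ld2, i.e.
// Factor * 1 = 2.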
1722 
1723 InstructionCost
1724 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
1725  InstructionCost Cost = 0;
1726  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1727  for (auto *I : Tys) {
1728  if (!I->isVectorTy())
1729  continue;
1730  if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
1731  128)
1732  Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
1733  getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
1734  }
1735  return Cost;
1736 }
1737 
1738 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
1739  return ST->getMaxInterleaveFactor();
1740 }
1741 
1742 // For Falkor, we want to avoid having too many strided loads in a loop since
1743 // that can exhaust the HW prefetcher resources. We adjust the unroller
1744 // MaxCount preference below to attempt to ensure unrolling doesn't create too
1745 // many strided loads.
1746 static void
1747 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1748  TargetTransformInfo::UnrollingPreferences &UP) {
1749  enum { MaxStridedLoads = 7 };
1750  auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
1751  int StridedLoads = 0;
1752  // FIXME? We could make this more precise by looking at the CFG and
1753  // e.g. not counting loads in each side of an if-then-else diamond.
1754  for (const auto BB : L->blocks()) {
1755  for (auto &I : *BB) {
1756  LoadInst *LMemI = dyn_cast<LoadInst>(&I);
1757  if (!LMemI)
1758  continue;
1759 
1760  Value *PtrValue = LMemI->getPointerOperand();
1761  if (L->isLoopInvariant(PtrValue))
1762  continue;
1763 
1764  const SCEV *LSCEV = SE.getSCEV(PtrValue);
1765  const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
1766  if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
1767  continue;
1768 
1769  // FIXME? We could take pairing of unrolled load copies into account
1770  // by looking at the AddRec, but we would probably have to limit this
1771  // to loops with no stores or other memory optimization barriers.
1772  ++StridedLoads;
1773  // We've seen enough strided loads that seeing more won't make a
1774  // difference.
1775  if (StridedLoads > MaxStridedLoads / 2)
1776  return StridedLoads;
1777  }
1778  }
1779  return StridedLoads;
1780  };
1781 
1782  int StridedLoads = countStridedLoads(L, SE);
1783  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
1784  << " strided loads\n");
1785  // Pick the largest power of 2 unroll count that won't result in too many
1786  // strided loads.
1787  if (StridedLoads) {
1788  UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
1789  LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
1790  << UP.MaxCount << '\n');
1791  }
1792 }
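// Worked example for getFalkorUnrollingPreferences above (hypothetical loop,
// for illustration only): with MaxStridedLoads == 7, a loop body containing
// 3 affine strided loads gets UP.MaxCount = 1 << Log2_32(7 / 3) == 2, so
// unrolling cannot push the total number of strided loads much beyond the
// prefetcher limit.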
1793 
1794 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1795  TTI::UnrollingPreferences &UP,
1796  OptimizationRemarkEmitter *ORE) {
1797  // Enable partial unrolling and runtime unrolling.
1798  BaseT::getUnrollingPreferences(L, SE, UP, ORE);
1799 
1800  UP.UpperBound = true;
1801 
1802  // An inner loop is more likely to be hot, and its runtime check can be
1803  // hoisted out by the LICM pass, so the overhead is lower; try a larger
1804  // threshold to unroll more loops.
1805  if (L->getLoopDepth() > 1)
1806  UP.PartialThreshold *= 2;
1807 
1808  // Disable partial & runtime unrolling on -Os.
1809  UP.PartialOptSizeThreshold = 0;
1810 
1811  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
1812  EnableFalkorHWPFUnrollFix)
1813  getFalkorUnrollingPreferences(L, SE, UP);
1814 
1815  // Scan the loop: don't unroll loops with calls as this could prevent
1816  // inlining. Don't unroll vector loops either, as they don't benefit much from
1817  // unrolling.
1818  for (auto *BB : L->getBlocks()) {
1819  for (auto &I : *BB) {
1820  // Don't unroll vectorised loops.
1821  if (I.getType()->isVectorTy())
1822  return;
1823 
1824  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1825  if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1826  if (!isLoweredToCall(F))
1827  continue;
1828  }
1829  return;
1830  }
1831  }
1832  }
1833 
1834  // Enable runtime unrolling for in-order models.
1835  // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
1836  // checking for that case, we can ensure that the default behaviour is
1837  // unchanged.
1838  if (ST->getProcFamily() != AArch64Subtarget::Others &&
1839  !ST->getSchedModel().isOutOfOrder()) {
1840  UP.Runtime = true;
1841  UP.Partial = true;
1842  UP.UnrollRemainder = true;
1843  UP.DefaultUnrollRuntimeCount = 4;
1844 
1845  UP.UnrollAndJam = true;
1846  UP.UnrollAndJamInnerLoopThreshold = 60;
1847  }
1848 }
1849 
1852  BaseT::getPeelingPreferences(L, SE, PP);
1853 }
1854 
1855 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1856  Type *ExpectedType) {
1857  switch (Inst->getIntrinsicID()) {
1858  default:
1859  return nullptr;
1860  case Intrinsic::aarch64_neon_st2:
1861  case Intrinsic::aarch64_neon_st3:
1862  case Intrinsic::aarch64_neon_st4: {
1863  // Create a struct type
1864  StructType *ST = dyn_cast<StructType>(ExpectedType);
1865  if (!ST)
1866  return nullptr;
1867  unsigned NumElts = Inst->arg_size() - 1;
1868  if (ST->getNumElements() != NumElts)
1869  return nullptr;
1870  for (unsigned i = 0, e = NumElts; i != e; ++i) {
1871  if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
1872  return nullptr;
1873  }
1874  Value *Res = UndefValue::get(ExpectedType);
1875  IRBuilder<> Builder(Inst);
1876  for (unsigned i = 0, e = NumElts; i != e; ++i) {
1877  Value *L = Inst->getArgOperand(i);
1878  Res = Builder.CreateInsertValue(Res, L, i);
1879  }
1880  return Res;
1881  }
1882  case Intrinsic::aarch64_neon_ld2:
1883  case Intrinsic::aarch64_neon_ld3:
1884  case Intrinsic::aarch64_neon_ld4:
1885  if (Inst->getType() == ExpectedType)
1886  return Inst;
1887  return nullptr;
1888  }
1889 }
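// Illustrative IR for the st2 case above (operand types and the exact
// intrinsic mangling suffix are assumed, not taken from the listing): given
//   call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %a, <4 x i32> %b, i8* %p)
// and ExpectedType == { <4 x i32>, <4 x i32> }, the two data operands are
// inserted into an undef struct so the stored value can stand in for the
// result of a matching ld2.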
1890 
1891 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
1892  MemIntrinsicInfo &Info) {
1893  switch (Inst->getIntrinsicID()) {
1894  default:
1895  break;
1896  case Intrinsic::aarch64_neon_ld2:
1897  case Intrinsic::aarch64_neon_ld3:
1898  case Intrinsic::aarch64_neon_ld4:
1899  Info.ReadMem = true;
1900  Info.WriteMem = false;
1901  Info.PtrVal = Inst->getArgOperand(0);
1902  break;
1903  case Intrinsic::aarch64_neon_st2:
1904  case Intrinsic::aarch64_neon_st3:
1905  case Intrinsic::aarch64_neon_st4:
1906  Info.ReadMem = false;
1907  Info.WriteMem = true;
1908  Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
1909  break;
1910  }
1911 
1912  switch (Inst->getIntrinsicID()) {
1913  default:
1914  return false;
1915  case Intrinsic::aarch64_neon_ld2:
1916  case Intrinsic::aarch64_neon_st2:
1917  Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
1918  break;
1919  case Intrinsic::aarch64_neon_ld3:
1920  case Intrinsic::aarch64_neon_st3:
1921  Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
1922  break;
1923  case Intrinsic::aarch64_neon_ld4:
1924  case Intrinsic::aarch64_neon_st4:
1925  Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
1926  break;
1927  }
1928  return true;
1929 }
1930 
1931 /// See if \p I should be considered for address type promotion. We check if
1932 /// \p I is a sext with the right type and used in memory accesses. If it is
1933 /// used in a "complex" getelementptr, we allow it to be promoted without
1934 /// finding other sext instructions that sign extended the same initial value.
1935 /// A getelementptr is considered "complex" if it has more than 2 operands.
1936 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
1937  const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
1938  bool Considerable = false;
1939  AllowPromotionWithoutCommonHeader = false;
1940  if (!isa<SExtInst>(&I))
1941  return false;
1942  Type *ConsideredSExtType =
1943  Type::getInt64Ty(I.getParent()->getParent()->getContext());
1944  if (I.getType() != ConsideredSExtType)
1945  return false;
1946  // See if the sext is the one with the right type and used in at least one
1947  // GetElementPtrInst.
1948  for (const User *U : I.users()) {
1949  if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
1950  Considerable = true;
1951  // A getelementptr is considered "complex" if it has more than 2
1952  // operands. We will promote a SExt used in such a complex GEP, as we
1953  // expect some of the computation to be merged if it is done on 64 bits.
1954  if (GEPInst->getNumOperands() > 2) {
1955  AllowPromotionWithoutCommonHeader = true;
1956  break;
1957  }
1958  }
1959  }
1960  return Considerable;
1961 }
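// Illustrative IR for the rule above (value names assumed):
//   %idx.ext = sext i32 %idx to i64
//   %gep = getelementptr inbounds [64 x i32], [64 x i32]* %base, i64 0, i64 %idx.ext
// The GEP has more than 2 operands (the pointer plus two indices), so the
// sext is considered for promotion even without a common header shared with
// other sext instructions.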
1962 
1963 bool AArch64TTIImpl::isLegalToVectorizeReduction(
1964  const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
1965  if (!VF.isScalable())
1966  return true;
1967 
1968  Type *Ty = RdxDesc.getRecurrenceType();
1969  if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
1970  return false;
1971 
1972  switch (RdxDesc.getRecurrenceKind()) {
1973  case RecurKind::Add:
1974  case RecurKind::FAdd:
1975  case RecurKind::And:
1976  case RecurKind::Or:
1977  case RecurKind::Xor:
1978  case RecurKind::SMin:
1979  case RecurKind::SMax:
1980  case RecurKind::UMin:
1981  case RecurKind::UMax:
1982  case RecurKind::FMin:
1983  case RecurKind::FMax:
1984  case RecurKind::SelectICmp:
1985  case RecurKind::SelectFCmp:
1986  return true;
1987  default:
1988  return false;
1989  }
1990 }
1991 
1992 InstructionCost
1993 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
1994  bool IsUnsigned,
1995  TTI::TargetCostKind CostKind) {
1996  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1997 
1998  if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
1999  return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
2000 
2001  assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
2002  "Both vectors need to be equally scalable");
2003 
2004  InstructionCost LegalizationCost = 0;
2005  if (LT.first > 1) {
2006  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2007  unsigned MinMaxOpcode =
2008  Ty->isFPOrFPVectorTy()
2009  ? Intrinsic::maxnum
2010  : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
2011  IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
2012  LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
2013  }
2014 
2015  return LegalizationCost + /*Cost of horizontal reduction*/ 2;
2016 }
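// Worked example for getMinMaxReductionCost above (types assumed, for
// illustration only): an smin reduction over <8 x i32> legalizes to two
// v4i32 halves (LT.first == 2), so the cost is one smin intrinsic combining
// the halves plus 2 for the final horizontal reduction (sminv).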
2017 
2018 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
2019  unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
2020  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2021  InstructionCost LegalizationCost = 0;
2022  if (LT.first > 1) {
2023  Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
2024  LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
2025  LegalizationCost *= LT.first - 1;
2026  }
2027 
2028  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2029  assert(ISD && "Invalid opcode");
2030  // Add the final reduction cost for the legal horizontal reduction
2031  switch (ISD) {
2032  case ISD::ADD:
2033  case ISD::AND:
2034  case ISD::OR:
2035  case ISD::XOR:
2036  case ISD::FADD:
2037  return LegalizationCost + 2;
2038  default:
2039  return InstructionCost::getInvalid();
2040  }
2041 }
2042 
2043 InstructionCost
2044 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
2045  Optional<FastMathFlags> FMF,
2046  TTI::TargetCostKind CostKind) {
2047  if (TTI::requiresOrderedReduction(FMF)) {
2048  if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
2049  InstructionCost BaseCost =
2050  BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2051  // Add on extra cost to reflect the extra overhead on some CPUs. We still
2052  // end up vectorizing for more computationally intensive loops.
2053  return BaseCost + FixedVTy->getNumElements();
2054  }
2055 
2056  if (Opcode != Instruction::FAdd)
2057  return InstructionCost::getInvalid();
2058 
2059  auto *VTy = cast<ScalableVectorType>(ValTy);
2060  InstructionCost Cost =
2061  getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
2062  Cost *= getMaxNumElements(VTy->getElementCount());
2063  return Cost;
2064  }
2065 
2066  if (isa<ScalableVectorType>(ValTy))
2067  return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
2068 
2069  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2070  MVT MTy = LT.second;
2071  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2072  assert(ISD && "Invalid opcode");
2073 
2074  // Horizontal adds can use the 'addv' instruction. We model the cost of these
2075  // instructions as twice a normal vector add, plus 1 for each additional
2076  // legalization step (LT.first - 1). This is the only arithmetic vector
2077  // reduction operation for which we have an instruction.
2078  // OR, XOR and AND costs should match the codegen from:
2079  // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
2080  // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
2081  // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
2082  static const CostTblEntry CostTblNoPairwise[]{
2083  {ISD::ADD, MVT::v8i8, 2},
2084  {ISD::ADD, MVT::v16i8, 2},
2085  {ISD::ADD, MVT::v4i16, 2},
2086  {ISD::ADD, MVT::v8i16, 2},
2087  {ISD::ADD, MVT::v4i32, 2},
2088  {ISD::OR, MVT::v8i8, 15},
2089  {ISD::OR, MVT::v16i8, 17},
2090  {ISD::OR, MVT::v4i16, 7},
2091  {ISD::OR, MVT::v8i16, 9},
2092  {ISD::OR, MVT::v2i32, 3},
2093  {ISD::OR, MVT::v4i32, 5},
2094  {ISD::OR, MVT::v2i64, 3},
2095  {ISD::XOR, MVT::v8i8, 15},
2096  {ISD::XOR, MVT::v16i8, 17},
2097  {ISD::XOR, MVT::v4i16, 7},
2098  {ISD::XOR, MVT::v8i16, 9},
2099  {ISD::XOR, MVT::v2i32, 3},
2100  {ISD::XOR, MVT::v4i32, 5},
2101  {ISD::XOR, MVT::v2i64, 3},
2102  {ISD::AND, MVT::v8i8, 15},
2103  {ISD::AND, MVT::v16i8, 17},
2104  {ISD::AND, MVT::v4i16, 7},
2105  {ISD::AND, MVT::v8i16, 9},
2106  {ISD::AND, MVT::v2i32, 3},
2107  {ISD::AND, MVT::v4i32, 5},
2108  {ISD::AND, MVT::v2i64, 3},
2109  };
2110  switch (ISD) {
2111  default:
2112  break;
2113  case ISD::ADD:
2114  if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
2115  return (LT.first - 1) + Entry->Cost;
2116  break;
2117  case ISD::XOR:
2118  case ISD::AND:
2119  case ISD::OR:
2120  const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
2121  if (!Entry)
2122  break;
2123  auto *ValVTy = cast<FixedVectorType>(ValTy);
2124  if (!ValVTy->getElementType()->isIntegerTy(1) &&
2125  MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
2126  isPowerOf2_32(ValVTy->getNumElements())) {
2127  InstructionCost ExtraCost = 0;
2128  if (LT.first != 1) {
2129  // Type needs to be split, so there is an extra cost of LT.first - 1
2130  // arithmetic ops.
2131  auto *Ty = FixedVectorType::get(ValTy->getElementType(),
2132  MTy.getVectorNumElements());
2133  ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
2134  ExtraCost *= LT.first - 1;
2135  }
2136  return Entry->Cost + ExtraCost;
2137  }
2138  break;
2139  }
2140  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2141 }
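// Worked example for getArithmeticReductionCost above (type assumed, for
// illustration only): an unordered add reduction of <16 x i8> is legal as-is
// (LT.first == 1) and matches the {ISD::ADD, MVT::v16i8, 2} entry, giving a
// cost of (1 - 1) + 2 == 2, i.e. one addv modelled as two vector adds.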
2142 
2143 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
2144  static const CostTblEntry ShuffleTbl[] = {
2145  { TTI::SK_Splice, MVT::nxv16i8, 1 },
2146  { TTI::SK_Splice, MVT::nxv8i16, 1 },
2147  { TTI::SK_Splice, MVT::nxv4i32, 1 },
2148  { TTI::SK_Splice, MVT::nxv2i64, 1 },
2149  { TTI::SK_Splice, MVT::nxv2f16, 1 },
2150  { TTI::SK_Splice, MVT::nxv4f16, 1 },
2151  { TTI::SK_Splice, MVT::nxv8f16, 1 },
2152  { TTI::SK_Splice, MVT::nxv2bf16, 1 },
2153  { TTI::SK_Splice, MVT::nxv4bf16, 1 },
2154  { TTI::SK_Splice, MVT::nxv8bf16, 1 },
2155  { TTI::SK_Splice, MVT::nxv2f32, 1 },
2156  { TTI::SK_Splice, MVT::nxv4f32, 1 },
2157  { TTI::SK_Splice, MVT::nxv2f64, 1 },
2158  };
2159 
2160  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2161  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
2162  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2163  EVT PromotedVT = LT.second.getScalarType() == MVT::i1
2164  ? TLI->getPromotedVTForPredicate(EVT(LT.second))
2165  : LT.second;
2166  Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
2167  InstructionCost LegalizationCost = 0;
2168  if (Index < 0) {
2169  LegalizationCost =
2170  getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
2171  CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2172  getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
2173  CmpInst::BAD_ICMP_PREDICATE, CostKind);
2174  }
2175 
2176  // Predicated splices are promoted during lowering; see AArch64ISelLowering.cpp.
2177  // The cost is computed on the promoted type.
2178  if (LT.second.getScalarType() == MVT::i1) {
2179  LegalizationCost +=
2180  getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
2182  getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
2184  }
2185  const auto *Entry =
2186  CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
2187  assert(Entry && "Illegal Type for Splice");
2188  LegalizationCost += Entry->Cost;
2189  return LegalizationCost * LT.first;
2190 }
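// Worked example for getSpliceCost above (type assumed, for illustration
// only): splicing two <vscale x 4 x i32> vectors at a non-negative index
// legalizes to nxv4i32 with LT.first == 1; the element type is not i1, so no
// compare/select or cast costs are added and the total is the single
// SK_Splice table cost of 1.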
2191 
2192 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
2193  VectorType *Tp,
2194  ArrayRef<int> Mask, int Index,
2195  VectorType *SubTp) {
2196  Kind = improveShuffleKindFromMask(Kind, Mask);
2197  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
2198  Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
2199  Kind == TTI::SK_Reverse) {
2200  static const CostTblEntry ShuffleTbl[] = {
2201  // Broadcast shuffle kinds can be performed with 'dup'.
2202  { TTI::SK_Broadcast, MVT::v8i8, 1 },
2203  { TTI::SK_Broadcast, MVT::v16i8, 1 },
2204  { TTI::SK_Broadcast, MVT::v4i16, 1 },
2205  { TTI::SK_Broadcast, MVT::v8i16, 1 },
2206  { TTI::SK_Broadcast, MVT::v2i32, 1 },
2207  { TTI::SK_Broadcast, MVT::v4i32, 1 },
2208  { TTI::SK_Broadcast, MVT::v2i64, 1 },
2209  { TTI::SK_Broadcast, MVT::v2f32, 1 },
2210  { TTI::SK_Broadcast, MVT::v4f32, 1 },
2211  { TTI::SK_Broadcast, MVT::v2f64, 1 },
2212  // Transpose shuffle kinds can be performed with 'trn1/trn2' and
2213  // 'zip1/zip2' instructions.
2214  { TTI::SK_Transpose, MVT::v8i8, 1 },
2215  { TTI::SK_Transpose, MVT::v16i8, 1 },
2216  { TTI::SK_Transpose, MVT::v4i16, 1 },
2217  { TTI::SK_Transpose, MVT::v8i16, 1 },
2218  { TTI::SK_Transpose, MVT::v2i32, 1 },
2219  { TTI::SK_Transpose, MVT::v4i32, 1 },
2220  { TTI::SK_Transpose, MVT::v2i64, 1 },
2221  { TTI::SK_Transpose, MVT::v2f32, 1 },
2222  { TTI::SK_Transpose, MVT::v4f32, 1 },
2223  { TTI::SK_Transpose, MVT::v2f64, 1 },
2224  // Select shuffle kinds.
2225  // TODO: handle vXi8/vXi16.
2226  { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
2227  { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
2228  { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
2229  { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
2230  { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
2231  { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
2232  // PermuteSingleSrc shuffle kinds.
2233  { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
2234  { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
2235  { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
2236  { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
2237  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
2238  { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
2239  { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
2240  { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
2241  { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
2242  { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
2243  { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
2244  { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
2245  { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
2246  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
2247  // Reverse can be lowered with `rev`.
2248  { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
2249  { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
2250  { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
2251  { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
2252  { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
2253  { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
2254  // Broadcast shuffle kinds for scalable vectors
2255  { TTI::SK_Broadcast, MVT::nxv16i8, 1 },
2256  { TTI::SK_Broadcast, MVT::nxv8i16, 1 },
2257  { TTI::SK_Broadcast, MVT::nxv4i32, 1 },
2258  { TTI::SK_Broadcast, MVT::nxv2i64, 1 },
2259  { TTI::SK_Broadcast, MVT::nxv2f16, 1 },
2260  { TTI::SK_Broadcast, MVT::nxv4f16, 1 },
2261  { TTI::SK_Broadcast, MVT::nxv8f16, 1 },
2262  { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
2263  { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
2264  { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
2265  { TTI::SK_Broadcast, MVT::nxv2f32, 1 },
2266  { TTI::SK_Broadcast, MVT::nxv4f32, 1 },
2267  { TTI::SK_Broadcast, MVT::nxv2f64, 1 },
2268  { TTI::SK_Broadcast, MVT::nxv16i1, 1 },
2269  { TTI::SK_Broadcast, MVT::nxv8i1, 1 },
2270  { TTI::SK_Broadcast, MVT::nxv4i1, 1 },
2271  { TTI::SK_Broadcast, MVT::nxv2i1, 1 },
2272  // Handle the cases for vector.reverse with scalable vectors
2273  { TTI::SK_Reverse, MVT::nxv16i8, 1 },
2274  { TTI::SK_Reverse, MVT::nxv8i16, 1 },
2275  { TTI::SK_Reverse, MVT::nxv4i32, 1 },
2276  { TTI::SK_Reverse, MVT::nxv2i64, 1 },
2277  { TTI::SK_Reverse, MVT::nxv2f16, 1 },
2278  { TTI::SK_Reverse, MVT::nxv4f16, 1 },
2279  { TTI::SK_Reverse, MVT::nxv8f16, 1 },
2280  { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
2281  { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
2282  { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
2283  { TTI::SK_Reverse, MVT::nxv2f32, 1 },
2284  { TTI::SK_Reverse, MVT::nxv4f32, 1 },
2285  { TTI::SK_Reverse, MVT::nxv2f64, 1 },
2286  { TTI::SK_Reverse, MVT::nxv16i1, 1 },
2287  { TTI::SK_Reverse, MVT::nxv8i1, 1 },
2288  { TTI::SK_Reverse, MVT::nxv4i1, 1 },
2289  { TTI::SK_Reverse, MVT::nxv2i1, 1 },
2290  };
2291  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2292  if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
2293  return LT.first * Entry->Cost;
2294  }
2295  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
2296  return getSpliceCost(Tp, Index);
2297  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
2298 }
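// Worked example for getShuffleCost above (type assumed, for illustration
// only): a reverse shuffle of <4 x float> stays SK_Reverse, legalizes to
// v4f32 with LT.first == 1, and matches the {TTI::SK_Reverse, MVT::v4f32, 2}
// entry, so the returned cost is 2 (REV64 followed by EXT).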
i
i
Definition: README.txt:29
llvm::AArch64TTIImpl::getIntImmCostIntrin
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:159
llvm::MVT::nxv4i1
@ nxv4i1
Definition: MachineValueType.h:190
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::TargetTransformInfo::PSK_FastHardware
@ PSK_FastHardware
Definition: TargetTransformInfo.h:592
llvm::AArch64TTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: AArch64TargetTransformInfo.cpp:1794
llvm::MVT::nxv4i64
@ nxv4i64
Definition: MachineValueType.h:220
llvm::TargetTransformInfo::UnrollingPreferences::PartialOptSizeThreshold
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
Definition: TargetTransformInfo.h:457
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition: TargetTransformInfo.h:862
Attrs
Function Attrs
Definition: README_ALTIVEC.txt:215
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:485
llvm::MVT::nxv2i1
@ nxv2i1
Definition: MachineValueType.h:189
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:211
llvm::Loop::isLoopInvariant
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:64
llvm::MVT::v4f16
@ v4f16
Definition: MachineValueType.h:136
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:453
llvm
This file implements support for optimizing divisions by a constant.
Definition: AllocatorList.h:23
llvm::ConvertCostTableLookup
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType >> Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
llvm::Type::getInt1Ty
static IntegerType * getInt1Ty(LLVMContext &C)
Definition: Type.cpp:238
llvm::CostTblEntryT
Cost Table Entry.
Definition: CostTable.h:25
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
getFalkorUnrollingPreferences
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
Definition: AArch64TargetTransformInfo.cpp:1747
llvm::MVT::nxv2f64
@ nxv2f64
Definition: MachineValueType.h:247
llvm::AArch64TTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: AArch64TargetTransformInfo.cpp:1337
llvm::RecurKind::Or
@ Or
Bitwise or logical OR of integers.
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:633
InstCombiner.h
instCombineSVEPTest
static Optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:673
Insert
Vector Rotate Left Mask Mask Insert
Definition: README_P9.txt:112
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:720
llvm::SCEVAddRecExpr::isAffine
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
Definition: ScalarEvolutionExpressions.h:379
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:469
llvm::AArch64TTIImpl::getIntImmCost
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
Definition: AArch64TargetTransformInfo.cpp:50
IntrinsicInst.h
llvm::ElementCount
Definition: TypeSize.h:386
llvm::AArch64TTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:1642
llvm::Function
Definition: Function.h:62
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
instCombineSVECmpNE
static Optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:455
llvm::BinaryOperator::CreateWithCopiedFlags
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Instruction *CopyO, const Twine &Name="", Instruction *InsertBefore=nullptr)
Definition: InstrTypes.h:250
llvm::ISD::UDIV
@ UDIV
Definition: ISDOpcodes.h:243
llvm::TargetTransformInfo::PopcntSupportKind
PopcntSupportKind
Flags indicating the kind of support for population count.
Definition: TargetTransformInfo.h:592
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
llvm::MVT::nxv2f32
@ nxv2f32
Definition: MachineValueType.h:241
llvm::PatternMatch::m_FPOne
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:848
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:308
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::MVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition: MachineValueType.h:366
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1474
llvm::getSplatValue
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
Definition: VectorUtils.cpp:366
llvm::IRBuilder<>
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:150
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:460
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJamInnerLoopThreshold
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
Definition: TargetTransformInfo.h:504
llvm::MVT::nxv2i64
@ nxv2i64
Definition: MachineValueType.h:219
llvm::TargetTransformInfo::UnrollingPreferences::UnrollRemainder
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
Definition: TargetTransformInfo.h:497
llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:785
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:178
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:214
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
llvm::InsertElementInst::Create
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Definition: Instructions.h:1951
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::ExtractElementInst::Create
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Definition: Instructions.h:1886
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:481
llvm::MVT::nxv4f16
@ nxv4f16
Definition: MachineValueType.h:230
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1410
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:535
llvm::Optional
Definition: APInt.h:33
llvm::RecurKind::SelectFCmp
@ SelectFCmp
Integer select(fcmp(),x,y) where one of (x,y) is loop invariant.
llvm::PatternMatch::m_BinOp
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:84
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::MVT::nxv4f64
@ nxv4f64
Definition: MachineValueType.h:248
llvm::TargetTransformInfo::OP_PowerOf2
@ OP_PowerOf2
Definition: TargetTransformInfo.h:886
instCombineSVEVectorBinOp
static Optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:711
llvm::VectorType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:422
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:116
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:267
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::RecurKind::SMin
@ SMin
Signed integer min implemented in terms of select(cmp()).
llvm::MVT::v2f64
@ v2f64
Definition: MachineValueType.h:172
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition: TargetTransformInfo.h:870
llvm::MVT::nxv8i16
@ nxv8i16
Definition: MachineValueType.h:207
llvm::LinearPolySize::isScalable
bool isScalable() const
Returns whether the size is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:299
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:241
llvm::APIntOps::umin
const APInt & umin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be unsigned.
Definition: APInt.h:2128
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:860
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::AArch64TTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: AArch64TargetTransformInfo.cpp:33
instCombineRDFFR
static Optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:633
llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(Optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1160
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:130
llvm::MVT::nxv8bf16
@ nxv8bf16
Definition: MachineValueType.h:238
llvm::AArch64TTIImpl::getExtractWithExtendCost
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
Definition: AArch64TargetTransformInfo.cpp:1266
llvm::PatternMatch::m_OneUse
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::MVT::v4bf16
@ v4bf16
Definition: MachineValueType.h:147
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:206
TargetLowering.h
llvm::InstCombiner::replaceOperand
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:436
llvm::MVT::i1
@ i1
Definition: MachineValueType.h:43
llvm::MVT::v8f16
@ v8f16
Definition: MachineValueType.h:137
llvm::isSplatValue
bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
Definition: VectorUtils.cpp:381
llvm::MVT::nxv2bf16
@ nxv2bf16
Definition: MachineValueType.h:236
llvm::RecurKind::And
@ And
Bitwise or logical AND of integers.
instCombineConvertFromSVBool
static Optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:374
llvm::TypeConversionCostTblEntryT
Type Conversion Cost Table.
Definition: CostTable.h:55
llvm::PatternMatch::m_Select
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
Definition: PatternMatch.h:1472
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:679
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:729
llvm::TargetTransformInfo::OP_None
@ OP_None
Definition: TargetTransformInfo.h:886
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:859
llvm::AArch64_AM::isLogicalImmediate
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
Definition: AArch64AddressingModes.h:276
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1062
llvm::User
Definition: User.h:44
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::MVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: MachineValueType.h:1062
llvm::AArch64TTIImpl::getCostOfKeepingLiveOverCall
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
Definition: AArch64TargetTransformInfo.cpp:1724
llvm::RecurrenceDescriptor::getRecurrenceType
Type * getRecurrenceType() const
Returns the type of the recurrence.
Definition: IVDescriptors.h:246
instCombineSVEZip
static Optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:852
llvm::AArch64Subtarget::Others
@ Others
Definition: AArch64Subtarget.h:41
llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:735
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:226
llvm::MVT::v16i1
@ v16i1
Definition: MachineValueType.h:68
llvm::MVT::nxv2f16
@ nxv2f16
Definition: MachineValueType.h:229
llvm::VectorType::getElementCount
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:627
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
llvm::Log2_32
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:596
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::MVT::v8i1
@ v8i1
Definition: MachineValueType.h:67
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJam
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
Definition: TargetTransformInfo.h:499
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::AArch64TTIImpl::getIntImmCostInst
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: AArch64TargetTransformInfo.cpp:90
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:191
llvm::RecurrenceDescriptor::getRecurrenceKind
RecurKind getRecurrenceKind() const
Definition: IVDescriptors.h:196
llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:739
llvm::AArch64TTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:989
Options
const char LLVMTargetMachineRef LLVMPassBuilderOptionsRef Options
Definition: PassBuilderBindings.cpp:48
llvm::AArch64TTIImpl::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:1993
llvm::MVT::nxv4i8
@ nxv4i8
Definition: MachineValueType.h:198
llvm::MVT::nxv4f32
@ nxv4f32
Definition: MachineValueType.h:242
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1796
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:925
instCombineSVEVectorMul
static Optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:727
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:632
Align
uint64_t Align
Definition: ELFObjHandler.cpp:83
PatternMatch.h
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:686
llvm::MVT::v1i64
@ v1i64
Definition: MachineValueType.h:117
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::MCID::Call
@ Call
Definition: MCInstrDesc.h:153
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
IVDescriptors.h
llvm::AArch64TTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp)
Definition: AArch64TargetTransformInfo.cpp:2192
llvm::None
const NoneType None
Definition: None.h:23
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:91
llvm::EVT::getTypeForEVT
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:181
llvm::lltok::Kind
Kind
Definition: LLToken.h:18
llvm::MVT::v4i8
@ v4i8
Definition: MachineValueType.h:78
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:118
llvm::MVT::nxv4i16
@ nxv4i16
Definition: MachineValueType.h:206
instCombineSVETBL
static Optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:801
llvm::PatternMatch::m_One
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:513
llvm::RecurKind::UMin
@ UMin
Unisgned integer min implemented in terms of select(cmp()).
llvm::TargetTransformInfo::PSK_Software
@ PSK_Software
Definition: TargetTransformInfo.h:592
llvm::maxnum
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1307
LoopInfo.h
instCombineSVEUnpack
static Optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:779
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:78
llvm::APInt::ashr
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:791
llvm::AArch64TTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:220
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool LookThroughBitCast, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:118
llvm::AArch64TTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:2044
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4088
AArch64AddressingModes.h
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:190
llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:377
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:861
llvm::MVT::v2i8
@ v2i8
Definition: MachineValueType.h:77
llvm::AArch64TTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: AArch64TargetTransformInfo.cpp:1697
llvm::MVT::v4i64
@ v4i64
Definition: MachineValueType.h:120
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::MVT::nxv16i8
@ nxv16i8
Definition: MachineValueType.h:200
llvm::cl::opt< bool >
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:77
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::MVT::nxv8i64
@ nxv8i64
Definition: MachineValueType.h:221
llvm::MVT::v16i8
@ v16i8
Definition: MachineValueType.h:80
llvm::CostTableLookup
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType >> Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
llvm::AArch64TTIImpl::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:1593
llvm::AArch64TTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: AArch64TargetTransformInfo.cpp:868
llvm::MVT::v16i16
@ v16i16
Definition: MachineValueType.h:93
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
llvm::MVT::v2i64
@ v2i64
Definition: MachineValueType.h:118
intrinsicIDToBinOpCode
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
Definition: AArch64TargetTransformInfo.cpp:698
llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition: ISDOpcodes.h:786
llvm::AArch64::SVEBitsPerBlock
static constexpr unsigned SVEBitsPerBlock
Definition: AArch64BaseInfo.h:757
AArch64ExpandImm.h
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::MVT::v16f32
@ v16f32
Definition: MachineValueType.h:162
llvm::TargetTransformInfo::OK_UniformConstantValue
@ OK_UniformConstantValue
Definition: TargetTransformInfo.h:881
llvm::AArch64TTIImpl::getArithmeticReductionCostSVE
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:2018
llvm::LegalityPredicates::all
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
Definition: LegalizerInfo.h:226
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:2783
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:57
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:428
instCombineSVETupleGet
static Optional< Instruction * > instCombineSVETupleGet(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:826
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:886
llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:928
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:441
llvm::AArch64TTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: AArch64TargetTransformInfo.cpp:1362
llvm::RecurKind::Add
@ Add
Sum of integers.
llvm::MVT::v4f32
@ v4f32
Definition: MachineValueType.h:157
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::AArch64TTIImpl::isLegalToVectorizeReduction
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
Definition: AArch64TargetTransformInfo.cpp:1963
llvm::MVT::getVectorNumElements
unsigned getVectorNumElements() const
Definition: MachineValueType.h:850
llvm::MVT::i8
@ i8
Definition: MachineValueType.h:44
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:79
llvm::MVT::Other
@ Other
Definition: MachineValueType.h:42
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:840
llvm::LoopBase::getLoopDepth
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:96
llvm::MVT::nxv4i32
@ nxv4i32
Definition: MachineValueType.h:213
llvm::CmpInst::BAD_ICMP_PREDICATE
@ BAD_ICMP_PREDICATE
Definition: InstrTypes.h:753
llvm::MVT
Machine Value Type.
Definition: MachineValueType.h:31
llvm::TargetTransformInfo::SK_Splice
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
Definition: TargetTransformInfo.h:872
llvm::MVT::nxv4bf16
@ nxv4bf16
Definition: MachineValueType.h:237
llvm::LinearPolySize::getKnownMinValue
ScalarTy getKnownMinValue() const
Returns the minimum value this size can represent.
Definition: TypeSize.h:297
llvm::AArch64TTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: AArch64TargetTransformInfo.cpp:1738
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:878
llvm::AArch64TTIImpl::getSpliceCost
InstructionCost getSpliceCost(VectorType *Tp, int Index)
Definition: AArch64TargetTransformInfo.cpp:2143
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:650
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
instCombineSVEDupX
static Optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:443
llvm::CmpInst::isIntPredicate
bool isIntPredicate() const
Definition: InstrTypes.h:814
llvm::TargetTransformInfo::MemCmpExpansionOptions
Returns options for expansion of memcmp. IsZeroCmp is.
Definition: TargetTransformInfo.h:761
llvm::MVT::nxv2i32
@ nxv2i32
Definition: MachineValueType.h:212
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:262
llvm::APIntOps::smin
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2118
llvm::RecurKind::UMax
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::Instruction::getFastMathFlags
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
Definition: Instruction.cpp:280
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1558
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:47
llvm::StructType
Class to represent struct types.
Definition: DerivedTypes.h:213
llvm::MVT::v2i32
@ v2i32
Definition: MachineValueType.h:101
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:215
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:134
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::MVT::v2f32
@ v2f32
Definition: MachineValueType.h:155
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:990
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:464
llvm::RecurKind::FMax
@ FMax
FP max implemented in terms of select(cmp()).
llvm::MVT::v4i32
@ v4i32
Definition: MachineValueType.h:103
llvm::AArch64TTIImpl::getPopcntSupport
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
Definition: AArch64TargetTransformInfo.cpp:211
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:286
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:127
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:175
llvm::ISD::FMUL
@ FMUL
Definition: ISDOpcodes.h:379
llvm::TargetTransformInfo::SK_Transpose
@ SK_Transpose
Transpose two vectors.
Definition: TargetTransformInfo.h:865
instCombineSVEDup
static Optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:419
llvm::MVT::v8i64
@ v8i64
Definition: MachineValueType.h:121
llvm::TargetTransformInfo::CastContextHint::None
@ None
The cast is not used with a load/store of any kind.
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:634
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:415
AArch64TargetTransformInfo.h
llvm::MVT::v16i32
@ v16i32
Definition: MachineValueType.h:108
llvm::AArch64TTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:1514
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:162
llvm::MVT::nxv16i1
@ nxv16i1
Definition: MachineValueType.h:192
llvm::APIntOps::umax
const APInt & umax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be unsigned.
Definition: APInt.h:2133
llvm::MVT::v8bf16
@ v8bf16
Definition: MachineValueType.h:148
llvm::MVT::nxv2i16
@ nxv2i16
Definition: MachineValueType.h:205
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:242
Insn
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
Definition: AArch64MIPeepholeOpt.cpp:74
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:348
EnableFalkorHWPFUnrollFix
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
CostTable.h
llvm::TargetTransformInfo::UnrollingPreferences::UpperBound
bool UpperBound
Allow using trip count upper bound to unroll loops.
Definition: TargetTransformInfo.h:495
llvm::AArch64_IMM::expandMOVImm
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
Definition: AArch64ExpandImm.cpp:304
instCombineSVECntElts
static Optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
Definition: AArch64TargetTransformInfo.cpp:651
llvm::CallBase::arg_size
unsigned arg_size() const
Definition: InstrTypes.h:1326
llvm::APInt::sextOrTrunc
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:978
llvm::MVT::v8i16
@ v8i16
Definition: MachineValueType.h:92
llvm::SCEVAddRecExpr
This node represents a polynomial recurrence on the trip count of the specified loop.
Definition: ScalarEvolutionExpressions.h:352
llvm::AArch64TTIImpl::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
Definition: AArch64TargetTransformInfo.cpp:1891
llvm::LinearPolySize< ElementCount >::getScalable
static ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:287
llvm::MVT::i32
@ i32
Definition: MachineValueType.h:46
llvm::ISD::SDIV
@ SDIV
Definition: ISDOpcodes.h:242
llvm::AArch64TTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:1327
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::MVT::nxv8i8
@ nxv8i8
Definition: MachineValueType.h:199
llvm::MVT::v8i32
@ v8i32
Definition: MachineValueType.h:107
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::AMDGPU::Hwreg::Width
Width
Definition: SIDefines.h:413
llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition: ISDOpcodes.h:740
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::MVT::nxv8i32
@ nxv8i32
Definition: MachineValueType.h:214
llvm::APInt::sext
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:928
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::MVT::nxv8f16
@ nxv8f16
Definition: MachineValueType.h:231
llvm::InstructionCost::getInvalid
static InstructionCost getInvalid(CostType Val=0)
Definition: InstructionCost.h:73
llvm::Instruction::BinaryOps
BinaryOps
Definition: Instruction.h:785
llvm::RecurrenceDescriptor
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:71
llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:833
llvm::RecurKind::SelectICmp
@ SelectICmp
Integer select(icmp(),x,y) where one of (x,y) is loop invariant.
llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
llvm::getNumElementsFromSVEPredPattern
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
Definition: AArch64BaseInfo.h:459
llvm::ISD::FSUB
@ FSUB
Definition: ISDOpcodes.h:378
llvm::Pattern
Definition: FileCheckImpl.h:614
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:148
llvm::MVT::nxv8i1
@ nxv8i1
Definition: MachineValueType.h:191
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:802
llvm::Type::isBFloatTy
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:144
processPhiNode
static Optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
Definition: AArch64TargetTransformInfo.cpp:336
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:54
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1328
SetValue
static void SetValue(Value *V, GenericValue Val, ExecutionContext &SF)
Definition: Execution.cpp:41
TargetTransformInfo.h
llvm::AArch64TTIImpl::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:1613
llvm::PHINode
Definition: Instructions.h:2633
llvm::PatternMatch
Definition: PatternMatch.h:47
llvm::ISD::MULHU
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:614
llvm::MemIntrinsicInfo
Information about a load/store intrinsic defined by the target.
Definition: TargetTransformInfo.h:70
instCombineSVELast
static Optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:557
llvm::AArch64TTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AArch64TargetTransformInfo.cpp:1850
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:45
llvm::ScalableVectorType::get
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:707
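A hedged sketch of constructing an SVE-style scalable vector type (the wrapper getNxv4i32 is invented for illustration):

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"
  using namespace llvm;

  // Illustrative only: build the scalable vector type <vscale x 4 x i32>.
  static ScalableVectorType *getNxv4i32(LLVMContext &Ctx) {
    return ScalableVectorType::get(Type::getInt32Ty(Ctx), /*MinNumElts=*/4);
  }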
llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:866
llvm::RecurKind::FMin
@ FMin
FP min implemented in terms of select(cmp()).
llvm::AArch64TTIImpl::shouldConsiderAddressTypePromotion
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
Definition: AArch64TargetTransformInfo.cpp:1936
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:263
llvm::Value::takeName
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:382
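A hedged sketch of the common replace-and-rename idiom takeName supports (the helper replaceAndKeepName is invented for illustration):

  #include "llvm/IR/Value.h"
  using namespace llvm;

  // Illustrative only: when substituting one value for another, carry the old
  // name over so the IR stays readable.
  static void replaceAndKeepName(Value *New, Value *Old) {
    New->takeName(Old);            // Old loses its name; New adopts it
    Old->replaceAllUsesWith(New);
  }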
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
BasicTTIImpl.h
llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:726
llvm::MVT::v8i8
@ v8i8
Definition: MachineValueType.h:79
llvm::MVT::nxv2i8
@ nxv2i8
Definition: MachineValueType.h:197
llvm::MVT::v8f32
@ v8f32
Definition: MachineValueType.h:161
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:89
llvm::MVT::v16i64
@ v16i64
Definition: MachineValueType.h:122
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1282
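A minimal, hedged sketch of this free function (the wrapper absExample is invented for illustration):

  #include "llvm/ADT/APFloat.h"
  using namespace llvm;

  // Illustrative only: abs() clears the sign, so -2.5 becomes 2.5.
  static APFloat absExample() {
    return abs(APFloat(-2.5));
  }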
llvm::PatternMatch::m_Cmp
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignores it.
Definition: PatternMatch.h:89
llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:814
llvm::RecurKind::SMax
@ SMax
Signed integer max implemented in terms of select(cmp()).
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::AArch64Subtarget::Falkor
@ Falkor
Definition: AArch64Subtarget.h:66
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:212
llvm::Type::isFP128Ty
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:156
Debug.h
llvm::VectorType::get
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:670
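A hedged sketch of the ElementCount overload, which covers both fixed and scalable vectors (the wrapper vectorTypeExamples is invented for illustration):

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  static void vectorTypeExamples(LLVMContext &Ctx) {
    Type *I16 = Type::getInt16Ty(Ctx);
    VectorType *V8i16   = VectorType::get(I16, ElementCount::getFixed(8));    // <8 x i16>
    VectorType *NXV8i16 = VectorType::get(I16, ElementCount::getScalable(8)); // <vscale x 8 x i16>
    (void)V8i16; (void)NXV8i16;
  }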
llvm::AArch64TTIImpl::useNeonVector
bool useNeonVector(const Type *Ty) const
Definition: AArch64TargetTransformInfo.cpp:1638
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2123
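A hedged sketch of the signed semantics (the wrapper signedMaxExample is invented for illustration):

  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  // Illustrative only: under a signed comparison 0xFF is -1, so smax picks 5.
  static APInt signedMaxExample() {
    APInt A(8, 0xFF);              // -1 when interpreted as signed
    APInt B(8, 0x05);              // +5
    return APIntOps::smax(A, B);   // yields 5
  }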
llvm::MVT::nxv8f64
@ nxv8f64
Definition: MachineValueType.h:249
llvm::ISD::CTPOP
@ CTPOP
Definition: ISDOpcodes.h:669
llvm::RecurKind::Xor
@ Xor
Bitwise or logical XOR of integers.
llvm::AArch64TTIImpl::getOrCreateResultFromMemIntrinsic
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
Definition: AArch64TargetTransformInfo.cpp:1855
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:166
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:289
llvm::MVT::v4i1
@ v4i1
Definition: MachineValueType.h:66
llvm::AArch64TTIImpl::enableMemCmpExpansion
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
Definition: AArch64TargetTransformInfo.cpp:1575
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37
llvm::AArch64TTIImpl::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
Definition: AArch64TargetTransformInfo.cpp:1495
llvm::MVT::nxv8f32
@ nxv8f32
Definition: MachineValueType.h:243
llvm::SmallVectorImpl::insert
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:773
llvm::ISD::FDIV
@ FDIV
Definition: ISDOpcodes.h:380