AArch64TargetTransformInfo.cpp
1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "AArch64TargetTransformInfo.h"
10 #include "AArch64ExpandImm.h"
11 #include "AArch64PerfectShuffle.h"
14 #include "llvm/Analysis/LoopInfo.h"
17 #include "llvm/CodeGen/CostTable.h"
19 #include "llvm/IR/IntrinsicInst.h"
20 #include "llvm/IR/Intrinsics.h"
21 #include "llvm/IR/IntrinsicsAArch64.h"
22 #include "llvm/IR/PatternMatch.h"
23 #include "llvm/Support/Debug.h"
26 #include <algorithm>
27 #include <optional>
28 using namespace llvm;
29 using namespace llvm::PatternMatch;
30 
31 #define DEBUG_TYPE "aarch64tti"
32 
33 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
34  cl::init(true), cl::Hidden);
35 
36 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
37  cl::Hidden);
38 
39 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
40  cl::init(10), cl::Hidden);
41 
42 class TailFoldingKind {
43 private:
44  uint8_t Bits = 0; // Currently defaults to disabled.
45 
46 public:
47  enum TailFoldingOpts {
48  TFDisabled = 0x0,
49  TFReductions = 0x01,
50  TFRecurrences = 0x02,
51  TFSimple = 0x80,
52  TFAll = TFReductions | TFRecurrences | TFSimple
53  };
54 
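  // Parse the '+'-separated value of the -sve-tail-folding option (described
  // in the cl::opt below) and fold the requested modes into Bits.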
55  void operator=(const std::string &Val) {
56  if (Val.empty())
57  return;
58  SmallVector<StringRef, 6> TailFoldTypes;
59  StringRef(Val).split(TailFoldTypes, '+', -1, false);
60  for (auto TailFoldType : TailFoldTypes) {
61  if (TailFoldType == "disabled")
62  Bits = 0;
63  else if (TailFoldType == "all")
64  Bits = TFAll;
65  else if (TailFoldType == "default")
66  Bits = 0; // Currently defaults to never tail-folding.
67  else if (TailFoldType == "simple")
68  add(TFSimple);
69  else if (TailFoldType == "reductions")
70  add(TFReductions);
71  else if (TailFoldType == "recurrences")
72  add(TFRecurrences);
73  else if (TailFoldType == "noreductions")
74  remove(TFReductions);
75  else if (TailFoldType == "norecurrences")
76  remove(TFRecurrences);
77  else {
78  errs()
79  << "invalid argument " << TailFoldType.str()
80  << " to -sve-tail-folding=; each element must be one of: disabled, "
81  "all, default, simple, reductions, noreductions, recurrences, "
82  "norecurrences\n";
83  }
84  }
85  }
86 
87  operator uint8_t() const { return Bits; }
88 
89  void add(uint8_t Flag) { Bits |= Flag; }
90  void remove(uint8_t Flag) { Bits &= ~Flag; }
91 };
92 
93 TailFoldingKind TailFoldingKindLoc;
94 
95 cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
96  "sve-tail-folding",
97  cl::desc(
98  "Control the use of vectorisation using tail-folding for SVE:"
99  "\ndisabled No loop types will vectorize using tail-folding"
100  "\ndefault Uses the default tail-folding settings for the target "
101  "CPU"
102  "\nall All legal loop types will vectorize using tail-folding"
103  "\nsimple Use tail-folding for simple loops (not reductions or "
104  "recurrences)"
105  "\nreductions Use tail-folding for loops containing reductions"
106  "\nrecurrences Use tail-folding for loops containing fixed order "
107  "recurrences"),
108  cl::location(TailFoldingKindLoc));
109 
110 // Experimental option that will only be fully functional when the
111 // code-generator is changed to use SVE instead of NEON for all fixed-width
112 // operations.
113 static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
114  "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
115 
116 // Experimental option that will only be fully functional when the cost-model
117 // and code-generator have been changed to avoid using scalable vector
118 // instructions that are not legal in streaming SVE mode.
119 static cl::opt<bool> EnableScalableAutovecInStreamingMode(
120  "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
121 
122 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
123  const Function *Callee) const {
124  SMEAttrs CallerAttrs(*Caller);
125  SMEAttrs CalleeAttrs(*Callee);
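  // A call cannot be inlined if it would require a streaming-mode change or a
  // lazy ZA save, or if the callee sets up new ZA state; those transitions
  // must happen at the call boundary.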
126  if (CallerAttrs.requiresSMChange(CalleeAttrs,
127  /*BodyOverridesInterface=*/true) ||
128  CallerAttrs.requiresLazySave(CalleeAttrs) ||
129  CalleeAttrs.hasNewZAInterface())
130  return false;
131 
132  const TargetMachine &TM = getTLI()->getTargetMachine();
133 
134  const FeatureBitset &CallerBits =
135  TM.getSubtargetImpl(*Caller)->getFeatureBits();
136  const FeatureBitset &CalleeBits =
137  TM.getSubtargetImpl(*Callee)->getFeatureBits();
138 
139  // Inline a callee if its target-features are a subset of the caller's
140  // target-features.
141  return (CallerBits & CalleeBits) == CalleeBits;
142 }
143 
148 }
149 
150 /// Calculate the cost of materializing a 64-bit value. This helper
151 /// method might only calculate a fraction of a larger immediate. Therefore it
152 /// is valid to return a cost of ZERO.
153 static InstructionCost getIntImmCost(int64_t Val) {
154  // Check if the immediate can be encoded within an instruction.
155  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
156  return 0;
157 
158  if (Val < 0)
159  Val = ~Val;
160 
161  // Calculate how many moves we will need to materialize this constant.
162  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
163  AArch64_IMM::expandMOVImm(Val, 64, Insn);
164  return Insn.size();
165 }
166 
167 /// Calculate the cost of materializing the given constant.
168 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
169  TTI::TargetCostKind CostKind) {
170  assert(Ty->isIntegerTy());
171 
172  unsigned BitSize = Ty->getPrimitiveSizeInBits();
173  if (BitSize == 0)
174  return ~0U;
175 
176  // Sign-extend all constants to a multiple of 64 bits.
177  APInt ImmVal = Imm;
178  if (BitSize & 0x3f)
179  ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
180 
181  // Split the constant into 64-bit chunks and calculate the cost for each
182  // chunk.
183  InstructionCost Cost = 0;
184  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
185  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
186  int64_t Val = Tmp.getSExtValue();
187  Cost += getIntImmCost(Val);
188  }
189  // We need at least one instruction to materialize the constant.
190  return std::max<InstructionCost>(1, Cost);
191 }
192 
193 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
194  const APInt &Imm, Type *Ty,
195  TTI::TargetCostKind CostKind,
196  Instruction *Inst) {
197  assert(Ty->isIntegerTy());
198 
199  unsigned BitSize = Ty->getPrimitiveSizeInBits();
200  // There is no cost model for constants with a bit size of 0. Return TCC_Free
201  // here, so that constant hoisting will ignore this constant.
202  if (BitSize == 0)
203  return TTI::TCC_Free;
204 
205  unsigned ImmIdx = ~0U;
206  switch (Opcode) {
207  default:
208  return TTI::TCC_Free;
209  case Instruction::GetElementPtr:
210  // Always hoist the base address of a GetElementPtr.
211  if (Idx == 0)
212  return 2 * TTI::TCC_Basic;
213  return TTI::TCC_Free;
214  case Instruction::Store:
215  ImmIdx = 0;
216  break;
217  case Instruction::Add:
218  case Instruction::Sub:
219  case Instruction::Mul:
220  case Instruction::UDiv:
221  case Instruction::SDiv:
222  case Instruction::URem:
223  case Instruction::SRem:
224  case Instruction::And:
225  case Instruction::Or:
226  case Instruction::Xor:
227  case Instruction::ICmp:
228  ImmIdx = 1;
229  break;
230  // Always return TCC_Free for the shift value of a shift instruction.
231  case Instruction::Shl:
232  case Instruction::LShr:
233  case Instruction::AShr:
234  if (Idx == 1)
235  return TTI::TCC_Free;
236  break;
237  case Instruction::Trunc:
238  case Instruction::ZExt:
239  case Instruction::SExt:
240  case Instruction::IntToPtr:
241  case Instruction::PtrToInt:
242  case Instruction::BitCast:
243  case Instruction::PHI:
244  case Instruction::Call:
245  case Instruction::Select:
246  case Instruction::Ret:
247  case Instruction::Load:
248  break;
249  }
250 
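  // The immediate sits in an operand slot that can encode it. Treat it as
  // free when materializing it costs no more than one basic instruction per
  // 64-bit chunk, so constant hoisting does not pull it out.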
251  if (Idx == ImmIdx) {
252  int NumConstants = (BitSize + 63) / 64;
253  InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
254  return (Cost <= NumConstants * TTI::TCC_Basic)
255  ? static_cast<int>(TTI::TCC_Free)
256  : Cost;
257  }
258  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
259 }
260 
261 InstructionCost
262 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
263  const APInt &Imm, Type *Ty,
264  TTI::TargetCostKind CostKind) {
265  assert(Ty->isIntegerTy());
266 
267  unsigned BitSize = Ty->getPrimitiveSizeInBits();
268  // There is no cost model for constants with a bit size of 0. Return TCC_Free
269  // here, so that constant hoisting will ignore this constant.
270  if (BitSize == 0)
271  return TTI::TCC_Free;
272 
273  // Most (all?) AArch64 intrinsics do not support folding immediates into the
274  // selected instruction, so we compute the materialization cost for the
275  // immediate directly.
276  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
277  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
278 
279  switch (IID) {
280  default:
281  return TTI::TCC_Free;
282  case Intrinsic::sadd_with_overflow:
283  case Intrinsic::uadd_with_overflow:
284  case Intrinsic::ssub_with_overflow:
285  case Intrinsic::usub_with_overflow:
286  case Intrinsic::smul_with_overflow:
287  case Intrinsic::umul_with_overflow:
288  if (Idx == 1) {
289  int NumConstants = (BitSize + 63) / 64;
290  InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
291  return (Cost <= NumConstants * TTI::TCC_Basic)
292  ? static_cast<int>(TTI::TCC_Free)
293  : Cost;
294  }
295  break;
296  case Intrinsic::experimental_stackmap:
297  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
298  return TTI::TCC_Free;
299  break;
300  case Intrinsic::experimental_patchpoint_void:
301  case Intrinsic::experimental_patchpoint_i64:
302  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
303  return TTI::TCC_Free;
304  break;
305  case Intrinsic::experimental_gc_statepoint:
306  if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
307  return TTI::TCC_Free;
308  break;
309  }
310  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
311 }
312 
313 TargetTransformInfo::PopcntSupportKind
314 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
315  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
316  if (TyWidth == 32 || TyWidth == 64)
317  return TTI::PSK_FastHardware;
318  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
319  return TTI::PSK_Software;
320 }
321 
322 InstructionCost
323 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
324  TTI::TargetCostKind CostKind) {
325  auto *RetTy = ICA.getReturnType();
326  switch (ICA.getID()) {
327  case Intrinsic::umin:
328  case Intrinsic::umax:
329  case Intrinsic::smin:
330  case Intrinsic::smax: {
331  static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
332  MVT::v8i16, MVT::v2i32, MVT::v4i32};
333  auto LT = getTypeLegalizationCost(RetTy);
334  // v2i64 types get converted to cmp+bif hence the cost of 2
335  if (LT.second == MVT::v2i64)
336  return LT.first * 2;
337  if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
338  return LT.first;
339  break;
340  }
341  case Intrinsic::sadd_sat:
342  case Intrinsic::ssub_sat:
343  case Intrinsic::uadd_sat:
344  case Intrinsic::usub_sat: {
345  static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
346  MVT::v8i16, MVT::v2i32, MVT::v4i32,
347  MVT::v2i64};
348  auto LT = getTypeLegalizationCost(RetTy);
349  // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
350  // need to extend the type, as it uses shr(qadd(shl, shl)).
351  unsigned Instrs =
352  LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
353  if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
354  return LT.first * Instrs;
355  break;
356  }
357  case Intrinsic::abs: {
358  static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
359  MVT::v8i16, MVT::v2i32, MVT::v4i32,
360  MVT::v2i64};
361  auto LT = getTypeLegalizationCost(RetTy);
362  if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
363  return LT.first;
364  break;
365  }
366  case Intrinsic::experimental_stepvector: {
367  InstructionCost Cost = 1; // Cost of the `index' instruction
368  auto LT = getTypeLegalizationCost(RetTy);
369  // Legalisation of illegal vectors involves an `index' instruction plus
370  // (LT.first - 1) vector adds.
371  if (LT.first > 1) {
372  Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
373  InstructionCost AddCost =
374  getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
375  Cost += AddCost * (LT.first - 1);
376  }
377  return Cost;
378  }
379  case Intrinsic::bitreverse: {
380  static const CostTblEntry BitreverseTbl[] = {
381  {Intrinsic::bitreverse, MVT::i32, 1},
382  {Intrinsic::bitreverse, MVT::i64, 1},
383  {Intrinsic::bitreverse, MVT::v8i8, 1},
384  {Intrinsic::bitreverse, MVT::v16i8, 1},
385  {Intrinsic::bitreverse, MVT::v4i16, 2},
386  {Intrinsic::bitreverse, MVT::v8i16, 2},
387  {Intrinsic::bitreverse, MVT::v2i32, 2},
388  {Intrinsic::bitreverse, MVT::v4i32, 2},
389  {Intrinsic::bitreverse, MVT::v1i64, 2},
390  {Intrinsic::bitreverse, MVT::v2i64, 2},
391  };
392  const auto LegalisationCost = getTypeLegalizationCost(RetTy);
393  const auto *Entry =
394  CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
395  if (Entry) {
396  // The cost model uses the legal type (i32) that i8 and i16 are promoted
397  // to, plus 1 so that we match the actual lowering cost.
398  if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
399  TLI->getValueType(DL, RetTy, true) == MVT::i16)
400  return LegalisationCost.first * Entry->Cost + 1;
401 
402  return LegalisationCost.first * Entry->Cost;
403  }
404  break;
405  }
406  case Intrinsic::ctpop: {
407  if (!ST->hasNEON()) {
408  // 32-bit or 64-bit ctpop without NEON is 12 instructions.
409  return getTypeLegalizationCost(RetTy).first * 12;
410  }
411  static const CostTblEntry CtpopCostTbl[] = {
412  {ISD::CTPOP, MVT::v2i64, 4},
413  {ISD::CTPOP, MVT::v4i32, 3},
414  {ISD::CTPOP, MVT::v8i16, 2},
415  {ISD::CTPOP, MVT::v16i8, 1},
416  {ISD::CTPOP, MVT::i64, 4},
417  {ISD::CTPOP, MVT::v2i32, 3},
418  {ISD::CTPOP, MVT::v4i16, 2},
419  {ISD::CTPOP, MVT::v8i8, 1},
420  {ISD::CTPOP, MVT::i32, 5},
421  };
422  auto LT = getTypeLegalizationCost(RetTy);
423  MVT MTy = LT.second;
424  if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
425  // Extra cost of +1 when illegal vector types are legalized by promoting
426  // the integer type.
427  int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
428  RetTy->getScalarSizeInBits()
429  ? 1
430  : 0;
431  return LT.first * Entry->Cost + ExtraCost;
432  }
433  break;
434  }
435  case Intrinsic::sadd_with_overflow:
436  case Intrinsic::uadd_with_overflow:
437  case Intrinsic::ssub_with_overflow:
438  case Intrinsic::usub_with_overflow:
439  case Intrinsic::smul_with_overflow:
440  case Intrinsic::umul_with_overflow: {
441  static const CostTblEntry WithOverflowCostTbl[] = {
442  {Intrinsic::sadd_with_overflow, MVT::i8, 3},
443  {Intrinsic::uadd_with_overflow, MVT::i8, 3},
444  {Intrinsic::sadd_with_overflow, MVT::i16, 3},
445  {Intrinsic::uadd_with_overflow, MVT::i16, 3},
446  {Intrinsic::sadd_with_overflow, MVT::i32, 1},
447  {Intrinsic::uadd_with_overflow, MVT::i32, 1},
448  {Intrinsic::sadd_with_overflow, MVT::i64, 1},
449  {Intrinsic::uadd_with_overflow, MVT::i64, 1},
450  {Intrinsic::ssub_with_overflow, MVT::i8, 3},
451  {Intrinsic::usub_with_overflow, MVT::i8, 3},
452  {Intrinsic::ssub_with_overflow, MVT::i16, 3},
453  {Intrinsic::usub_with_overflow, MVT::i16, 3},
454  {Intrinsic::ssub_with_overflow, MVT::i32, 1},
455  {Intrinsic::usub_with_overflow, MVT::i32, 1},
456  {Intrinsic::ssub_with_overflow, MVT::i64, 1},
457  {Intrinsic::usub_with_overflow, MVT::i64, 1},
458  {Intrinsic::smul_with_overflow, MVT::i8, 5},
459  {Intrinsic::umul_with_overflow, MVT::i8, 4},
460  {Intrinsic::smul_with_overflow, MVT::i16, 5},
461  {Intrinsic::umul_with_overflow, MVT::i16, 4},
462  {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
463  {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
464  {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
465  {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
466  };
467  EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
468  if (MTy.isSimple())
469  if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
470  MTy.getSimpleVT()))
471  return Entry->Cost;
472  break;
473  }
474  case Intrinsic::fptosi_sat:
475  case Intrinsic::fptoui_sat: {
476  if (ICA.getArgTypes().empty())
477  break;
478  bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
479  auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
480  EVT MTy = TLI->getValueType(DL, RetTy);
481  // Check for the legal types, which are where the size of the input and the
482  // output are the same, or we are using cvt f64->i32 or f32->i64.
483  if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
484  LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
485  LT.second == MVT::v2f64) &&
486  (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
487  (LT.second == MVT::f64 && MTy == MVT::i32) ||
488  (LT.second == MVT::f32 && MTy == MVT::i64)))
489  return LT.first;
490  // Similarly for fp16 sizes
491  if (ST->hasFullFP16() &&
492  ((LT.second == MVT::f16 && MTy == MVT::i32) ||
493  ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
494  (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
495  return LT.first;
496 
497  // Otherwise we use a legal convert followed by a min+max
498  if ((LT.second.getScalarType() == MVT::f32 ||
499  LT.second.getScalarType() == MVT::f64 ||
500  (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
501  LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
502  Type *LegalTy =
503  Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
504  if (LT.second.isVector())
505  LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
506  InstructionCost Cost = 1;
507  IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
508  LegalTy, {LegalTy, LegalTy});
509  Cost += getIntrinsicInstrCost(Attrs1, CostKind);
510  IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
511  LegalTy, {LegalTy, LegalTy});
512  Cost += getIntrinsicInstrCost(Attrs2, CostKind);
513  return LT.first * Cost;
514  }
515  break;
516  }
517  default:
518  break;
519  }
520  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
521 }
522 
523 /// Remove redundant reinterpret (convert to/from svbool) casts of SVE
524 /// predicates across a PHI node, i.e. in the presence of control flow.
525 static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
526  IntrinsicInst &II) {
527  SmallVector<Instruction *, 32> Worklist;
528  auto RequiredType = II.getType();
529 
530  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
531  assert(PN && "Expected Phi Node!");
532 
533  // Don't create a new Phi unless we can remove the old one.
534  if (!PN->hasOneUse())
535  return std::nullopt;
536 
537  for (Value *IncValPhi : PN->incoming_values()) {
538  auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
539  if (!Reinterpret ||
540  Reinterpret->getIntrinsicID() !=
541  Intrinsic::aarch64_sve_convert_to_svbool ||
542  RequiredType != Reinterpret->getArgOperand(0)->getType())
543  return std::nullopt;
544  }
545 
546  // Create the new Phi
547  LLVMContext &Ctx = PN->getContext();
548  IRBuilder<> Builder(Ctx);
549  Builder.SetInsertPoint(PN);
550  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
551  Worklist.push_back(PN);
552 
553  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
554  auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
555  NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
556  Worklist.push_back(Reinterpret);
557  }
558 
559  // Cleanup Phi Node and reinterprets
560  return IC.replaceInstUsesWith(II, NPN);
561 }
562 
563 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
564 // => (binop (pred) (from_svbool _) (from_svbool _))
565 //
566 // The above transformation eliminates a `to_svbool` in the predicate
567 // operand of bitwise operation `binop` by narrowing the vector width of
568 // the operation. For example, it would convert a `<vscale x 16 x i1>
569 // and` into a `<vscale x 4 x i1> and`. This is profitable because
570 // to_svbool must zero the new lanes during widening, whereas
571 // from_svbool is free.
572 static std::optional<Instruction *>
573 tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
574  auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
575  if (!BinOp)
576  return std::nullopt;
577 
578  auto IntrinsicID = BinOp->getIntrinsicID();
579  switch (IntrinsicID) {
580  case Intrinsic::aarch64_sve_and_z:
581  case Intrinsic::aarch64_sve_bic_z:
582  case Intrinsic::aarch64_sve_eor_z:
583  case Intrinsic::aarch64_sve_nand_z:
584  case Intrinsic::aarch64_sve_nor_z:
585  case Intrinsic::aarch64_sve_orn_z:
586  case Intrinsic::aarch64_sve_orr_z:
587  break;
588  default:
589  return std::nullopt;
590  }
591 
592  auto BinOpPred = BinOp->getOperand(0);
593  auto BinOpOp1 = BinOp->getOperand(1);
594  auto BinOpOp2 = BinOp->getOperand(2);
595 
596  auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
597  if (!PredIntr ||
598  PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
599  return std::nullopt;
600 
601  auto PredOp = PredIntr->getOperand(0);
602  auto PredOpTy = cast<VectorType>(PredOp->getType());
603  if (PredOpTy != II.getType())
604  return std::nullopt;
605 
606  IRBuilder<> Builder(II.getContext());
607  Builder.SetInsertPoint(&II);
608 
609  SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
610  auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
611  Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
612  NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
613  if (BinOpOp1 == BinOpOp2)
614  NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
615  else
616  NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
617  Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
618 
619  auto NarrowedBinOp =
620  Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
621  return IC.replaceInstUsesWith(II, NarrowedBinOp);
622 }
623 
624 static std::optional<Instruction *>
625 instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
626  // If the reinterpret instruction operand is a PHI Node
627  if (isa<PHINode>(II.getArgOperand(0)))
628  return processPhiNode(IC, II);
629 
630  if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
631  return BinOpCombine;
632 
633  SmallVector<Instruction *, 32> CandidatesForRemoval;
634  Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
635 
636  const auto *IVTy = cast<VectorType>(II.getType());
637 
638  // Walk the chain of conversions.
639  while (Cursor) {
640  // If the type of the cursor has fewer lanes than the final result, zeroing
641  // must take place, which breaks the equivalence chain.
642  const auto *CursorVTy = cast<VectorType>(Cursor->getType());
643  if (CursorVTy->getElementCount().getKnownMinValue() <
644  IVTy->getElementCount().getKnownMinValue())
645  break;
646 
647  // If the cursor has the same type as I, it is a viable replacement.
648  if (Cursor->getType() == IVTy)
649  EarliestReplacement = Cursor;
650 
651  auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
652 
653  // If this is not an SVE conversion intrinsic, this is the end of the chain.
654  if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
655  Intrinsic::aarch64_sve_convert_to_svbool ||
656  IntrinsicCursor->getIntrinsicID() ==
657  Intrinsic::aarch64_sve_convert_from_svbool))
658  break;
659 
660  CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
661  Cursor = IntrinsicCursor->getOperand(0);
662  }
663 
664  // If no viable replacement in the conversion chain was found, there is
665  // nothing to do.
666  if (!EarliestReplacement)
667  return std::nullopt;
668 
669  return IC.replaceInstUsesWith(II, EarliestReplacement);
670 }
671 
672 static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
673  IntrinsicInst &II) {
674  IRBuilder<> Builder(&II);
675  auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1),
676  II.getOperand(2));
677  return IC.replaceInstUsesWith(II, Select);
678 }
679 
680 static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
681  IntrinsicInst &II) {
682  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
683  if (!Pg)
684  return std::nullopt;
685 
686  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
687  return std::nullopt;
688 
689  const auto PTruePattern =
690  cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
691  if (PTruePattern != AArch64SVEPredPattern::vl1)
692  return std::nullopt;
693 
694  // The intrinsic is inserting into lane zero so use an insert instead.
695  auto *IdxTy = Type::getInt64Ty(II.getContext());
696  auto *Insert = InsertElementInst::Create(
697  II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
698  Insert->insertBefore(&II);
699  Insert->takeName(&II);
700 
701  return IC.replaceInstUsesWith(II, Insert);
702 }
703 
704 static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
705  IntrinsicInst &II) {
706  // Replace DupX with a regular IR splat.
707  IRBuilder<> Builder(II.getContext());
708  Builder.SetInsertPoint(&II);
709  auto *RetTy = cast<ScalableVectorType>(II.getType());
710  Value *Splat =
711  Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
712  Splat->takeName(&II);
713  return IC.replaceInstUsesWith(II, Splat);
714 }
715 
716 static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
717  IntrinsicInst &II) {
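  // Try to fold an all-active compare-not-equal against zero, where the other
  // operand is a dupq of a constant vector, into an equivalent ptrue (or
  // pfalse) predicate.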
718  LLVMContext &Ctx = II.getContext();
719  IRBuilder<> Builder(Ctx);
720  Builder.SetInsertPoint(&II);
721 
722  // Check that the predicate is all active
723  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
724  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
725  return std::nullopt;
726 
727  const auto PTruePattern =
728  cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
729  if (PTruePattern != AArch64SVEPredPattern::all)
730  return std::nullopt;
731 
732  // Check that we have a compare of zero..
733  auto *SplatValue =
734  dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
735  if (!SplatValue || !SplatValue->isZero())
736  return std::nullopt;
737 
738  // ..against a dupq
739  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
740  if (!DupQLane ||
741  DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
742  return std::nullopt;
743 
744  // Where the dupq is a lane 0 replicate of a vector insert
745  if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
746  return std::nullopt;
747 
748  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
749  if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
750  return std::nullopt;
751 
752  // Where the vector insert is a fixed constant vector insert into undef at
753  // index zero
754  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
755  return std::nullopt;
756 
757  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
758  return std::nullopt;
759 
760  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
761  if (!ConstVec)
762  return std::nullopt;
763 
764  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
765  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
766  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
767  return std::nullopt;
768 
769  unsigned NumElts = VecTy->getNumElements();
770  unsigned PredicateBits = 0;
771 
772  // Expand intrinsic operands to a 16-bit byte level predicate
773  for (unsigned I = 0; I < NumElts; ++I) {
774  auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
775  if (!Arg)
776  return std::nullopt;
777  if (!Arg->isZero())
778  PredicateBits |= 1 << (I * (16 / NumElts));
779  }
780 
781  // If all bits are zero bail early with an empty predicate
782  if (PredicateBits == 0) {
783  auto *PFalse = Constant::getNullValue(II.getType());
784  PFalse->takeName(&II);
785  return IC.replaceInstUsesWith(II, PFalse);
786  }
787 
788  // Calculate largest predicate type used (where byte predicate is largest)
789  unsigned Mask = 8;
790  for (unsigned I = 0; I < 16; ++I)
791  if ((PredicateBits & (1 << I)) != 0)
792  Mask |= (I % 8);
793 
794  unsigned PredSize = Mask & -Mask;
795  auto *PredType = ScalableVectorType::get(
796  Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
797 
798  // Ensure all relevant bits are set
799  for (unsigned I = 0; I < 16; I += PredSize)
800  if ((PredicateBits & (1 << I)) == 0)
801  return std::nullopt;
802 
803  auto *PTruePat =
804  ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
805  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
806  {PredType}, {PTruePat});
807  auto *ConvertToSVBool = Builder.CreateIntrinsic(
808  Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
809  auto *ConvertFromSVBool =
810  Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
811  {II.getType()}, {ConvertToSVBool});
812 
813  ConvertFromSVBool->takeName(&II);
814  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
815 }
816 
817 static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
818  IntrinsicInst &II) {
819  IRBuilder<> Builder(II.getContext());
820  Builder.SetInsertPoint(&II);
821  Value *Pg = II.getArgOperand(0);
822  Value *Vec = II.getArgOperand(1);
823  auto IntrinsicID = II.getIntrinsicID();
824  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
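  // lasta extracts the element after the last active predicate lane; lastb
  // extracts the element in the last active lane itself.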
825 
826  // lastX(splat(X)) --> X
827  if (auto *SplatVal = getSplatValue(Vec))
828  return IC.replaceInstUsesWith(II, SplatVal);
829 
830  // If x and/or y is a splat value then:
831  // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
832  Value *LHS, *RHS;
833  if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
834  if (isSplatValue(LHS) || isSplatValue(RHS)) {
835  auto *OldBinOp = cast<BinaryOperator>(Vec);
836  auto OpC = OldBinOp->getOpcode();
837  auto *NewLHS =
838  Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
839  auto *NewRHS =
840  Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
841  auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
842  OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
843  return IC.replaceInstUsesWith(II, NewBinOp);
844  }
845  }
846 
847  auto *C = dyn_cast<Constant>(Pg);
848  if (IsAfter && C && C->isNullValue()) {
849  // The intrinsic is extracting lane 0 so use an extract instead.
850  auto *IdxTy = Type::getInt64Ty(II.getContext());
851  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
852  Extract->insertBefore(&II);
853  Extract->takeName(&II);
854  return IC.replaceInstUsesWith(II, Extract);
855  }
856 
857  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
858  if (!IntrPG)
859  return std::nullopt;
860 
861  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
862  return std::nullopt;
863 
864  const auto PTruePattern =
865  cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
866 
867  // Can the intrinsic's predicate be converted to a known constant index?
868  unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
869  if (!MinNumElts)
870  return std::nullopt;
871 
872  unsigned Idx = MinNumElts - 1;
873  // Increment the index if extracting the element after the last active
874  // predicate element.
875  if (IsAfter)
876  ++Idx;
877 
878  // Ignore extracts whose index is larger than the known minimum vector
879  // length. NOTE: This is an artificial constraint where we prefer to
880  // maintain what the user asked for until an alternative is proven faster.
881  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
882  if (Idx >= PgVTy->getMinNumElements())
883  return std::nullopt;
884 
885  // The intrinsic is extracting a fixed lane so use an extract instead.
886  auto *IdxTy = Type::getInt64Ty(II.getContext());
887  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
888  Extract->insertBefore(&II);
889  Extract->takeName(&II);
890  return IC.replaceInstUsesWith(II, Extract);
891 }
892 
893 static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
894  IntrinsicInst &II) {
895  // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
896  // integer variant across a variety of micro-architectures. Replace scalar
897  // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
898  // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
899  // depending on the micro-architecture, but has been observed as generally
900  // being faster, particularly when the CLAST[AB] op is a loop-carried
901  // dependency.
902  IRBuilder<> Builder(II.getContext());
903  Builder.SetInsertPoint(&II);
904  Value *Pg = II.getArgOperand(0);
905  Value *Fallback = II.getArgOperand(1);
906  Value *Vec = II.getArgOperand(2);
907  Type *Ty = II.getType();
908 
909  if (!Ty->isIntegerTy())
910  return std::nullopt;
911 
912  Type *FPTy;
913  switch (cast<IntegerType>(Ty)->getBitWidth()) {
914  default:
915  return std::nullopt;
916  case 16:
917  FPTy = Builder.getHalfTy();
918  break;
919  case 32:
920  FPTy = Builder.getFloatTy();
921  break;
922  case 64:
923  FPTy = Builder.getDoubleTy();
924  break;
925  }
926 
927  Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy);
928  auto *FPVTy = VectorType::get(
929  FPTy, cast<VectorType>(Vec->getType())->getElementCount());
930  Value *FPVec = Builder.CreateBitCast(Vec, FPVTy);
931  auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()},
932  {Pg, FPFallBack, FPVec});
933  Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType());
934  return IC.replaceInstUsesWith(II, FPIItoInt);
935 }
936 
937 static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
938  IntrinsicInst &II) {
939  LLVMContext &Ctx = II.getContext();
940  IRBuilder<> Builder(Ctx);
941  Builder.SetInsertPoint(&II);
942  // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
943  // can work with RDFFR_PP for ptest elimination.
944  auto *AllPat =
945  ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
946  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
947  {II.getType()}, {AllPat});
948  auto *RDFFR =
949  Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
950  RDFFR->takeName(&II);
951  return IC.replaceInstUsesWith(II, RDFFR);
952 }
953 
954 static std::optional<Instruction *>
955 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
956  const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
957 
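  // With the "all" pattern, cntb/cnth/cntw/cntd count every element, i.e.
  // NumElts * vscale, so emit an llvm.vscale-based constant.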
958  if (Pattern == AArch64SVEPredPattern::all) {
959  LLVMContext &Ctx = II.getContext();
960  IRBuilder<> Builder(Ctx);
961  Builder.SetInsertPoint(&II);
962 
963  Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
964  auto *VScale = Builder.CreateVScale(StepVal);
965  VScale->takeName(&II);
966  return IC.replaceInstUsesWith(II, VScale);
967  }
968 
969  unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
970 
971  return MinNumElts && NumElts >= MinNumElts
972  ? std::optional<Instruction *>(IC.replaceInstUsesWith(
973  II, ConstantInt::get(II.getType(), MinNumElts)))
974  : std::nullopt;
975 }
976 
977 static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
978  IntrinsicInst &II) {
979  Value *PgVal = II.getArgOperand(0);
980  Value *OpVal = II.getArgOperand(1);
981 
982  IRBuilder<> Builder(II.getContext());
983  Builder.SetInsertPoint(&II);
984 
985  // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
986  // Later optimizations prefer this form.
987  if (PgVal == OpVal &&
988  (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
989  II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
990  Value *Ops[] = {PgVal, OpVal};
991  Type *Tys[] = {PgVal->getType()};
992 
993  auto *PTest =
994  Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
995  PTest->takeName(&II);
996 
997  return IC.replaceInstUsesWith(II, PTest);
998  }
999 
1000  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1001  IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1002 
1003  if (!Pg || !Op)
1004  return std::nullopt;
1005 
1006  Intrinsic::ID OpIID = Op->getIntrinsicID();
1007 
1008  if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1009  OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1010  Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1011  Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1012  Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1013 
1014  auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1015 
1016  PTest->takeName(&II);
1017  return IC.replaceInstUsesWith(II, PTest);
1018  }
1019 
1020  // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1021  // Later optimizations may rewrite sequence to use the flag-setting variant
1022  // of instruction X to remove PTEST.
1023  if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1024  ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1025  (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1026  (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1027  (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1028  (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1029  (OpIID == Intrinsic::aarch64_sve_and_z) ||
1030  (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1031  (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1032  (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1033  (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1034  (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1035  (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1036  Value *Ops[] = {Pg->getArgOperand(0), Pg};
1037  Type *Tys[] = {Pg->getType()};
1038 
1039  auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1040  PTest->takeName(&II);
1041 
1042  return IC.replaceInstUsesWith(II, PTest);
1043  }
1044 
1045  return std::nullopt;
1046 }
1047 
1048 static std::optional<Instruction *>
1049 instCombineSVEVectorFMLA(InstCombiner &IC, IntrinsicInst &II) {
1050  // fold (fadd p a (fmul p b c)) -> (fma p a b c)
1051  Value *P = II.getOperand(0);
1052  Value *A = II.getOperand(1);
1053  auto FMul = II.getOperand(2);
1054  Value *B, *C;
1055  if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>(
1056  m_Specific(P), m_Value(B), m_Value(C))))
1057  return std::nullopt;
1058 
1059  if (!FMul->hasOneUse())
1060  return std::nullopt;
1061 
1062  llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1063  // Stop the combine when the flags on the inputs differ in case dropping flags
1064  // would lead to us missing out on more beneficial optimizations.
1065  if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags())
1066  return std::nullopt;
1067  if (!FAddFlags.allowContract())
1068  return std::nullopt;
1069 
1070  IRBuilder<> Builder(II.getContext());
1071  Builder.SetInsertPoint(&II);
1072  auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla,
1073  {II.getType()}, {P, A, B, C}, &II);
1074  FMLA->setFastMathFlags(FAddFlags);
1075  return IC.replaceInstUsesWith(II, FMLA);
1076 }
1077 
1078 static bool isAllActivePredicate(Value *Pred) {
1079  // Look through convert.from.svbool(convert.to.svbool(...)) chains.
1080  Value *UncastedPred;
1081  if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1082  m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1083  m_Value(UncastedPred)))))
1084  // If the predicate has the same or less lanes than the uncasted
1085  // predicate then we know the casting has no effect.
1086  if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1087  cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1088  Pred = UncastedPred;
1089 
1090  return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1091  m_ConstantInt<AArch64SVEPredPattern::all>()));
1092 }
1093 
1094 static std::optional<Instruction *>
1095 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1096  IRBuilder<> Builder(II.getContext());
1097  Builder.SetInsertPoint(&II);
1098 
1099  Value *Pred = II.getOperand(0);
1100  Value *PtrOp = II.getOperand(1);
1101  Type *VecTy = II.getType();
1102  Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());
1103 
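  // An ld1 with an all-active predicate is just an unmasked load; otherwise
  // lower it to the generic masked.load with a zero passthru.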
1104  if (isAllActivePredicate(Pred)) {
1105  LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
1106  Load->copyMetadata(II);
1107  return IC.replaceInstUsesWith(II, Load);
1108  }
1109 
1110  CallInst *MaskedLoad =
1111  Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
1112  Pred, ConstantAggregateZero::get(VecTy));
1113  MaskedLoad->copyMetadata(II);
1114  return IC.replaceInstUsesWith(II, MaskedLoad);
1115 }
1116 
1117 static std::optional<Instruction *>
1118 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1119  IRBuilder<> Builder(II.getContext());
1120  Builder.SetInsertPoint(&II);
1121 
1122  Value *VecOp = II.getOperand(0);
1123  Value *Pred = II.getOperand(1);
1124  Value *PtrOp = II.getOperand(2);
1125  Value *VecPtr =
1126  Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());
1127 
1128  if (isAllActivePredicate(Pred)) {
1129  StoreInst *Store = Builder.CreateStore(VecOp, VecPtr);
1130  Store->copyMetadata(II);
1131  return IC.eraseInstFromFunction(II);
1132  }
1133 
1134  CallInst *MaskedStore = Builder.CreateMaskedStore(
1135  VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred);
1136  MaskedStore->copyMetadata(II);
1137  return IC.eraseInstFromFunction(II);
1138 }
1139 
1140 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1141  switch (Intrinsic) {
1142  case Intrinsic::aarch64_sve_fmul:
1143  return Instruction::BinaryOps::FMul;
1144  case Intrinsic::aarch64_sve_fadd:
1145  return Instruction::BinaryOps::FAdd;
1146  case Intrinsic::aarch64_sve_fsub:
1147  return Instruction::BinaryOps::FSub;
1148  default:
1149  return Instruction::BinaryOpsEnd;
1150  }
1151 }
1152 
1153 static std::optional<Instruction *>
1154 instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1155  auto *OpPredicate = II.getOperand(0);
1156  auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1157  if (BinOpCode == Instruction::BinaryOpsEnd ||
1158  !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1159  m_ConstantInt<AArch64SVEPredPattern::all>())))
1160  return std::nullopt;
1161  IRBuilder<> Builder(II.getContext());
1162  Builder.SetInsertPoint(&II);
1163  Builder.setFastMathFlags(II.getFastMathFlags());
1164  auto BinOp =
1165  Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1166  return IC.replaceInstUsesWith(II, BinOp);
1167 }
1168 
1169 static std::optional<Instruction *>
1170 instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1171  if (auto FMLA = instCombineSVEVectorFMLA(IC, II))
1172  return FMLA;
1173  return instCombineSVEVectorBinOp(IC, II);
1174 }
1175 
1176 static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1177  IntrinsicInst &II) {
1178  auto *OpPredicate = II.getOperand(0);
1179  auto *OpMultiplicand = II.getOperand(1);
1180  auto *OpMultiplier = II.getOperand(2);
1181 
1182  IRBuilder<> Builder(II.getContext());
1183  Builder.SetInsertPoint(&II);
1184 
1185  // Return true if a given instruction is a unit splat value, false otherwise.
1186  auto IsUnitSplat = [](auto *I) {
1187  auto *SplatValue = getSplatValue(I);
1188  if (!SplatValue)
1189  return false;
1190  return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1191  };
1192 
1193  // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1194  // with a unit splat value, false otherwise.
1195  auto IsUnitDup = [](auto *I) {
1196  auto *IntrI = dyn_cast<IntrinsicInst>(I);
1197  if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1198  return false;
1199 
1200  auto *SplatValue = IntrI->getOperand(2);
1201  return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1202  };
1203 
1204  if (IsUnitSplat(OpMultiplier)) {
1205  // [f]mul pg %n, (dupx 1) => %n
1206  OpMultiplicand->takeName(&II);
1207  return IC.replaceInstUsesWith(II, OpMultiplicand);
1208  } else if (IsUnitDup(OpMultiplier)) {
1209  // [f]mul pg %n, (dup pg 1) => %n
1210  auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1211  auto *DupPg = DupInst->getOperand(1);
1212  // TODO: this is naive. The optimization is still valid if DupPg
1213  // 'encompasses' OpPredicate, not only if they're the same predicate.
1214  if (OpPredicate == DupPg) {
1215  OpMultiplicand->takeName(&II);
1216  return IC.replaceInstUsesWith(II, OpMultiplicand);
1217  }
1218  }
1219 
1220  return instCombineSVEVectorBinOp(IC, II);
1221 }
1222 
1223 static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1224  IntrinsicInst &II) {
1225  IRBuilder<> Builder(II.getContext());
1226  Builder.SetInsertPoint(&II);
1227  Value *UnpackArg = II.getArgOperand(0);
1228  auto *RetTy = cast<ScalableVectorType>(II.getType());
1229  bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1230  II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1231 
1232  // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1233  // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1234  if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1235  ScalarArg =
1236  Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1237  Value *NewVal =
1238  Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1239  NewVal->takeName(&II);
1240  return IC.replaceInstUsesWith(II, NewVal);
1241  }
1242 
1243  return std::nullopt;
1244 }
1245 static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1246  IntrinsicInst &II) {
1247  auto *OpVal = II.getOperand(0);
1248  auto *OpIndices = II.getOperand(1);
1249  VectorType *VTy = cast<VectorType>(II.getType());
1250 
1251  // Check whether OpIndices is a constant splat value < minimal element count
1252  // of result.
1253  auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1254  if (!SplatValue ||
1255  SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1256  return std::nullopt;
1257 
1258  // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
1259  // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1260  IRBuilder<> Builder(II.getContext());
1261  Builder.SetInsertPoint(&II);
1262  auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
1263  auto *VectorSplat =
1264  Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1265 
1266  VectorSplat->takeName(&II);
1267  return IC.replaceInstUsesWith(II, VectorSplat);
1268 }
1269 
1270 static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1271  IntrinsicInst &II) {
1272  // zip1(uzp1(A, B), uzp2(A, B)) --> A
1273  // zip2(uzp1(A, B), uzp2(A, B)) --> B
1274  Value *A, *B;
1275  if (match(II.getArgOperand(0),
1276  m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1277  match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1278  m_Specific(A), m_Specific(B))))
1279  return IC.replaceInstUsesWith(
1280  II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1281 
1282  return std::nullopt;
1283 }
1284 
1285 static std::optional<Instruction *>
1286 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1287  Value *Mask = II.getOperand(0);
1288  Value *BasePtr = II.getOperand(1);
1289  Value *Index = II.getOperand(2);
1290  Type *Ty = II.getType();
1291  Value *PassThru = ConstantAggregateZero::get(Ty);
1292 
1293  // Contiguous gather => masked load.
1294  // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1295  // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1296  Value *IndexBase;
1297  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1298  m_Value(IndexBase), m_SpecificInt(1)))) {
1299  IRBuilder<> Builder(II.getContext());
1300  Builder.SetInsertPoint(&II);
1301 
1302  Align Alignment =
1303  BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1304 
1305  Type *VecPtrTy = PointerType::getUnqual(Ty);
1306  Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1307  BasePtr, IndexBase);
1308  Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1309  CallInst *MaskedLoad =
1310  Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1311  MaskedLoad->takeName(&II);
1312  return IC.replaceInstUsesWith(II, MaskedLoad);
1313  }
1314 
1315  return std::nullopt;
1316 }
1317 
1318 static std::optional<Instruction *>
1319 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1320  Value *Val = II.getOperand(0);
1321  Value *Mask = II.getOperand(1);
1322  Value *BasePtr = II.getOperand(2);
1323  Value *Index = II.getOperand(3);
1324  Type *Ty = Val->getType();
1325 
1326  // Contiguous scatter => masked store.
1327  // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1328  // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1329  Value *IndexBase;
1330  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1331  m_Value(IndexBase), m_SpecificInt(1)))) {
1332  IRBuilder<> Builder(II.getContext());
1333  Builder.SetInsertPoint(&II);
1334 
1335  Align Alignment =
1336  BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1337 
1338  Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1339  BasePtr, IndexBase);
1340  Type *VecPtrTy = PointerType::getUnqual(Ty);
1341  Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1342 
1343  (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1344 
1345  return IC.eraseInstFromFunction(II);
1346  }
1347 
1348  return std::nullopt;
1349 }
1350 
1351 static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1352  IntrinsicInst &II) {
1353  IRBuilder<> Builder(II.getContext());
1354  Builder.SetInsertPoint(&II);
1355  Type *Int32Ty = Builder.getInt32Ty();
1356  Value *Pred = II.getOperand(0);
1357  Value *Vec = II.getOperand(1);
1358  Value *DivVec = II.getOperand(2);
1359 
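  // A signed divide by a splat of +/-2^k can be lowered to the SVE ASRD
  // (arithmetic shift right for divide) intrinsic, with an extra negate for
  // negative divisors.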
1360  Value *SplatValue = getSplatValue(DivVec);
1361  ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1362  if (!SplatConstantInt)
1363  return std::nullopt;
1364  APInt Divisor = SplatConstantInt->getValue();
1365 
1366  if (Divisor.isPowerOf2()) {
1367  Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1368  auto ASRD = Builder.CreateIntrinsic(
1369  Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1370  return IC.replaceInstUsesWith(II, ASRD);
1371  }
1372  if (Divisor.isNegatedPowerOf2()) {
1373  Divisor.negate();
1374  Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1375  auto ASRD = Builder.CreateIntrinsic(
1376  Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1377  auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
1378  {ASRD->getType()}, {ASRD, Pred, ASRD});
1379  return IC.replaceInstUsesWith(II, NEG);
1380  }
1381 
1382  return std::nullopt;
1383 }
1384 
1385 static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1386  IntrinsicInst &II) {
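  // fmaxnm/fminnm with identical operands folds to that operand.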
1387  Value *A = II.getArgOperand(0);
1388  Value *B = II.getArgOperand(1);
1389  if (A == B)
1390  return IC.replaceInstUsesWith(II, A);
1391 
1392  return std::nullopt;
1393 }
1394 
1395 static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1396  IntrinsicInst &II) {
1397  IRBuilder<> Builder(&II);
1398  Value *Pred = II.getOperand(0);
1399  Value *Vec = II.getOperand(1);
1400  Value *Shift = II.getOperand(2);
1401 
1402  // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1403  Value *AbsPred, *MergedValue;
1404  if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
1405  m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
1406  !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
1407  m_Value(MergedValue), m_Value(AbsPred), m_Value())))
1408 
1409  return std::nullopt;
1410 
1411  // Transform is valid if any of the following are true:
1412  // * The ABS merge value is an undef or non-negative
1413  // * The ABS predicate is all active
1414  // * The ABS predicate and the SRSHL predicates are the same
1415  if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
1416  AbsPred != Pred && !isAllActivePredicate(AbsPred))
1417  return std::nullopt;
1418 
1419  // Only valid when the shift amount is non-negative, otherwise the rounding
1420  // behaviour of SRSHL cannot be ignored.
1421  if (!match(Shift, m_NonNegative()))
1422  return std::nullopt;
1423 
1424  auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()},
1425  {Pred, Vec, Shift});
1426 
1427  return IC.replaceInstUsesWith(II, LSL);
1428 }
1429 
1430 std::optional<Instruction *>
1432  IntrinsicInst &II) const {
1433  Intrinsic::ID IID = II.getIntrinsicID();
1434  switch (IID) {
1435  default:
1436  break;
1437  case Intrinsic::aarch64_neon_fmaxnm:
1438  case Intrinsic::aarch64_neon_fminnm:
1439  return instCombineMaxMinNM(IC, II);
1440  case Intrinsic::aarch64_sve_convert_from_svbool:
1441  return instCombineConvertFromSVBool(IC, II);
1442  case Intrinsic::aarch64_sve_dup:
1443  return instCombineSVEDup(IC, II);
1444  case Intrinsic::aarch64_sve_dup_x:
1445  return instCombineSVEDupX(IC, II);
1446  case Intrinsic::aarch64_sve_cmpne:
1447  case Intrinsic::aarch64_sve_cmpne_wide:
1448  return instCombineSVECmpNE(IC, II);
1449  case Intrinsic::aarch64_sve_rdffr:
1450  return instCombineRDFFR(IC, II);
1451  case Intrinsic::aarch64_sve_lasta:
1452  case Intrinsic::aarch64_sve_lastb:
1453  return instCombineSVELast(IC, II);
1454  case Intrinsic::aarch64_sve_clasta_n:
1455  case Intrinsic::aarch64_sve_clastb_n:
1456  return instCombineSVECondLast(IC, II);
1457  case Intrinsic::aarch64_sve_cntd:
1458  return instCombineSVECntElts(IC, II, 2);
1459  case Intrinsic::aarch64_sve_cntw:
1460  return instCombineSVECntElts(IC, II, 4);
1461  case Intrinsic::aarch64_sve_cnth:
1462  return instCombineSVECntElts(IC, II, 8);
1463  case Intrinsic::aarch64_sve_cntb:
1464  return instCombineSVECntElts(IC, II, 16);
1465  case Intrinsic::aarch64_sve_ptest_any:
1466  case Intrinsic::aarch64_sve_ptest_first:
1467  case Intrinsic::aarch64_sve_ptest_last:
1468  return instCombineSVEPTest(IC, II);
1469  case Intrinsic::aarch64_sve_mul:
1470  case Intrinsic::aarch64_sve_fmul:
1471  return instCombineSVEVectorMul(IC, II);
1472  case Intrinsic::aarch64_sve_fadd:
1473  return instCombineSVEVectorFAdd(IC, II);
1474  case Intrinsic::aarch64_sve_fsub:
1475  return instCombineSVEVectorBinOp(IC, II);
1476  case Intrinsic::aarch64_sve_tbl:
1477  return instCombineSVETBL(IC, II);
1478  case Intrinsic::aarch64_sve_uunpkhi:
1479  case Intrinsic::aarch64_sve_uunpklo:
1480  case Intrinsic::aarch64_sve_sunpkhi:
1481  case Intrinsic::aarch64_sve_sunpklo:
1482  return instCombineSVEUnpack(IC, II);
1483  case Intrinsic::aarch64_sve_zip1:
1484  case Intrinsic::aarch64_sve_zip2:
1485  return instCombineSVEZip(IC, II);
1486  case Intrinsic::aarch64_sve_ld1_gather_index:
1487  return instCombineLD1GatherIndex(IC, II);
1488  case Intrinsic::aarch64_sve_st1_scatter_index:
1489  return instCombineST1ScatterIndex(IC, II);
1490  case Intrinsic::aarch64_sve_ld1:
1491  return instCombineSVELD1(IC, II, DL);
1492  case Intrinsic::aarch64_sve_st1:
1493  return instCombineSVEST1(IC, II, DL);
1494  case Intrinsic::aarch64_sve_sdiv:
1495  return instCombineSVESDIV(IC, II);
1496  case Intrinsic::aarch64_sve_sel:
1497  return instCombineSVESel(IC, II);
1498  case Intrinsic::aarch64_sve_srshl:
1499  return instCombineSVESrshl(IC, II);
1500  }
1501 
1502  return std::nullopt;
1503 }
1504 
1505 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1506  InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
1507  APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
1508  std::function<void(Instruction *, unsigned, APInt, APInt &)>
1509  SimplifyAndSetOp) const {
1510  switch (II.getIntrinsicID()) {
1511  default:
1512  break;
1513  case Intrinsic::aarch64_neon_fcvtxn:
1514  case Intrinsic::aarch64_neon_rshrn:
1515  case Intrinsic::aarch64_neon_sqrshrn:
1516  case Intrinsic::aarch64_neon_sqrshrun:
1517  case Intrinsic::aarch64_neon_sqshrn:
1518  case Intrinsic::aarch64_neon_sqshrun:
1519  case Intrinsic::aarch64_neon_sqxtn:
1520  case Intrinsic::aarch64_neon_sqxtun:
1521  case Intrinsic::aarch64_neon_uqrshrn:
1522  case Intrinsic::aarch64_neon_uqshrn:
1523  case Intrinsic::aarch64_neon_uqxtn:
1524  SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
1525  break;
1526  }
1527 
1528  return std::nullopt;
1529 }
1530 
1531 TypeSize
1532 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
1533  switch (K) {
1534  case TargetTransformInfo::RGK_Scalar:
1535  return TypeSize::getFixed(64);
1536  case TargetTransformInfo::RGK_FixedWidthVector:
1537  if (!ST->isStreamingSVEModeDisabled() &&
1538  !EnableFixedwidthAutovecInStreamingMode)
1539  return TypeSize::getFixed(0);
1540 
1541  if (ST->hasSVE())
1542  return TypeSize::getFixed(
1543  std::max(ST->getMinSVEVectorSizeInBits(), 128u));
1544 
1545  return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
1546  case TargetTransformInfo::RGK_ScalableVector:
1547  if (!ST->isStreamingSVEModeDisabled() && !EnableScalableAutovecInStreamingMode)
1548  return TypeSize::getScalable(0);
1549 
1550  return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
1551  }
1552  llvm_unreachable("Unsupported register kind");
1553 }
1554 
1555 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
1556  ArrayRef<const Value *> Args) {
1557 
1558  // A helper that constructs a vector type from the scalar type of the given
1559  // type, using the same element count as DstTy.
1560  auto toVectorTy = [&](Type *ArgTy) {
1561  return VectorType::get(ArgTy->getScalarType(),
1562  cast<VectorType>(DstTy)->getElementCount());
1563  };
1564 
1565  // Exit early if DstTy is not a vector type whose elements are at least
1566  // 16-bits wide. SVE doesn't generally have the same set of instructions to
1567  // perform an extend with the add/sub/mul. There are SMULLB style
1568  // instructions, but they operate on top/bottom, requiring some sort of lane
1569  // interleaving to be used with zext/sext.
1570  if (!useNeonVector(DstTy) || DstTy->getScalarSizeInBits() < 16)
1571  return false;
1572 
1573  // Determine if the operation has a widening variant. We consider both the
1574  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
1575  // instructions.
1576  //
1577  // TODO: Add additional widening operations (e.g., shl, etc.) once we
1578  // verify that their extending operands are eliminated during code
1579  // generation.
1580  switch (Opcode) {
1581  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
1582  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
1583  case Instruction::Mul: // SMULL(2), UMULL(2)
1584  break;
1585  default:
1586  return false;
1587  }
1588 
1589  // To be a widening instruction (either the "wide" or "long" versions), the
1590  // second operand must be a sign- or zero-extend.
1591  if (Args.size() != 2 ||
1592  (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])))
1593  return false;
1594  auto *Extend = cast<CastInst>(Args[1]);
1595  auto *Arg0 = dyn_cast<CastInst>(Args[0]);
1596 
1597  // A mul only has a mull version (there is no mulw, unlike add/sub). Both
1598  // operands need to be extends of the same source type.
1599  if (Opcode == Instruction::Mul &&
1600  (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() ||
1601  Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType()))
1602  return false;
1603 
1604  // Legalize the destination type and ensure it can be used in a widening
1605  // operation.
1606  auto DstTyL = getTypeLegalizationCost(DstTy);
1607  unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
1608  if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
1609  return false;
1610 
1611  // Legalize the source type and ensure it can be used in a widening
1612  // operation.
1613  auto *SrcTy = toVectorTy(Extend->getSrcTy());
1614  auto SrcTyL = getTypeLegalizationCost(SrcTy);
1615  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
1616  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
1617  return false;
1618 
1619  // Get the total number of vector elements in the legalized types.
1620  InstructionCost NumDstEls =
1621  DstTyL.first * DstTyL.second.getVectorMinNumElements();
1622  InstructionCost NumSrcEls =
1623  SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
1624 
1625  // Return true if the legalized types have the same number of vector elements
1626  // and the destination element type size is twice that of the source type.
1627  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
1628 }
1629 
1630 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1631  Type *Src,
1632  TTI::CastContextHint CCH,
1633  TTI::TargetCostKind CostKind,
1634  const Instruction *I) {
1635  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1636  assert(ISD && "Invalid opcode");
1637 
1638  // If the cast is observable, and it is used by a widening instruction (e.g.,
1639  // uaddl, saddw, etc.), it may be free.
1640  if (I && I->hasOneUser()) {
1641  auto *SingleUser = cast<Instruction>(*I->user_begin());
1642  SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
1643  if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
1644  // If the cast is the second operand, it is free. We will generate either
1645  // a "wide" or "long" version of the widening instruction.
1646  if (I == SingleUser->getOperand(1))
1647  return 0;
1648  // If the cast is not the second operand, it will be free if it looks the
1649  // same as the second operand. In this case, we will generate a "long"
1650  // version of the widening instruction.
1651  if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
1652  if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
1653  cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
1654  return 0;
1655  }
1656  }
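  // In practice this means that in a pattern like add(zext %x, zext %y) both
  // casts are typically modelled as free here, since a single uaddl-style
  // instruction is expected to consume both extends.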
1657 
1658  // TODO: Allow non-throughput costs that aren't binary.
1659  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1660  if (CostKind != TTI::TCK_RecipThroughput)
1661  return Cost == 0 ? 0 : 1;
1662  return Cost;
1663  };
1664 
1665  EVT SrcTy = TLI->getValueType(DL, Src);
1666  EVT DstTy = TLI->getValueType(DL, Dst);
1667 
1668  if (!SrcTy.isSimple() || !DstTy.isSimple())
1669  return AdjustCost(
1670  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1671 
1672  static const TypeConversionCostTblEntry
1673  ConversionTbl[] = {
1674  { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
1675  { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
1676  { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
1677  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
1678  { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
1679  { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
1680  { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
1681  { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
1682  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
1683  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
1684  { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
1685  { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
1686  { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
1687  { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
1688  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
1689  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
1690  { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
1691  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
1692  { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
1693  { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
1694 
1695  // Truncations on nxvmiN
1712 
1713  // The number of shll instructions for the extension.
1730 
1731  // LowerVectorINT_TO_FP:
1738 
1739  // Complex: to v2f32
1746 
1747  // Complex: to v4f32
1752 
1753  // Complex: to v8f32
1758 
1759  // Complex: to v16f32
1762 
1763  // Complex: to v2f64
1770 
1771  // Complex: to v4f64
1774 
1775  // LowerVectorFP_TO_INT
1782 
1783  // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
1790 
1791  // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
1796 
1797  // Complex, from nxv2f32.
1806 
1807  // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
1814 
1815  // Complex, from nxv2f64.
1824 
1825  // Complex, from nxv4f32.
1834 
1835  // Complex, from nxv8f64. Illegal -> illegal conversions not required.
1840 
1841  // Complex, from nxv4f64. Illegal -> illegal conversions not required.
1848 
1849  // Complex, from nxv8f32. Illegal -> illegal conversions not required.
1854 
1855  // Complex, from nxv8f16.
1864 
1865  // Complex, from nxv4f16.
1874 
1875  // Complex, from nxv2f16.
1884 
1885  // Truncate from nxvmf32 to nxvmf16.
1889 
1890  // Truncate from nxvmf64 to nxvmf16.
1894 
1895  // Truncate from nxvmf64 to nxvmf32.
1899 
1900  // Extend from nxvmf16 to nxvmf32.
1904 
1905  // Extend from nxvmf16 to nxvmf64.
1909 
1910  // Extend from nxvmf32 to nxvmf64.
1914 
1915  // Bitcasts from float to integer
1919 
1920  // Bitcasts from integer to float
1924  };
1925 
1926  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
1927  DstTy.getSimpleVT(),
1928  SrcTy.getSimpleVT()))
1929  return AdjustCost(Entry->Cost);
1930 
1931  static const TypeConversionCostTblEntry FP16Tbl[] = {
1932  {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
1934  {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
1936  {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
1938  {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
1940  {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
1942  {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
1944  {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
1946  {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
1948  {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
1950  {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
1951  {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
1952  {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
1953  {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
1954  };
1955 
1956  if (ST->hasFullFP16())
1957  if (const auto *Entry = ConvertCostTableLookup(
1958  FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
1959  return AdjustCost(Entry->Cost);
1960 
1961  return AdjustCost(
1962  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1963 }
1964 
1965 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
1966  Type *Dst,
1967  VectorType *VecTy,
1968  unsigned Index) {
1969 
1970  // Make sure we were given a valid extend opcode.
1971  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
1972  "Invalid opcode");
1973 
1974  // We are extending an element we extract from a vector, so the source type
1975  // of the extend is the element type of the vector.
1976  auto *Src = VecTy->getElementType();
1977 
1978  // Sign- and zero-extends are for integer types only.
1979  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
1980 
1981  // Get the cost for the extract. We compute the cost (if any) for the extend
1982  // below.
1983  InstructionCost Cost =
1984  getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
1985 
1986  // Legalize the types.
1987  auto VecLT = getTypeLegalizationCost(VecTy);
1988  auto DstVT = TLI->getValueType(DL, Dst);
1989  auto SrcVT = TLI->getValueType(DL, Src);
1990  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1991 
1992  // If the resulting type is still a vector and the destination type is legal,
1993  // we may get the extension for free. If not, get the default cost for the
1994  // extend.
1995  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
1996  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1997  CostKind);
1998 
1999  // The destination type should be larger than the element type. If not, get
2000  // the default cost for the extend.
2001  if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2002  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2003  CostKind);
2004 
2005  switch (Opcode) {
2006  default:
2007  llvm_unreachable("Opcode should be either SExt or ZExt");
2008 
2009  // For sign-extends, we only need a smov, which performs the extension
2010  // automatically.
2011  case Instruction::SExt:
2012  return Cost;
2013 
2014  // For zero-extends, the extend is performed automatically by a umov unless
2015  // the destination type is i64 and the element type is i8 or i16.
2016  case Instruction::ZExt:
2017  if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2018  return Cost;
2019  }
2020 
2021  // If we are unable to perform the extend for free, get the default cost.
2022  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2023  CostKind);
2024 }
2025 
2026 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2027  TTI::TargetCostKind CostKind,
2028  const Instruction *I) {
2029  if (CostKind != TTI::TCK_RecipThroughput)
2030  return Opcode == Instruction::PHI ? 0 : 1;
2031  assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2032  // Branches are assumed to be predicted.
2033  return 0;
2034 }
2035 
2036 InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(Type *Val,
2037  unsigned Index,
2038  bool HasRealUse) {
2039  assert(Val->isVectorTy() && "This must be a vector type");
2040 
2041  if (Index != -1U) {
2042  // Legalize the type.
2043  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2044 
2045  // This type is legalized to a scalar type.
2046  if (!LT.second.isVector())
2047  return 0;
2048 
2049  // The type may be split. For fixed-width vectors we can normalize the
2050  // index to the new type.
2051  if (LT.second.isFixedLengthVector()) {
2052  unsigned Width = LT.second.getVectorNumElements();
2053  Index = Index % Width;
2054  }
2055 
2056  // The element at index zero is already inside the vector.
2057  // - For a physical (HasRealUse==true) insert-element or extract-element
2058  // instruction that extracts integers, an explicit FPR -> GPR move is
2059  // needed. So it has non-zero cost.
2060  // - For the rest of cases (virtual instruction or element type is float),
2061  // consider the instruction free.
2062  //
2063  // FIXME:
2064  // If the extract-element and insert-element instructions could be
2065  // simplified away (e.g., could be combined into users by looking at use-def
2066  // context), they have no cost. This is not done in the first place for
2067  // compile-time considerations.
2068  if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2069  return 0;
2070  }
2071 
2072  // All other insert/extracts cost this much.
2073  return ST->getVectorInsertExtractBaseCost();
2074 }
2075 
2076 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2077  unsigned Index) {
2078  return getVectorInstrCostHelper(Val, Index, false /* HasRealUse */);
2079 }
2080 
2081 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
2082  Type *Val, unsigned Index) {
2083  return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */);
2084 }
2085 
2086 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
2087  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2088  TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2089  ArrayRef<const Value *> Args,
2090  const Instruction *CxtI) {
2091 
2092  // TODO: Handle more cost kinds.
2093  if (CostKind != TTI::TCK_RecipThroughput)
2094  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2095  Op2Info, Args, CxtI);
2096 
2097  // Legalize the type.
2098  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2099  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2100 
2101  switch (ISD) {
2102  default:
2103  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2104  Op2Info);
2105  case ISD::SDIV:
2106  if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
2107  // On AArch64, scalar signed division by a power-of-two constant is
2108  // normally expanded to the sequence ADD + CMP + SELECT + SRA.
2109  // The OperandValue properties may not be the same as those of the
2110  // previous operation; conservatively assume OP_None.
2111  InstructionCost Cost = getArithmeticInstrCost(
2112  Instruction::Add, Ty, CostKind,
2113  Op1Info.getNoProps(), Op2Info.getNoProps());
2114  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
2115  Op1Info.getNoProps(), Op2Info.getNoProps());
2116  Cost += getArithmeticInstrCost(
2117  Instruction::Select, Ty, CostKind,
2118  Op1Info.getNoProps(), Op2Info.getNoProps());
2119  Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
2120  Op1Info.getNoProps(), Op2Info.getNoProps());
2121  return Cost;
2122  }
2123  [[fallthrough]];
2124  case ISD::UDIV: {
2125  if (Op2Info.isConstant() && Op2Info.isUniform()) {
2126  auto VT = TLI->getValueType(DL, Ty);
2127  if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
2128  // Vector signed division by a constant is expanded to the
2129  // sequence MULHS + ADD/SUB + SRA + SRL + ADD; unsigned division expands
2130  // to MULHU + SUB + SRL + ADD + SRL.
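  // As a rough illustration, this makes a division by a uniform constant cost
  // 2*Mul + 2*Add + 2*Shr + 1 in units of the corresponding vector ops, rather
  // than the scalarized fallback cost computed further below.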
2131  InstructionCost MulCost = getArithmeticInstrCost(
2132  Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2133  InstructionCost AddCost = getArithmeticInstrCost(
2134  Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2135  InstructionCost ShrCost = getArithmeticInstrCost(
2136  Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2137  return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
2138  }
2139  }
2140 
2141  InstructionCost Cost = BaseT::getArithmeticInstrCost(
2142  Opcode, Ty, CostKind, Op1Info, Op2Info);
2143  if (Ty->isVectorTy()) {
2144  if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
2145  // If SDIV/UDIV operations are lowered using SVE, then the cost is
2146  // lower.
2147  if (isa<FixedVectorType>(Ty) &&
2148  cast<FixedVectorType>(Ty)->getPrimitiveSizeInBits().getFixedSize() <
2149  128) {
2150  EVT VT = TLI->getValueType(DL, Ty);
2151  static const CostTblEntry DivTbl[]{
2152  {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
2153  {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
2155  {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
2156  {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
2157  {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
2158 
2159  const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
2160  if (nullptr != Entry)
2161  return Entry->Cost;
2162  }
2163  // For 8/16-bit elements, the cost is higher because the type
2164  // requires promotion and possibly splitting:
2165  if (LT.second.getScalarType() == MVT::i8)
2166  Cost *= 8;
2167  else if (LT.second.getScalarType() == MVT::i16)
2168  Cost *= 4;
2169  return Cost;
2170  } else {
2171  // If one of the operands is a uniform constant then the cost for each
2172  // element is the cost of insertion, extraction and division.
2173  // Insertion cost = 2, extraction cost = 2, division = cost of the
2174  // operation on the scalar type.
2175  if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
2176  (Op2Info.isConstant() && Op2Info.isUniform())) {
2177  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2178  InstructionCost DivCost = BaseT::getArithmeticInstrCost(
2179  Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
2180  return (4 + DivCost) * VTy->getNumElements();
2181  }
2182  }
2183  // On AArch64, without SVE, vector divisions are expanded
2184  // into scalar divisions of the corresponding pairs of elements.
2185  Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
2186  CostKind, Op1Info, Op2Info);
2187  Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
2188  Op1Info, Op2Info);
2189  }
2190 
2191  // TODO: if one of the arguments is scalar, then it's not necessary to
2192  // double the cost of handling the vector elements.
2193  Cost += Cost;
2194  }
2195  return Cost;
2196  }
2197  case ISD::MUL:
2198  // When SVE is available, we can lower the v2i64 operation using
2199  // the SVE mul instruction, which has a lower cost.
2200  if (LT.second == MVT::v2i64 && ST->hasSVE())
2201  return LT.first;
2202 
2203  // When SVE is not available, there is no MUL.2d instruction,
2204  // which means mul <2 x i64> is expensive as elements are extracted
2205  // from the vectors and the muls scalarized.
2206  // As getScalarizationOverhead is a bit too pessimistic, we
2207  // estimate the cost for a i64 vector directly here, which is:
2208  // - four 2-cost i64 extracts,
2209  // - two 2-cost i64 inserts, and
2210  // - two 1-cost muls.
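  // That is, 4 * 2 + 2 * 2 + 2 * 1 = 14 for each legalized v2i64 part.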
2211  // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
2212  // LT.first = 2 the cost is 28. If both operands are extensions it will not
2213  // need to scalarize so the cost can be cheaper (smull or umull).
2215  if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
2216  return LT.first;
2217  return LT.first * 14;
2218  case ISD::ADD:
2219  case ISD::XOR:
2220  case ISD::OR:
2221  case ISD::AND:
2222  case ISD::SRL:
2223  case ISD::SRA:
2224  case ISD::SHL:
2225  // These nodes are marked as 'custom' for combining purposes only.
2226  // We know that they are legal. See LowerAdd in ISelLowering.
2227  return LT.first;
2228 
2229  case ISD::FADD:
2230  case ISD::FSUB:
2231  case ISD::FMUL:
2232  case ISD::FDIV:
2233  case ISD::FNEG:
2234  // These nodes are marked as 'custom' just to lower them to SVE.
2235  // We know said lowering will incur no additional cost.
2236  if (!Ty->getScalarType()->isFP128Ty())
2237  return 2 * LT.first;
2238 
2239  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2240  Op2Info);
2241  }
2242 }
2243 
2244 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
2245  ScalarEvolution *SE,
2246  const SCEV *Ptr) {
2247  // Address computations in vectorized code with non-consecutive addresses will
2248  // likely result in more instructions compared to scalar code where the
2249  // computation can more often be merged into the index mode. The resulting
2250  // extra micro-ops can significantly decrease throughput.
2251  unsigned NumVectorInstToHideOverhead = 10;
2252  int MaxMergeDistance = 64;
2253 
2254  if (Ty->isVectorTy() && SE &&
2255  !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
2256  return NumVectorInstToHideOverhead;
2257 
2258  // In many cases the address computation is not merged into the instruction
2259  // addressing mode.
2260  return 1;
2261 }
2262 
2263 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2264  Type *CondTy,
2265  CmpInst::Predicate VecPred,
2266  TTI::TargetCostKind CostKind,
2267  const Instruction *I) {
2268  // TODO: Handle other cost kinds.
2269  if (CostKind != TTI::TCK_RecipThroughput)
2270  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2271  I);
2272 
2273  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2274  // We don't lower some vector selects well that are wider than the register
2275  // width.
2276  if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
2277  // We would need this many instructions to hide the scalarization happening.
2278  const int AmortizationCost = 20;
2279 
2280  // If VecPred is not set, check if we can get a predicate from the context
2281  // instruction, if its type matches the requested ValTy.
2282  if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
2283  CmpInst::Predicate CurrentPred;
2284  if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
2285  m_Value())))
2286  VecPred = CurrentPred;
2287  }
2288  // Check if we have a compare/select chain that can be lowered using
2289  // a (F)CMxx & BFI pair.
2290  if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
2291  VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
2292  VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
2293  VecPred == CmpInst::FCMP_UNE) {
2294  static const auto ValidMinMaxTys = {
2297  static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
2298 
2299  auto LT = getTypeLegalizationCost(ValTy);
2300  if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
2301  (ST->hasFullFP16() &&
2302  any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
2303  return LT.first;
2304  }
2305 
2306  static const TypeConversionCostTblEntry
2307  VectorSelectTbl[] = {
2309  { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
2311  { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
2312  { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
2313  { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
2314  };
2315 
2316  EVT SelCondTy = TLI->getValueType(DL, CondTy);
2317  EVT SelValTy = TLI->getValueType(DL, ValTy);
2318  if (SelCondTy.isSimple() && SelValTy.isSimple()) {
2319  if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
2320  SelCondTy.getSimpleVT(),
2321  SelValTy.getSimpleVT()))
2322  return Entry->Cost;
2323  }
2324  }
2325  // The base case handles scalable vectors fine for now, since it treats the
2326  // cost as 1 * legalization cost.
2327  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2328 }
2329 
2331 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
2332  TTI::MemCmpExpansionOptions Options;
2333  if (ST->requiresStrictAlign()) {
2334  // TODO: Add cost modeling for strict align. Misaligned loads expand to
2335  // a bunch of instructions when strict align is enabled.
2336  return Options;
2337  }
2338  Options.AllowOverlappingLoads = true;
2339  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
2340  Options.NumLoadsPerBlock = Options.MaxNumLoads;
2341  // TODO: Though vector loads usually perform well on AArch64, in some targets
2342  // they may wake up the FP unit, which raises the power consumption. Perhaps
2343  // they could be used with no holds barred (-O3).
2344  Options.LoadSizes = {8, 4, 2, 1};
2345  return Options;
2346 }
2347 
2348 bool AArch64TTIImpl::prefersVectorizedAddressing() const {
2349  return ST->hasSVE();
2350 }
2351 
2352 InstructionCost
2353 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
2354  Align Alignment, unsigned AddressSpace,
2355  TTI::TargetCostKind CostKind) {
2356  if (useNeonVector(Src))
2357  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2358  CostKind);
2359  auto LT = getTypeLegalizationCost(Src);
2360  if (!LT.first.isValid())
2361  return InstructionCost::getInvalid();
2362 
2363  // The code-generator is currently not able to handle scalable vectors
2364  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2365  // it. This change will be removed when code-generation for these types is
2366  // sufficiently reliable.
2367  if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
2368  return InstructionCost::getInvalid();
2369 
2370  return LT.first * 2;
2371 }
2372 
2373 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
2374  return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
2375 }
2376 
2377 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
2378  unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
2379  Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
2380  if (useNeonVector(DataTy))
2381  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
2382  Alignment, CostKind, I);
2383  auto *VT = cast<VectorType>(DataTy);
2384  auto LT = getTypeLegalizationCost(DataTy);
2385  if (!LT.first.isValid())
2386  return InstructionCost::getInvalid();
2387 
2388  // The code-generator is currently not able to handle scalable vectors
2389  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2390  // it. This change will be removed when code-generation for these types is
2391  // sufficiently reliable.
2392  if (cast<VectorType>(DataTy)->getElementCount() ==
2393  ElementCount::getScalable(1))
2394  return InstructionCost::getInvalid();
2395 
2396  ElementCount LegalVF = LT.second.getVectorElementCount();
2397  InstructionCost MemOpCost =
2398  getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
2399  {TTI::OK_AnyValue, TTI::OP_None}, I);
2400  // Add on an overhead cost for using gathers/scatters.
2401  // TODO: At the moment this is applied unilaterally for all CPUs, but at some
2402  // point we may want a per-CPU overhead.
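  // For example, with the default sve-gather-overhead of 10, a gather of
  // nxv4i32 is costed at roughly 10x a contiguous element load, further
  // scaled by the maximum number of elements of the legalized type.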
2403  MemOpCost *= getSVEGatherScatterOverhead(Opcode);
2404  return LT.first * MemOpCost * getMaxNumElements(LegalVF);
2405 }
2406 
2407 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
2408  return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
2409 }
2410 
2411 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
2412  MaybeAlign Alignment,
2413  unsigned AddressSpace,
2414  TTI::TargetCostKind CostKind,
2415  TTI::OperandValueInfo OpInfo,
2416  const Instruction *I) {
2417  EVT VT = TLI->getValueType(DL, Ty, true);
2418  // Type legalization can't handle structs
2419  if (VT == MVT::Other)
2420  return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
2421  CostKind);
2422 
2423  auto LT = getTypeLegalizationCost(Ty);
2424  if (!LT.first.isValid())
2425  return InstructionCost::getInvalid();
2426 
2427  // The code-generator is currently not able to handle scalable vectors
2428  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2429  // it. This change will be removed when code-generation for these types is
2430  // sufficiently reliable.
2431  if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
2432  if (VTy->getElementCount() == ElementCount::getScalable(1))
2433  return InstructionCost::getInvalid();
2434 
2435  // TODO: consider latency as well for TCK_SizeAndLatency.
2436  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
2437  return LT.first;
2438 
2439  if (CostKind != TTI::TCK_RecipThroughput)
2440  return 1;
2441 
2442  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
2443  LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
2444  // Unaligned stores are extremely inefficient. We don't split all
2445  // unaligned 128-bit stores because of the negative impact that doing so has
2446  // shown in practice on inlined block copy code.
2447  // We make such stores expensive so that we will only vectorize if there
2448  // are 6 other instructions getting vectorized.
2449  const int AmortizationCost = 6;
2450 
2451  return LT.first * 2 * AmortizationCost;
2452  }
2453 
2454  // Check truncating stores and extending loads.
2455  if (useNeonVector(Ty) &&
2456  Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
2457  // v4i8 types are lowered to a scalar load/store plus sshll/xtn.
2458  if (VT == MVT::v4i8)
2459  return 2;
2460  // Otherwise we need to scalarize.
2461  return cast<FixedVectorType>(Ty)->getNumElements() * 2;
2462  }
2463 
2464  return LT.first;
2465 }
2466 
2467 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
2468  unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
2469  Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
2470  bool UseMaskForCond, bool UseMaskForGaps) {
2471  assert(Factor >= 2 && "Invalid interleave factor");
2472  auto *VecVTy = cast<FixedVectorType>(VecTy);
2473 
2474  if (!UseMaskForCond && !UseMaskForGaps &&
2475  Factor <= TLI->getMaxSupportedInterleaveFactor()) {
2476  unsigned NumElts = VecVTy->getNumElements();
2477  auto *SubVecTy =
2478  FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
2479 
2480  // ldN/stN only support legal vector types of size 64 or 128 in bits.
2481  // Accesses having vector types that are a multiple of 128 bits can be
2482  // matched to more than one ldN/stN instruction.
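  // For example, a <16 x i32> access with Factor == 4 gives a v4i32 SubVecTy
  // (128 bits), which is expected to map to a single ld4/st4, so the returned
  // cost would be 4 * 1.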
2483  bool UseScalable;
2484  if (NumElts % Factor == 0 &&
2485  TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
2486  return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
2487  }
2488 
2489  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2490  Alignment, AddressSpace, CostKind,
2491  UseMaskForCond, UseMaskForGaps);
2492 }
2493 
2494 InstructionCost
2495 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
2496  InstructionCost Cost = 0;
2497  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2498  for (auto *I : Tys) {
2499  if (!I->isVectorTy())
2500  continue;
2501  if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
2502  128)
2503  Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
2504  getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
2505  }
2506  return Cost;
2507 }
2508 
2509 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
2510  return ST->getMaxInterleaveFactor();
2511 }
2512 
2513 // For Falkor, we want to avoid having too many strided loads in a loop since
2514 // that can exhaust the HW prefetcher resources. We adjust the unroller
2515 // MaxCount preference below to attempt to ensure unrolling doesn't create too
2516 // many strided loads.
2517 static void
2518 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2519  TargetTransformInfo::UnrollingPreferences &UP) {
2520  enum { MaxStridedLoads = 7 };
2521  auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
2522  int StridedLoads = 0;
2523  // FIXME? We could make this more precise by looking at the CFG and
2524  // e.g. not counting loads in each side of an if-then-else diamond.
2525  for (const auto BB : L->blocks()) {
2526  for (auto &I : *BB) {
2527  LoadInst *LMemI = dyn_cast<LoadInst>(&I);
2528  if (!LMemI)
2529  continue;
2530 
2531  Value *PtrValue = LMemI->getPointerOperand();
2532  if (L->isLoopInvariant(PtrValue))
2533  continue;
2534 
2535  const SCEV *LSCEV = SE.getSCEV(PtrValue);
2536  const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
2537  if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
2538  continue;
2539 
2540  // FIXME? We could take pairing of unrolled load copies into account
2541  // by looking at the AddRec, but we would probably have to limit this
2542  // to loops with no stores or other memory optimization barriers.
2543  ++StridedLoads;
2544  // We've seen enough strided loads that seeing more won't make a
2545  // difference.
2546  if (StridedLoads > MaxStridedLoads / 2)
2547  return StridedLoads;
2548  }
2549  }
2550  return StridedLoads;
2551  };
2552 
2553  int StridedLoads = countStridedLoads(L, SE);
2554  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
2555  << " strided loads\n");
2556  // Pick the largest power of 2 unroll count that won't result in too many
2557  // strided loads.
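  // For example, if 3 strided loads are detected, MaxCount becomes
  // 1 << Log2_32(7 / 3) = 2, so each load is duplicated at most twice.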
2558  if (StridedLoads) {
2559  UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
2560  LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
2561  << UP.MaxCount << '\n');
2562  }
2563 }
2564 
2565 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2566  TTI::UnrollingPreferences &UP,
2567  OptimizationRemarkEmitter *ORE) {
2568  // Enable partial unrolling and runtime unrolling.
2569  BaseT::getUnrollingPreferences(L, SE, UP, ORE);
2570 
2571  UP.UpperBound = true;
2572 
2573  // An inner loop is more likely to be hot, and the runtime check can be
2574  // hoisted out by the LICM pass, so the overhead is lower; try a larger
2575  // threshold to unroll more loops.
2576  if (L->getLoopDepth() > 1)
2577  UP.PartialThreshold *= 2;
2578 
2579  // Disable partial & runtime unrolling on -Os.
2580  UP.PartialOptSizeThreshold = 0;
2581 
2582  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
2583  EnableFalkorHWPFUnrollFix)
2584  getFalkorUnrollingPreferences(L, SE, UP);
2585 
2586  // Scan the loop: don't unroll loops with calls as this could prevent
2587  // inlining. Don't unroll vector loops either, as they don't benefit much from
2588  // unrolling.
2589  for (auto *BB : L->getBlocks()) {
2590  for (auto &I : *BB) {
2591  // Don't unroll vectorised loops.
2592  if (I.getType()->isVectorTy())
2593  return;
2594 
2595  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2596  if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2597  if (!isLoweredToCall(F))
2598  continue;
2599  }
2600  return;
2601  }
2602  }
2603  }
2604 
2605  // Enable runtime unrolling for in-order models.
2606  // If -mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so
2607  // by checking for that case we can ensure that the default behaviour is
2608  // unchanged.
2609  if (ST->getProcFamily() != AArch64Subtarget::Others &&
2610  !ST->getSchedModel().isOutOfOrder()) {
2611  UP.Runtime = true;
2612  UP.Partial = true;
2613  UP.UnrollRemainder = true;
2614  UP.DefaultUnrollRuntimeCount = 4;
2615 
2616  UP.UnrollAndJam = true;
2617  UP.UnrollAndJamInnerLoopThreshold = 60;
2618  }
2619 }
2620 
2621 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2622  TTI::PeelingPreferences &PP) {
2623  BaseT::getPeelingPreferences(L, SE, PP);
2624 }
2625 
2626 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
2627  Type *ExpectedType) {
2628  switch (Inst->getIntrinsicID()) {
2629  default:
2630  return nullptr;
2631  case Intrinsic::aarch64_neon_st2:
2632  case Intrinsic::aarch64_neon_st3:
2633  case Intrinsic::aarch64_neon_st4: {
2634  // Create a struct type
2635  StructType *ST = dyn_cast<StructType>(ExpectedType);
2636  if (!ST)
2637  return nullptr;
2638  unsigned NumElts = Inst->arg_size() - 1;
2639  if (ST->getNumElements() != NumElts)
2640  return nullptr;
2641  for (unsigned i = 0, e = NumElts; i != e; ++i) {
2642  if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
2643  return nullptr;
2644  }
2645  Value *Res = PoisonValue::get(ExpectedType);
2646  IRBuilder<> Builder(Inst);
2647  for (unsigned i = 0, e = NumElts; i != e; ++i) {
2648  Value *L = Inst->getArgOperand(i);
2649  Res = Builder.CreateInsertValue(Res, L, i);
2650  }
2651  return Res;
2652  }
2653  case Intrinsic::aarch64_neon_ld2:
2654  case Intrinsic::aarch64_neon_ld3:
2655  case Intrinsic::aarch64_neon_ld4:
2656  if (Inst->getType() == ExpectedType)
2657  return Inst;
2658  return nullptr;
2659  }
2660 }
2661 
2662 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2663  MemIntrinsicInfo &Info) {
2664  switch (Inst->getIntrinsicID()) {
2665  default:
2666  break;
2667  case Intrinsic::aarch64_neon_ld2:
2668  case Intrinsic::aarch64_neon_ld3:
2669  case Intrinsic::aarch64_neon_ld4:
2670  Info.ReadMem = true;
2671  Info.WriteMem = false;
2672  Info.PtrVal = Inst->getArgOperand(0);
2673  break;
2674  case Intrinsic::aarch64_neon_st2:
2675  case Intrinsic::aarch64_neon_st3:
2676  case Intrinsic::aarch64_neon_st4:
2677  Info.ReadMem = false;
2678  Info.WriteMem = true;
2679  Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
2680  break;
2681  }
2682 
2683  switch (Inst->getIntrinsicID()) {
2684  default:
2685  return false;
2686  case Intrinsic::aarch64_neon_ld2:
2687  case Intrinsic::aarch64_neon_st2:
2688  Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
2689  break;
2690  case Intrinsic::aarch64_neon_ld3:
2691  case Intrinsic::aarch64_neon_st3:
2692  Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
2693  break;
2694  case Intrinsic::aarch64_neon_ld4:
2695  case Intrinsic::aarch64_neon_st4:
2696  Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
2697  break;
2698  }
2699  return true;
2700 }
2701 
2702 /// See if \p I should be considered for address type promotion. We check if \p
2703 /// I is a sext with the right type and used in memory accesses. If it is used in a
2704 /// "complex" getelementptr, we allow it to be promoted without finding other
2705 /// sext instructions that sign extended the same initial value. A getelementptr
2706 /// is considered as "complex" if it has more than 2 operands.
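/// For example, a GEP such as getelementptr i32, ptr %base, i64 %i, where %i
/// comes from a sext of an i32 induction variable, has only 2 operands and so
/// still needs a matching sext elsewhere, whereas a multi-index struct/array
/// GEP (3 or more operands) is allowed to be promoted on its own.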
2707 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
2708  const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2709  bool Considerable = false;
2710  AllowPromotionWithoutCommonHeader = false;
2711  if (!isa<SExtInst>(&I))
2712  return false;
2713  Type *ConsideredSExtType =
2714  Type::getInt64Ty(I.getParent()->getParent()->getContext());
2715  if (I.getType() != ConsideredSExtType)
2716  return false;
2717  // See if the sext is the one with the right type and used in at least one
2718  // GetElementPtrInst.
2719  for (const User *U : I.users()) {
2720  if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2721  Considerable = true;
2722  // A getelementptr is considered as "complex" if it has more than 2
2723  // operands. We will promote a SExt used in such complex GEP as we
2724  // expect some computation to be merged if they are done on 64 bits.
2725  if (GEPInst->getNumOperands() > 2) {
2726  AllowPromotionWithoutCommonHeader = true;
2727  break;
2728  }
2729  }
2730  }
2731  return Considerable;
2732 }
2733 
2734 bool AArch64TTIImpl::isLegalToVectorizeReduction(
2735  const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
2736  if (!VF.isScalable())
2737  return true;
2738 
2739  Type *Ty = RdxDesc.getRecurrenceType();
2740  if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
2741  return false;
2742 
2743  switch (RdxDesc.getRecurrenceKind()) {
2744  case RecurKind::Add:
2745  case RecurKind::FAdd:
2746  case RecurKind::And:
2747  case RecurKind::Or:
2748  case RecurKind::Xor:
2749  case RecurKind::SMin:
2750  case RecurKind::SMax:
2751  case RecurKind::UMin:
2752  case RecurKind::UMax:
2753  case RecurKind::FMin:
2754  case RecurKind::FMax:
2755  case RecurKind::SelectICmp:
2756  case RecurKind::SelectFCmp:
2757  case RecurKind::FMulAdd:
2758  return true;
2759  default:
2760  return false;
2761  }
2762 }
2763 
2764 InstructionCost
2765 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2766  bool IsUnsigned,
2767  TTI::TargetCostKind CostKind) {
2768  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2769 
2770  if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
2771  return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
2772 
2773  assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
2774  "Both vectors need to be equally scalable");
2775 
2776  InstructionCost LegalizationCost = 0;
2777  if (LT.first > 1) {
2778  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2779  unsigned MinMaxOpcode =
2780  Ty->isFPOrFPVectorTy()
2782  : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
2783  IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
2784  LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
2785  }
2786 
2787  return LegalizationCost + /*Cost of horizontal reduction*/ 2;
2788 }
2789 
2791  unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
2792  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2793  InstructionCost LegalizationCost = 0;
2794  if (LT.first > 1) {
2795  Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
2796  LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
2797  LegalizationCost *= LT.first - 1;
2798  }
2799 
2800  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2801  assert(ISD && "Invalid opcode");
2802  // Add the final reduction cost for the legal horizontal reduction
2803  switch (ISD) {
2804  case ISD::ADD:
2805  case ISD::AND:
2806  case ISD::OR:
2807  case ISD::XOR:
2808  case ISD::FADD:
2809  return LegalizationCost + 2;
2810  default:
2811  return InstructionCost::getInvalid();
2812  }
2813 }
2814 
2815 InstructionCost
2816 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
2817  std::optional<FastMathFlags> FMF,
2818  TTI::TargetCostKind CostKind) {
2819  if (TTI::requiresOrderedReduction(FMF)) {
2820  if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
2821  InstructionCost BaseCost =
2822  BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2823  // Add on extra cost to reflect the extra overhead on some CPUs. We still
2824  // end up vectorizing for more computationally intensive loops.
2825  return BaseCost + FixedVTy->getNumElements();
2826  }
2827 
2828  if (Opcode != Instruction::FAdd)
2829  return InstructionCost::getInvalid();
2830 
2831  auto *VTy = cast<ScalableVectorType>(ValTy);
2832  InstructionCost Cost =
2833  getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
2834  Cost *= getMaxNumElements(VTy->getElementCount());
2835  return Cost;
2836  }
2837 
2838  if (isa<ScalableVectorType>(ValTy))
2839  return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
2840 
2841  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2842  MVT MTy = LT.second;
2843  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2844  assert(ISD && "Invalid opcode");
2845 
2846  // Horizontal adds can use the 'addv' instruction. We model the cost of these
2847  // instructions as twice a normal vector add, plus 1 for each legalization
2848  // step (LT.first). This is the only arithmetic vector reduction operation for
2849  // which we have an instruction.
2850  // OR, XOR and AND costs should match the codegen from:
2851  // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
2852  // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
2853  // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
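  // For example, a v8i16 add reduction costs 2 (one addv, modelled as two
  // vector adds), while a v16i16 reduction on a 128-bit target legalizes in
  // two steps and costs (2 - 1) + 2 = 3.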
2854  static const CostTblEntry CostTblNoPairwise[]{
2855  {ISD::ADD, MVT::v8i8, 2},
2856  {ISD::ADD, MVT::v16i8, 2},
2857  {ISD::ADD, MVT::v4i16, 2},
2858  {ISD::ADD, MVT::v8i16, 2},
2859  {ISD::ADD, MVT::v4i32, 2},
2860  {ISD::ADD, MVT::v2i64, 2},
2861  {ISD::OR, MVT::v8i8, 15},
2862  {ISD::OR, MVT::v16i8, 17},
2863  {ISD::OR, MVT::v4i16, 7},
2864  {ISD::OR, MVT::v8i16, 9},
2865  {ISD::OR, MVT::v2i32, 3},
2866  {ISD::OR, MVT::v4i32, 5},
2867  {ISD::OR, MVT::v2i64, 3},
2868  {ISD::XOR, MVT::v8i8, 15},
2869  {ISD::XOR, MVT::v16i8, 17},
2870  {ISD::XOR, MVT::v4i16, 7},
2871  {ISD::XOR, MVT::v8i16, 9},
2872  {ISD::XOR, MVT::v2i32, 3},
2873  {ISD::XOR, MVT::v4i32, 5},
2874  {ISD::XOR, MVT::v2i64, 3},
2875  {ISD::AND, MVT::v8i8, 15},
2876  {ISD::AND, MVT::v16i8, 17},
2877  {ISD::AND, MVT::v4i16, 7},
2878  {ISD::AND, MVT::v8i16, 9},
2879  {ISD::AND, MVT::v2i32, 3},
2880  {ISD::AND, MVT::v4i32, 5},
2881  {ISD::AND, MVT::v2i64, 3},
2882  };
2883  switch (ISD) {
2884  default:
2885  break;
2886  case ISD::ADD:
2887  if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
2888  return (LT.first - 1) + Entry->Cost;
2889  break;
2890  case ISD::XOR:
2891  case ISD::AND:
2892  case ISD::OR:
2893  const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
2894  if (!Entry)
2895  break;
2896  auto *ValVTy = cast<FixedVectorType>(ValTy);
2897  if (!ValVTy->getElementType()->isIntegerTy(1) &&
2898  MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
2899  isPowerOf2_32(ValVTy->getNumElements())) {
2900  InstructionCost ExtraCost = 0;
2901  if (LT.first != 1) {
2902  // Type needs to be split, so there is an extra cost of LT.first - 1
2903  // arithmetic ops.
2904  auto *Ty = FixedVectorType::get(ValTy->getElementType(),
2905  MTy.getVectorNumElements());
2906  ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
2907  ExtraCost *= LT.first - 1;
2908  }
2909  return Entry->Cost + ExtraCost;
2910  }
2911  break;
2912  }
2913  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2914 }
2915 
2916 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
2917  static const CostTblEntry ShuffleTbl[] = {
2918  { TTI::SK_Splice, MVT::nxv16i8, 1 },
2919  { TTI::SK_Splice, MVT::nxv8i16, 1 },
2920  { TTI::SK_Splice, MVT::nxv4i32, 1 },
2921  { TTI::SK_Splice, MVT::nxv2i64, 1 },
2922  { TTI::SK_Splice, MVT::nxv2f16, 1 },
2923  { TTI::SK_Splice, MVT::nxv4f16, 1 },
2924  { TTI::SK_Splice, MVT::nxv8f16, 1 },
2925  { TTI::SK_Splice, MVT::nxv2bf16, 1 },
2926  { TTI::SK_Splice, MVT::nxv4bf16, 1 },
2927  { TTI::SK_Splice, MVT::nxv8bf16, 1 },
2928  { TTI::SK_Splice, MVT::nxv2f32, 1 },
2929  { TTI::SK_Splice, MVT::nxv4f32, 1 },
2930  { TTI::SK_Splice, MVT::nxv2f64, 1 },
2931  };
2932 
2933  // The code-generator is currently not able to handle scalable vectors
2934  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2935  // it. This change will be removed when code-generation for these types is
2936  // sufficiently reliable.
2937  if (Tp->getElementCount() == ElementCount::getScalable(1))
2938  return InstructionCost::getInvalid();
2939 
2940  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2941  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
2942  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2943  EVT PromotedVT = LT.second.getScalarType() == MVT::i1
2944  ? TLI->getPromotedVTForPredicate(EVT(LT.second))
2945  : LT.second;
2946  Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
2947  InstructionCost LegalizationCost = 0;
2948  if (Index < 0) {
2949  LegalizationCost =
2950  getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
2951  CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2952  getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
2953  CmpInst::BAD_ICMP_PREDICATE, CostKind);
2954  }
2955 
2956  // Predicated splices are promoted during lowering; see AArch64ISelLowering.cpp.
2957  // The cost is computed on the promoted type.
2958  if (LT.second.getScalarType() == MVT::i1) {
2959  LegalizationCost +=
2960  getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
2961  TTI::CastContextHint::None, CostKind) +
2962  getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
2963  TTI::CastContextHint::None, CostKind);
2964  }
2965  const auto *Entry =
2966  CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
2967  assert(Entry && "Illegal Type for Splice");
2968  LegalizationCost += Entry->Cost;
2969  return LegalizationCost * LT.first;
2970 }
2971 
2972 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
2973  VectorType *Tp,
2974  ArrayRef<int> Mask,
2975  TTI::TargetCostKind CostKind,
2976  int Index, VectorType *SubTp,
2977  ArrayRef<const Value *> Args) {
2978  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2979  // If we have a Mask, and the LT is being legalized somehow, split the Mask
2980  // into smaller vectors and sum the cost of each shuffle.
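  // For example, a shuffle of <32 x i8> on a 128-bit target is costed as a
  // series of v16i8 sub-shuffles, each recosted below on its own sub-mask.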
2981  if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
2982  Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
2983  cast<FixedVectorType>(Tp)->getNumElements() >
2984  LT.second.getVectorNumElements() &&
2985  !Index && !SubTp) {
2986  unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
2987  assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
2988  unsigned LTNumElts = LT.second.getVectorNumElements();
2989  unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
2990  VectorType *NTp =
2991  VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
2992  InstructionCost Cost;
2993  for (unsigned N = 0; N < NumVecs; N++) {
2994  SmallVector<int> NMask;
2995  // Split the existing mask into chunks of size LTNumElts. Track the source
2996  // sub-vectors to ensure the result has at most 2 inputs.
2997  unsigned Source1, Source2;
2998  unsigned NumSources = 0;
2999  for (unsigned E = 0; E < LTNumElts; E++) {
3000  int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
3001  : UndefMaskElem;
3002  if (MaskElt < 0) {
3003  NMask.push_back(UndefMaskElem);
3004  continue;
3005  }
3006 
3007  // Calculate which source from the input this comes from and whether it
3008  // is new to us.
3009  unsigned Source = MaskElt / LTNumElts;
3010  if (NumSources == 0) {
3011  Source1 = Source;
3012  NumSources = 1;
3013  } else if (NumSources == 1 && Source != Source1) {
3014  Source2 = Source;
3015  NumSources = 2;
3016  } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
3017  NumSources++;
3018  }
3019 
3020  // Add to the new mask. For the NumSources>2 case these are not correct,
3021  // but are only used for the modular lane number.
3022  if (Source == Source1)
3023  NMask.push_back(MaskElt % LTNumElts);
3024  else if (Source == Source2)
3025  NMask.push_back(MaskElt % LTNumElts + LTNumElts);
3026  else
3027  NMask.push_back(MaskElt % LTNumElts);
3028  }
3029  // If the sub-mask has at most 2 input sub-vectors then re-cost it using
3030  // getShuffleCost. If not then cost it using the worst case.
3031  if (NumSources <= 2)
3032  Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
3034  NTp, NMask, CostKind, 0, nullptr, Args);
3035  else if (any_of(enumerate(NMask), [&](const auto &ME) {
3036  return ME.value() % LTNumElts == ME.index();
3037  }))
3038  Cost += LTNumElts - 1;
3039  else
3040  Cost += LTNumElts;
3041  }
3042  return Cost;
3043  }
3044 
3045  Kind = improveShuffleKindFromMask(Kind, Mask);
3046 
3047  // Check for broadcast loads.
3048  if (Kind == TTI::SK_Broadcast) {
3049  bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
3050  if (IsLoad && LT.second.isVector() &&
3051  isLegalBroadcastLoad(Tp->getElementType(),
3052  LT.second.getVectorElementCount()))
3053  return 0; // broadcast is handled by ld1r
3054  }
3055 
3056  // If we have 4 elements for the shuffle and a Mask, get the cost straight
3057  // from the perfect shuffle tables.
3058  if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
3059  (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
3060  all_of(Mask, [](int E) { return E < 8; }))
3061  return getPerfectShuffleCost(Mask);
3062 
3063  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
3064  Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
3065  Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
3066  static const CostTblEntry ShuffleTbl[] = {
3067  // Broadcast shuffle kinds can be performed with 'dup'.
3078  // Transpose shuffle kinds can be performed with 'trn1/trn2' and
3079  // 'zip1/zip2' instructions.
3090  // Select shuffle kinds.
3091  // TODO: handle vXi8/vXi16.
3092  {TTI::SK_Select, MVT::v2i32, 1}, // mov.
3093  {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
3094  {TTI::SK_Select, MVT::v2i64, 1}, // mov.
3095  {TTI::SK_Select, MVT::v2f32, 1}, // mov.
3096  {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
3097  {TTI::SK_Select, MVT::v2f64, 1}, // mov.
3098  // PermuteSingleSrc shuffle kinds.
3099  {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
3100  {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
3101  {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
3102  {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
3103  {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
3104  {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
3105  {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
3106  {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
3107  {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
3108  {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
3109  {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
3110  {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
3111  {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
3112  {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
3113  // Reverse can be lowered with `rev`.
3114  {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
3115  {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
3116  {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
3117  {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
3118  {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
3119  {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
3120  {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
3121  {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
3122  {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
3123  {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
3124  {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
3125  {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
3126  // Splice can all be lowered as `ext`.
3127  {TTI::SK_Splice, MVT::v2i32, 1},
3128  {TTI::SK_Splice, MVT::v4i32, 1},
3129  {TTI::SK_Splice, MVT::v2i64, 1},
3130  {TTI::SK_Splice, MVT::v2f32, 1},
3131  {TTI::SK_Splice, MVT::v4f32, 1},
3132  {TTI::SK_Splice, MVT::v2f64, 1},
3133  {TTI::SK_Splice, MVT::v8f16, 1},
3135  {TTI::SK_Splice, MVT::v8i16, 1},
3136  {TTI::SK_Splice, MVT::v16i8, 1},
3138  {TTI::SK_Splice, MVT::v4f16, 1},
3139  {TTI::SK_Splice, MVT::v4i16, 1},
3140  {TTI::SK_Splice, MVT::v8i8, 1},
3141  // Broadcast shuffle kinds for scalable vectors
3159  // Handle the cases for vector.reverse with scalable vectors
3177  };
3178  if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
3179  return LT.first * Entry->Cost;
3180  }
3181 
3182  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
3183  return getSpliceCost(Tp, Index);
3184 
3185  // Inserting a subvector can often be done with either a D, S or H register
3186  // move, so long as the inserted vector is "aligned".
3187  if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
3188  LT.second.getSizeInBits() <= 128 && SubTp) {
3189  std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
3190  if (SubLT.second.isVector()) {
3191  int NumElts = LT.second.getVectorNumElements();
3192  int NumSubElts = SubLT.second.getVectorNumElements();
3193  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
3194  return SubLT.first;
3195  }
3196  }
3197 
3198  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
3199 }
3200 
3201 bool AArch64TTIImpl::preferPredicateOverEpilogue(
3202  Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
3203  TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
3204  InterleavedAccessInfo *IAI) {
3205  if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
3206  return false;
3207 
3208  // We don't currently support vectorisation with interleaving for SVE - with
3209  // such loops we're better off not using tail-folding. This gives us a chance
3210  // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
3211  if (IAI->hasGroups())
3212  return false;
3213 
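  // For example, with -sve-tail-folding=reductions a loop whose only special
  // construct is a reduction is tail-folded, while one that also contains a
  // fixed-order recurrence is not, because TFRecurrences is not enabled.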
3214  TailFoldingKind Required; // Defaults to 0.
3215  if (LVL->getReductionVars().size())
3216  Required.add(TailFoldingKind::TFReductions);
3217  if (LVL->getFixedOrderRecurrences().size())
3218  Required.add(TailFoldingKind::TFRecurrences);
3219  if (!Required)
3220  Required.add(TailFoldingKind::TFSimple);
3221 
3222  return (TailFoldingKindLoc & Required) == Required;
3223 }
3224 
3225 InstructionCost AArch64TTIImpl::getScalingFactorCost(Type *Ty,
3226  GlobalValue *BaseGV,
3227  int64_t BaseOffset, bool HasBaseReg,
3228  int64_t Scale, unsigned AddrSpace) const {
3229  // Scaling factors are not free at all.
3230  // Operands | Rt Latency
3231  // -------------------------------------------
3232  // Rt, [Xn, Xm] | 4
3233  // -------------------------------------------
3234  // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
3235  // Rt, [Xn, Wm, <extend> #imm] |
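  // For example, an access like ldr x0, [x1, x2, lsl #3] uses Scale == 8 and
  // is costed as 1 extra micro-op below, whereas ldr x0, [x1, x2] (Scale == 1)
  // is treated as free.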
3236  TargetLoweringBase::AddrMode AM;
3237  AM.BaseGV = BaseGV;
3238  AM.BaseOffs = BaseOffset;
3239  AM.HasBaseReg = HasBaseReg;
3240  AM.Scale = Scale;
3241  if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
3242  // Scale represents reg2 * scale, thus account for 1 if
3243  // it is not equal to 0 or 1.
3244  return AM.Scale != 0 && AM.Scale != 1;
3245  return -1;
3246 }
i
i
Definition: README.txt:29
llvm::AArch64TTIImpl::getIntImmCostIntrin
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:262
llvm::MVT::nxv4i1
@ nxv4i1
Definition: MachineValueType.h:208
llvm::InstructionCost
Definition: InstructionCost.h:30
llvm::TargetTransformInfo::PSK_FastHardware
@ PSK_FastHardware
Definition: TargetTransformInfo.h:585
TailFoldingKind::operator=
void operator=(const std::string &Val)
Definition: AArch64TargetTransformInfo.cpp:55
Int32Ty
IntegerType * Int32Ty
Definition: NVVMIntrRange.cpp:67
llvm::AArch64TTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: AArch64TargetTransformInfo.cpp:2565
llvm::MVT::nxv4i64
@ nxv4i64
Definition: MachineValueType.h:238
llvm::TargetTransformInfo::UnrollingPreferences::PartialOptSizeThreshold
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
Definition: TargetTransformInfo.h:446
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition: TargetTransformInfo.h:892
Attrs
Function Attrs
Definition: README_ALTIVEC.txt:215
TailFoldingKind
Definition: AArch64TargetTransformInfo.cpp:42
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:474
llvm::MVT::nxv2i1
@ nxv2i1
Definition: MachineValueType.h:207
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:218
llvm::Loop::isLoopInvariant
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
llvm::LoopVectorizationLegality::getReductionVars
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Definition: LoopVectorizationLegality.h:291
llvm::MVT::v4f16
@ v4f16
Definition: MachineValueType.h:149
llvm::PatternMatch::m_NonNegative
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
Definition: PatternMatch.h:485
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:442
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
llvm::ConvertCostTableLookup
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType >> Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
llvm::Type::getInt1Ty
static IntegerType * getInt1Ty(LLVMContext &C)
Definition: Type.cpp:236
llvm::CostTblEntryT
Cost Table Entry.
Definition: CostTable.h:25
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:69
getFalkorUnrollingPreferences
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
Definition: AArch64TargetTransformInfo.cpp:2518
llvm::Value::getPointerAlignment
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:918
llvm::MVT::nxv2f64
@ nxv2f64
Definition: MachineValueType.h:267
llvm::AArch64TTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: AArch64TargetTransformInfo.cpp:2076
llvm::RecurKind::Or
@ Or
Bitwise or logical OR of integers.
llvm::DataLayout
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:667
InstCombiner.h
llvm::AArch64_AM::LSL
@ LSL
Definition: AArch64AddressingModes.h:35
llvm::ISD::BITCAST
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:886
llvm::RecurKind::FMul
@ FMul
Product of floats.
Insert
Vector Rotate Left Mask Mask Insert
Definition: README_P9.txt:112
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:719
PHI
Rewrite undef for PHI
Definition: AMDGPURewriteUndefForPHI.cpp:101
llvm::SCEVAddRecExpr::isAffine
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
Definition: ScalarEvolutionExpressions.h:370
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:458
llvm::AArch64TTIImpl::getIntImmCost
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
Definition: AArch64TargetTransformInfo.cpp:153
IntrinsicInst.h
instCombineConvertFromSVBool
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:625
llvm::ElementCount
Definition: TypeSize.h:404
llvm::AArch64TTIImpl::instCombineIntrinsic
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: AArch64TargetTransformInfo.cpp:1431
llvm::Function
Definition: Function.h:60
llvm::cl::location
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:466
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
P
Definition: README-SSE.txt:411
llvm::BinaryOperator::CreateWithCopiedFlags
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Instruction *CopyO, const Twine &Name="", Instruction *InsertBefore=nullptr)
Definition: InstrTypes.h:249
llvm::ISD::UDIV
@ UDIV
Definition: ISDOpcodes.h:243
instCombineSVEVectorMul
static std::optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1176
llvm::TargetTransformInfo::PopcntSupportKind
PopcntSupportKind
Flags indicating the kind of support for population count.
Definition: TargetTransformInfo.h:585
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
llvm::MVT::nxv2f32
@ nxv2f32
Definition: MachineValueType.h:261
llvm::APInt::isPowerOf2
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:425
llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1270
llvm::PatternMatch::m_FPOne
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:818
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:328
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:133
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1199
llvm::MVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition: MachineValueType.h:386
llvm::enumerate
detail::enumerator< R > enumerate(R &&TheRange)
Given an input range, returns a new range whose values are pairs (A, B) such that A is the 0-based ...
Definition: STLExtras.h:2263
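A short sketch of the usual iteration pattern; the Mask vector here is made up for illustration:
  SmallVector<int, 4> Mask = {0, 2, 1, 3};
  for (const auto &P : enumerate(Mask)) {
    unsigned Idx = P.index(); // 0-based position in the range
    int Elt = P.value();      // element at that position
    (void)Idx; (void)Elt;
  }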
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1498
instCombineSVETBL
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1245
llvm::getSplatValue
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
Definition: VectorUtils.cpp:371
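For illustration, with V an assumed vector Value*, the common shape of a splat check:
  if (Value *SplatVal = getSplatValue(V)) {
    // Every lane of V holds SplatVal; the operation can be reasoned about
    // against the single scalar value instead of the whole vector.
  }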
llvm::IRBuilder<>
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:153
llvm::AArch64TTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: AArch64TargetTransformInfo.cpp:1532
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
EnableScalableAutovecInStreamingMode
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition: TargetTransformInfo.h:967
llvm::LoopVectorizationLegality
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
Definition: LoopVectorizationLegality.h:241
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJamInnerLoopThreshold
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
Definition: TargetTransformInfo.h:493
llvm::MVT::nxv2i64
@ nxv2i64
Definition: MachineValueType.h:237
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
llvm::TargetTransformInfo::UnrollingPreferences::UnrollRemainder
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
Definition: TargetTransformInfo.h:486
llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:819
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:198
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:221
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
llvm::InsertElementInst::Create
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Definition: Instructions.h:1948
TailFoldingKind::add
void add(uint8_t Flag)
Definition: AArch64TargetTransformInfo.cpp:89
Shift
bool Shift
Definition: README.txt:468
llvm::ExtractElementInst::Create
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Definition: Instructions.h:1883
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:470
llvm::MVT::nxv4f16
@ nxv4f16
Definition: MachineValueType.h:248
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:529
llvm::SISrcMods::NEG
@ NEG
Definition: SIDefines.h:222
llvm::tgtok::Bits
@ Bits
Definition: TGLexer.h:50
llvm::AArch64TTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: AArch64TargetTransformInfo.cpp:2086
getPerfectShuffleCost
static unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
Definition: AArch64PerfectShuffle.h:6589
instCombineSVEDupX
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:704
instCombineSVELast
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:817
getSVEGatherScatterOverhead
static unsigned getSVEGatherScatterOverhead(unsigned Opcode)
Definition: AArch64TargetTransformInfo.cpp:2373
llvm::RecurKind::SelectFCmp
@ SelectFCmp
Integer select(fcmp(),x,y) where one of (x,y) is loop invariant.
llvm::PatternMatch::m_BinOp
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:84
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::MVT::nxv4f64
@ nxv4f64
Definition: MachineValueType.h:268
llvm::VectorType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:422
getBitWidth
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Definition: ValueTracking.cpp:100
llvm::max
Expected< ExpressionValue > max(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:337
llvm::errs
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
Definition: raw_ostream.cpp:891
llvm::Instruction::copyMetadata
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Definition: Instruction.cpp:878
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:119
SVEScatterOverhead
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
RHS
Value * RHS
Definition: X86PartialReduction.cpp:76
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:261
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:458
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:21
llvm::RecurKind::SMin
@ SMin
Signed integer min implemented in terms of select(cmp()).
llvm::MVT::v2f64
@ v2f64
Definition: MachineValueType.h:190
llvm::TargetTransformInfo::OperandValueInfo
Definition: TargetTransformInfo.h:926
llvm::SMEAttrs::requiresSMChange
std::optional< bool > requiresSMChange(const SMEAttrs &Callee, bool BodyOverridesInterface=false) const
Definition: AArch64SMEAttributes.cpp:53
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::CmpInst::FCMP_OGT
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:723
llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition: TargetTransformInfo.h:900
llvm::MVT::nxv8i16
@ nxv8i16
Definition: MachineValueType.h:225
llvm::LinearPolySize::isScalable
bool isScalable() const
Returns whether the size is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:298
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:239
llvm::APIntOps::umin
const APInt & umin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be unsigned.
Definition: APInt.h:2157
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:890
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::AArch64TTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: AArch64TargetTransformInfo.cpp:122
llvm::SMEAttrs
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
Definition: AArch64SMEAttributes.h:24
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:129
llvm::UndefMaskElem
constexpr int UndefMaskElem
Definition: Instructions.h:1994
llvm::MVT::nxv8bf16
@ nxv8bf16
Definition: MachineValueType.h:256
llvm::AArch64TTIImpl::getExtractWithExtendCost
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
Definition: AArch64TargetTransformInfo.cpp:1965
llvm::PatternMatch::m_OneUse
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::SMEAttrs::hasNewZAInterface
bool hasNewZAInterface() const
Definition: AArch64SMEAttributes.h:75
llvm::MVT::v4bf16
@ v4bf16
Definition: MachineValueType.h:160
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:187
llvm::AArch64TTIImpl::prefersVectorizedAddressing
bool prefersVectorizedAddressing() const
Definition: AArch64TargetTransformInfo.cpp:2348
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
LHS
Value * LHS
Definition: X86PartialReduction.cpp:75
TargetLowering.h
llvm::IntrinsicCostAttributes::getArgTypes
const SmallVectorImpl< Type * > & getArgTypes() const
Definition: TargetTransformInfo.h:157
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::MVT::i1
@ i1
Definition: MachineValueType.h:43
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1734
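A small sketch of the range-based form; ShuffleMask is a hypothetical mask vector:
  SmallVector<int, 4> ShuffleMask = {0, 1, 2, 3};
  bool NoUndefLanes = all_of(ShuffleMask, [](int M) { return M >= 0; });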
llvm::TargetTransformInfo::SK_PermuteTwoSrc
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
Definition: TargetTransformInfo.h:898
llvm::MVT::v8f16
@ v8f16
Definition: MachineValueType.h:150
llvm::isSplatValue
bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
Definition: VectorUtils.cpp:386
llvm::MVT::nxv2bf16
@ nxv2bf16
Definition: MachineValueType.h:254
llvm::RecurKind::And
@ And
Bitwise or logical AND of integers.
llvm::TypeConversionCostTblEntryT
Type Conversion Cost Table.
Definition: CostTable.h:55
instCombineSVEUnpack
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1223
llvm::PatternMatch::m_Select
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
Definition: PatternMatch.h:1462
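A brief sketch of matching with this pattern, where V is an assumed Value*:
  Value *X = nullptr, *Y = nullptr;
  if (match(V, m_Select(m_Cmp(), m_Value(X), m_Value(Y)))) {
    // V is a select whose condition is a compare; X and Y are its two arms.
  }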
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:713
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:763
InlinePriorityMode::Cost
@ Cost
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:889
llvm::AArch64_AM::isLogicalImmediate
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
Definition: AArch64AddressingModes.h:276
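For example, a repeating 0xFF00 pattern is a rotated run of ones within a 16-bit element, so it should encode as a 64-bit logical immediate:
  bool Encodable =
      AArch64_AM::isLogicalImmediate(0xFF00FF00FF00FF00ULL, /*regSize=*/64); // expected true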
llvm::AArch64TTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:2816
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::AArch64TTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:2411
SVEGatherOverhead
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1142
llvm::User
Definition: User.h:44
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:34
Intrinsics.h
llvm::TargetLoweringBase::AddrMode::HasBaseReg
bool HasBaseReg
Definition: TargetLowering.h:2591
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:58
llvm::cl::Required
@ Required
Definition: CommandLine.h:117
llvm::MVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: MachineValueType.h:1138
llvm::AArch64TTIImpl::getCostOfKeepingLiveOverCall
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
Definition: AArch64TargetTransformInfo.cpp:2495
llvm::RecurrenceDescriptor::getRecurrenceType
Type * getRecurrenceType() const
Returns the type of the recurrence.
Definition: IVDescriptors.h:245
instCombineSVECntElts
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
Definition: AArch64TargetTransformInfo.cpp:955
llvm::AArch64Subtarget::Others
@ Others
Definition: AArch64Subtarget.h:41
llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:769
llvm::ISD::SRA
@ SRA
Definition: ISDOpcodes.h:692
instCombineMaxMinNM
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1385
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:195
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:246
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::MVT::v16i1
@ v16i1
Definition: MachineValueType.h:70
llvm::MVT::nxv2f16
@ nxv2f16
Definition: MachineValueType.h:247
llvm::VectorType::getElementCount
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:627
llvm::dwarf::Index
Index
Definition: Dwarf.h:472
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
llvm::Log2_32
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:547
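These two helpers are commonly paired, for example when turning a division by a constant into a shift; the value 8 below is illustrative:
  unsigned Divisor = 8;
  if (isPowerOf2_32(Divisor)) {
    unsigned ShiftAmt = Log2_32(Divisor); // 3
    (void)ShiftAmt;
  }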
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
instCombineSVEVectorFMLA
static std::optional< Instruction * > instCombineSVEVectorFMLA(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1049
llvm::MVT::v8i1
@ v8i1
Definition: MachineValueType.h:69
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJam
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
Definition: TargetTransformInfo.h:488
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:188
llvm::MVT::v4f64
@ v4f64
Definition: MachineValueType.h:192
llvm::AArch64TTIImpl::getIntImmCostInst
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: AArch64TargetTransformInfo.cpp:193
llvm::Instruction
Definition: Instruction.h:42
llvm::MCID::Flag
Flag
These should be considered private to the implementation of the MCInstrDesc class.
Definition: MCInstrDesc.h:147
llvm::InterleavedAccessInfo
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:759
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::RecurrenceDescriptor::getRecurrenceKind
RecurKind getRecurrenceKind() const
Definition: IVDescriptors.h:198
llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:773
llvm::InstCombiner::eraseInstFromFunction
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
llvm::AArch64TTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:1630
Options
const char LLVMTargetMachineRef LLVMPassBuilderOptionsRef Options
Definition: PassBuilderBindings.cpp:48
llvm::MVT::v16f16
@ v16f16
Definition: MachineValueType.h:151
llvm::AArch64TTIImpl::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:2765
llvm::MVT::nxv4i8
@ nxv4i8
Definition: MachineValueType.h:216
llvm::MVT::nxv4f32
@ nxv4f32
Definition: MachineValueType.h:262
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:879
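A minimal sketch of building a <4 x i32> splat constant this way; Ctx is an assumed LLVMContext:
  Type *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  Constant *SplatSeven = ConstantInt::get(VecTy, 7); // i32 7 splat across all four lanes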
llvm::CmpInst::FCMP_OEQ
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:722
instCombineLD1GatherIndex
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1286
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
llvm::CmpInst::FCMP_OLT
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:725
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:666
Align
uint64_t Align
Definition: ELFObjHandler.cpp:82
PatternMatch.h
llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition: TargetTransformInfo.h:967
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
llvm::MVT::v1i64
@ v1i64
Definition: MachineValueType.h:130
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::MCID::Call
@ Call
Definition: MCInstrDesc.h:155
llvm::TargetTransformInfo::OperandValueInfo::isUniform
bool isUniform() const
Definition: TargetTransformInfo.h:933
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
IVDescriptors.h
llvm::LinearPolySize< TypeSize >::getFixed
static TypeSize getFixed(ScalarTy MinVal)
Definition: TypeSize.h:283
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:100
llvm::EVT::getTypeForEVT
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:182
llvm::MVT::v4i8
@ v4i8
Definition: MachineValueType.h:87
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:120
llvm::MVT::nxv4i16
@ nxv4i16
Definition: MachineValueType.h:224
processPhiNode
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function removes redundant reinterpret casts in the presence of control flow.
Definition: AArch64TargetTransformInfo.cpp:525
llvm::PatternMatch::m_One
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:517
llvm::RecurKind::UMin
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
llvm::TargetTransformInfo::PSK_Software
@ PSK_Software
Definition: TargetTransformInfo.h:585
llvm::maxnum
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1322
LoopInfo.h
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:74
llvm::APInt::ashr
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:808
llvm::AArch64TTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:323
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4442
AArch64AddressingModes.h
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:210
llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:891
llvm::MVT::v2i8
@ v2i8
Definition: MachineValueType.h:86
instCombineSVEVectorBinOp
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1154
llvm::AArch64TTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: AArch64TargetTransformInfo.cpp:2467
llvm::MVT::v4i64
@ v4i64
Definition: MachineValueType.h:133
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
TailFoldingKindLoc
TailFoldingKind TailFoldingKindLoc
Definition: AArch64TargetTransformInfo.cpp:93
llvm::MVT::nxv16i8
@ nxv16i8
Definition: MachineValueType.h:218
llvm::cl::opt< bool >
llvm::AArch64TTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: AArch64TargetTransformInfo.cpp:2972
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:75
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:298
llvm::GlobalValue
Definition: GlobalValue.h:44
llvm::TargetTransformInfo::SK_InsertSubvector
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
Definition: TargetTransformInfo.h:896
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::MVT::nxv8i64
@ nxv8i64
Definition: MachineValueType.h:239
llvm::TargetTransformInfo::OperandValueInfo::isPowerOf2
bool isPowerOf2() const
Definition: TargetTransformInfo.h:936
llvm::MVT::v16i8
@ v16i8
Definition: MachineValueType.h:89
llvm::CostTableLookup
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType >> Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
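A sketch of the non-conversion lookup, with a hypothetical table whose cost value is illustrative only:
  static const CostTblEntry CtpopTbl[] = {
      {ISD::CTPOP, MVT::v16i8, 1}, // hypothetical entry
  };
  if (const auto *Entry = CostTableLookup(CtpopTbl, ISD::CTPOP, MVT::v16i8)) {
    // Entry->Cost is the listed cost for a v16i8 ctpop.
  }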
llvm::AArch64TTIImpl::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:2353
llvm::PointerType::getUnqual
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:651
llvm::MVT::v16i16
@ v16i16
Definition: MachineValueType.h:102
llvm::MVT::v2i64
@ v2i64
Definition: MachineValueType.h:131
intrinsicIDToBinOpCode
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
Definition: AArch64TargetTransformInfo.cpp:1140
llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition: ISDOpcodes.h:820
llvm::AArch64::SVEBitsPerBlock
static constexpr unsigned SVEBitsPerBlock
Definition: AArch64BaseInfo.h:863
AArch64ExpandImm.h
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::MVT::v16f32
@ v16f32
Definition: MachineValueType.h:179
llvm::AArch64TTIImpl::getArithmeticReductionCostSVE
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:2790
llvm::LegalityPredicates::all
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
Definition: LegalizerInfo.h:228
instCombineSVEVectorFAdd
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1170
llvm::APInt::logBase2
unsigned logBase2() const
Definition: APInt.h:1672
llvm::ARM_AM::add
@ add
Definition: ARMAddressingModes.h:39
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:2847
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:53
llvm::APInt::negate
void negate()
Negate this APInt in place.
Definition: APInt.h:1413
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:417
I
#define I(x, y, z)
Definition: MD5.cpp:58
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:155
llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:929
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:446
AArch64PerfectShuffle.h
llvm::RecurKind::Add
@ Add
Sum of integers.
isAllActivePredicate
static bool isAllActivePredicate(Value *Pred)
Definition: AArch64TargetTransformInfo.cpp:1078
llvm::LoopVectorizationLegality::getFixedOrderRecurrences
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
Definition: LoopVectorizationLegality.h:297
llvm::MVT::v4f32
@ v4f32
Definition: MachineValueType.h:170
llvm::AArch64TTIImpl::isLegalToVectorizeReduction
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
Definition: AArch64TargetTransformInfo.cpp:2734
llvm::MVT::getVectorNumElements
unsigned getVectorNumElements() const
Definition: MachineValueType.h:905
llvm::MVT::i8
@ i8
Definition: MachineValueType.h:46
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
llvm::MVT::Other
@ Other
Definition: MachineValueType.h:42
llvm::LoopBase::getLoopDepth
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:97
llvm::CmpInst::FCMP_OGE
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:724
llvm::MVT::nxv4i32
@ nxv4i32
Definition: MachineValueType.h:231
llvm::Instruction::getFastMathFlags
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
Definition: Instruction.cpp:326
Ptr
@ Ptr
Definition: TargetLibraryInfo.cpp:60
llvm::CmpInst::BAD_ICMP_PREDICATE
@ BAD_ICMP_PREDICATE
Definition: InstrTypes.h:752
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:82
llvm::MVT
Machine Value Type.
Definition: MachineValueType.h:31
llvm::TargetTransformInfo::OperandValueInfo::getNoProps
OperandValueInfo getNoProps() const
Definition: TargetTransformInfo.h:943
llvm::TargetTransformInfo::SK_Splice
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
Definition: TargetTransformInfo.h:902
llvm::MVT::nxv4bf16
@ nxv4bf16
Definition: MachineValueType.h:255
llvm::LinearPolySize::getKnownMinValue
ScalarTy getKnownMinValue() const
Returns the minimum value this size can represent.
Definition: TypeSize.h:296
llvm::AArch64TTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: AArch64TargetTransformInfo.cpp:2509
llvm::AArch64TTIImpl::getSpliceCost
InstructionCost getSpliceCost(VectorType *Tp, int Index)
Definition: AArch64TargetTransformInfo.cpp:2916
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:651
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
llvm::AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
Definition: AArch64TargetTransformInfo.cpp:1505
llvm::CmpInst::isIntPredicate
bool isIntPredicate() const
Definition: InstrTypes.h:827
llvm::TargetTransformInfo::MemCmpExpansionOptions
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of a comparison against zero (memcmp(p1, p2, s) == 0).
Definition: TargetTransformInfo.h:780
TailFoldingKind::remove
void remove(uint8_t Flag)
Definition: AArch64TargetTransformInfo.cpp:90
llvm::MVT::nxv2i32
@ nxv2i32
Definition: MachineValueType.h:230
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:244
llvm::APIntOps::smin
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2147
llvm::sys::fs::remove
std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
llvm::RecurKind::UMax
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
llvm::PatternMatch::m_SpecificInt
specific_intval< false > m_SpecificInt(APInt V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:854
llvm::Sched::Source
@ Source
Definition: TargetLowering.h:99
Fallback
@ Fallback
Definition: WholeProgramDevirt.cpp:180
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::LoopInfo
Definition: LoopInfo.h:1108
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
Mul
BinaryOperator * Mul
Definition: X86PartialReduction.cpp:70
tryCombineFromSVBoolBinOp
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:573
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1741
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:49
llvm::StructType
Class to represent struct types.
Definition: DerivedTypes.h:213
llvm::MVT::v2i32
@ v2i32
Definition: MachineValueType.h:110
instCombineSVESrshl
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1395
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:352
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:222
TailFoldingKind::TFSimple
@ TFSimple
Definition: AArch64TargetTransformInfo.cpp:51
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::MVT::v2f32
@ v2f32
Definition: MachineValueType.h:168
llvm::TargetLoweringBase::AddrMode::BaseGV
GlobalValue * BaseGV
Definition: TargetLowering.h:2589
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:994
instCombineSVECondLast
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:893
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:453
llvm::RecurKind::FMax
@ FMax
FP max implemented in terms of select(cmp()).
llvm::MVT::v4i32
@ v4i32
Definition: MachineValueType.h:112
instCombineSVECmpNE
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:716
llvm::AArch64TTIImpl::getPopcntSupport
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
Definition: AArch64TargetTransformInfo.cpp:314
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
llvm::ISD::FMUL
@ FMUL
Definition: ISDOpcodes.h:392
llvm::TargetTransformInfo::SK_Transpose
@ SK_Transpose
Transpose two vectors.
Definition: TargetTransformInfo.h:895
llvm::MVT::v8i64
@ v8i64
Definition: MachineValueType.h:134
llvm::TargetTransformInfo::CastContextHint::None
@ None
The cast is not used with a load/store of any kind.
llvm::SmallPtrSetImplBase::size
size_type size() const
Definition: SmallPtrSet.h:93
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:668
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:417
AArch64TargetTransformInfo.h
llvm::MVT::v16i32
@ v16i32
Definition: MachineValueType.h:121
Callee
amdgpu Simplify well known AMD library false FunctionCallee Callee
Definition: AMDGPULibCalls.cpp:187
llvm::AArch64TTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:2263
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:164
llvm::RecurKind::FMulAdd
@ FMulAdd
Fused multiply-add of floats (a * b + c).
llvm::MVT::nxv16i1
@ nxv16i1
Definition: MachineValueType.h:210
llvm::APIntOps::umax
const APInt & umax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be unsigned.
Definition: APInt.h:2162
llvm::MVT::v8bf16
@ v8bf16
Definition: MachineValueType.h:161
llvm::MVT::nxv2i16
@ nxv2i16
Definition: MachineValueType.h:223
llvm::TargetLoweringBase::AddrMode::BaseOffs
int64_t BaseOffs
Definition: TargetLowering.h:2590
instCombineST1ScatterIndex
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1319
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:240
instCombineSVEST1
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
Definition: AArch64TargetTransformInfo.cpp:1118
TailFoldingKind::TailFoldingOpts
TailFoldingOpts
Definition: AArch64TargetTransformInfo.cpp:47
Insn
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
Definition: AArch64MIPeepholeOpt.cpp:129
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:350
llvm::Type::getIntNTy
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:243
llvm::TargetLoweringBase::AddrMode::Scale
int64_t Scale
Definition: TargetLowering.h:2592
EnableFalkorHWPFUnrollFix
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
CostTable.h
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:351
llvm::TargetTransformInfo::UnrollingPreferences::UpperBound
bool UpperBound
Allow using trip count upper bound to unroll loops.
Definition: TargetTransformInfo.h:484
llvm::AArch64_IMM::expandMOVImm
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
Definition: AArch64ExpandImm.cpp:303
llvm::CallBase::arg_size
unsigned arg_size() const
Definition: InstrTypes.h:1339
instCombineSVEZip
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1270
llvm::APInt::sextOrTrunc
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1002
llvm::MVT::v8i16
@ v8i16
Definition: MachineValueType.h:101
llvm::APInt::isNegatedPowerOf2
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:434
llvm::TypeSize
Definition: TypeSize.h:435
llvm::SCEVAddRecExpr
This node represents a polynomial recurrence on the trip count of the specified loop.
Definition: ScalarEvolutionExpressions.h:342
llvm::AArch64TTIImpl::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
Definition: AArch64TargetTransformInfo.cpp:2662
instCombineSVESel
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:672
llvm::InterleavedAccessInfo::hasGroups
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:823
llvm::LinearPolySize< TypeSize >::getScalable
static TypeSize getScalable(ScalarTy MinVal)
Definition: TypeSize.h:286
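A short sketch of how fixed and scalable sizes are expressed; 128 matches the SVEBitsPerBlock constant listed elsewhere in this index:
  TypeSize NeonBits = TypeSize::getFixed(128);    // exactly 128 bits
  TypeSize SVEBits = TypeSize::getScalable(128);  // vscale x 128 bits
  bool Scalable = SVEBits.isScalable();           // true
  uint64_t MinBits = SVEBits.getKnownMinValue();  // 128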
llvm::MVT::i32
@ i32
Definition: MachineValueType.h:48
llvm::MapVector::size
size_type size() const
Definition: MapVector.h:61
llvm::Type::getPointerTo
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:774
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:226
LoopVectorizationLegality.h
llvm::ISD::SDIV
@ SDIV
Definition: ISDOpcodes.h:242
llvm::AArch64TTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:2026
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:185
TailFoldingKind::TFDisabled
@ TFDisabled
Definition: AArch64TargetTransformInfo.cpp:48
llvm::MVT::nxv8i8
@ nxv8i8
Definition: MachineValueType.h:217
llvm::MVT::v8i32
@ v8i32
Definition: MachineValueType.h:116
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::AMDGPU::Hwreg::Width
Width
Definition: SIDefines.h:436
llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition: ISDOpcodes.h:774
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::MVT::nxv8i32
@ nxv8i32
Definition: MachineValueType.h:232
llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition: TargetTransformInfo.h:967
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
llvm::MVT::nxv8f16
@ nxv8f16
Definition: MachineValueType.h:249
llvm::InstructionCost::getInvalid
static InstructionCost getInvalid(CostType Val=0)
Definition: InstructionCost.h:74
llvm::RISCVMatInt::Imm
@ Imm
Definition: RISCVMatInt.h:23
llvm::Instruction::BinaryOps
BinaryOps
Definition: Instruction.h:793
llvm::RecurrenceDescriptor
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:69
llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:871
llvm::AArch64TTIImpl::getScalingFactorCost
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
Definition: AArch64TargetTransformInfo.cpp:3226
llvm::RecurKind::SelectICmp
@ SelectICmp
Integer select(icmp(),x,y) where one of (x,y) is loop invariant.
llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
llvm::getNumElementsFromSVEPredPattern
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
Definition: AArch64BaseInfo.h:512
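For illustration, assuming the AArch64SVEPredPattern enumeration from AArch64BaseInfo.h:
  unsigned N1 = getNumElementsFromSVEPredPattern(AArch64SVEPredPattern::vl8); // 8
  unsigned N2 = getNumElementsFromSVEPredPattern(AArch64SVEPredPattern::all); // 0, not a VL pattern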
llvm::ISD::FSUB
@ FSUB
Definition: ISDOpcodes.h:391
llvm::Pattern
Definition: FileCheckImpl.h:614
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:151
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:691
llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition: TargetTransformInfo.h:967
llvm::MVT::nxv8i1
@ nxv8i1
Definition: MachineValueType.h:209
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:772
llvm::Type::isBFloatTy
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:56
llvm::CmpInst::FCMP_UNE
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:735
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1341
N
#define N
llvm::ISD::SRL
@ SRL
Definition: ISDOpcodes.h:693
EnableFixedwidthAutovecInStreamingMode
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
TargetTransformInfo.h
llvm::AArch64TTIImpl::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:2377
llvm::TargetLoweringBase::AddrMode
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
Definition: TargetLowering.h:2588
llvm::PHINode
Definition: Instructions.h:2697
llvm::PatternMatch
Definition: PatternMatch.h:47
instCombineSVESDIV
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1351
llvm::CmpInst::FCMP_OLE
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:726
llvm::ISD::MULHU
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:637
instCombineSVELD1
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
Definition: AArch64TargetTransformInfo.cpp:1095
llvm::AArch64TTIImpl::preferPredicateOverEpilogue
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI)
Definition: AArch64TargetTransformInfo.cpp:3201
llvm::MemIntrinsicInfo
Information about a load/store intrinsic defined by the target.
Definition: TargetTransformInfo.h:72
llvm::AArch64TTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AArch64TargetTransformInfo.cpp:2621
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:47
llvm::ScalableVectorType::get
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:705
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1474
llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:911
llvm::RecurKind::FMin
@ FMin
FP min implemented in terms of select(cmp()).
TailFoldingKind::TFRecurrences
@ TFRecurrences
Definition: AArch64TargetTransformInfo.cpp:50
BB
Definition: README.txt:39
llvm::AArch64TTIImpl::shouldMaximizeVectorBandwidth
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
Definition: AArch64TargetTransformInfo.cpp:144
llvm::AArch64TTIImpl::shouldConsiderAddressTypePromotion
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
Definition: AArch64TargetTransformInfo.cpp:2707
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:245
llvm::Value::takeName
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:381
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:394
llvm::FastMathFlags::allowContract
bool allowContract() const
Definition: FMF.h:71
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
BasicTTIImpl.h
llvm::cl::desc
Definition: CommandLine.h:412
llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:760
TailFoldingKind::TFReductions
@ TFReductions
Definition: AArch64TargetTransformInfo.cpp:49
llvm::MVT::v8i8
@ v8i8
Definition: MachineValueType.h:88
llvm::TargetTransformInfo::OperandValueInfo::isConstant
bool isConstant() const
Definition: TargetTransformInfo.h:930
llvm::MVT::nxv2i8
@ nxv2i8
Definition: MachineValueType.h:215
llvm::MVT::v8f32
@ v8f32
Definition: MachineValueType.h:174
llvm::ConstantAggregateZero::get
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1587
instCombineSVEPTest
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:977
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:98
llvm::MVT::v16i64
@ v16i64
Definition: MachineValueType.h:135
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1297
llvm::PatternMatch::m_Cmp
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:89
llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:852
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:57
llvm::StringRef::split
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:692
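A minimal example of the pair-returning overload; the input string is made up:
  StringRef S("simple+reductions");
  std::pair<StringRef, StringRef> Parts = S.split('+');
  // Parts.first == "simple", Parts.second == "reductions"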
llvm::RecurKind::SMax
@ SMax
Signed integer max implemented in terms of select(cmp()).
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::AArch64Subtarget::Falkor
@ Falkor
Definition: AArch64Subtarget.h:74
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:219
llvm::Type::isFP128Ty
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:162
Debug.h
llvm::VectorType::get
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:668
SVETailFolding
cl::opt< TailFoldingKind, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE:" "\ndisabled No loop types will vectorize using tail-folding" "\ndefault Uses the default tail-folding settings for the target " "CPU" "\nall All legal loop types will vectorize using tail-folding" "\nsimple Use tail-folding for simple loops (not reductions or " "recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences"), cl::location(TailFoldingKindLoc))
instCombineSVEDup
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:680
llvm::AArch64TTIImpl::useNeonVector
bool useNeonVector(const Type *Ty) const
Definition: AArch64TargetTransformInfo.cpp:2407
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2152
llvm::MVT::nxv8f64
@ nxv8f64
Definition: MachineValueType.h:269
llvm::SMEAttrs::requiresLazySave
bool requiresLazySave(const SMEAttrs &Callee) const
Definition: AArch64SMEAttributes.h:82
llvm::ISD::CTPOP
@ CTPOP
Definition: ISDOpcodes.h:703
llvm::RecurKind::Xor
@ Xor
Bitwise or logical XOR of integers.
instCombineRDFFR
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:937
llvm::AArch64TTIImpl::getOrCreateResultFromMemIntrinsic
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
Definition: AArch64TargetTransformInfo.cpp:2626
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:164
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:288
llvm::MVT::v4i1
@ v4i1
Definition: MachineValueType.h:68
llvm::AArch64TTIImpl::enableMemCmpExpansion
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
Definition: AArch64TargetTransformInfo.cpp:2331
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:39
llvm::AArch64TTIImpl::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
Definition: AArch64TargetTransformInfo.cpp:2244
llvm::MVT::nxv8f32
@ nxv8f32
Definition: MachineValueType.h:263
llvm::SmallVectorImpl::insert
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:809
llvm::PoisonValue::get
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1732
llvm::ISD::FDIV
@ FDIV
Definition: ISDOpcodes.h:393