1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "AArch64TargetTransformInfo.h"
10 #include "AArch64ExpandImm.h"
11 #include "AArch64PerfectShuffle.h"
12 #include "MCTargetDesc/AArch64AddressingModes.h"
13 #include "llvm/Analysis/IVDescriptors.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/Analysis/TargetTransformInfo.h"
16 #include "llvm/CodeGen/BasicTTIImpl.h"
17 #include "llvm/CodeGen/CostTable.h"
18 #include "llvm/CodeGen/TargetLowering.h"
19 #include "llvm/IR/IntrinsicInst.h"
20 #include "llvm/IR/Intrinsics.h"
21 #include "llvm/IR/IntrinsicsAArch64.h"
22 #include "llvm/IR/PatternMatch.h"
23 #include "llvm/Support/Debug.h"
24 #include "llvm/Transforms/InstCombine/InstCombiner.h"
25 #include <algorithm>
26 using namespace llvm;
27 using namespace llvm::PatternMatch;
28 
29 #define DEBUG_TYPE "aarch64tti"
30 
31 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
32  cl::init(true), cl::Hidden);
33 
34 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
35  cl::Hidden);
36 
37 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
38  cl::init(10), cl::Hidden);
39 
40 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
41  const Function *Callee) const {
42  const TargetMachine &TM = getTLI()->getTargetMachine();
43 
44  const FeatureBitset &CallerBits =
45  TM.getSubtargetImpl(*Caller)->getFeatureBits();
46  const FeatureBitset &CalleeBits =
47  TM.getSubtargetImpl(*Callee)->getFeatureBits();
48 
49  // Inline a callee if its target-features are a subset of the caller's
50  // target-features.
51  return (CallerBits & CalleeBits) == CalleeBits;
52 }
53 
54 /// Calculate the cost of materializing a 64-bit value. This helper
55 /// method might only calculate a fraction of a larger immediate. Therefore it
56 /// is valid to return a cost of ZERO.
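// Illustrative note (not part of the upstream comment): a value such as 0x1234
// needs a single MOVZ (cost 1), while something like 0x0012003400560078 needs
// a MOVZ plus one MOVK per additional non-zero 16-bit chunk, giving a cost of 4.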
57 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
58  // Check if the immediate can be encoded within an instruction.
59  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
60  return 0;
61 
62  if (Val < 0)
63  Val = ~Val;
64 
65  // Calculate how many moves we will need to materialize this constant.
66  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
67  AArch64_IMM::expandMOVImm(Val, 64, Insn);
68  return Insn.size();
69 }
70 
71 /// Calculate the cost of materializing the given constant.
72 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
73  TTI::TargetCostKind CostKind) {
74  assert(Ty->isIntegerTy());
75 
76  unsigned BitSize = Ty->getPrimitiveSizeInBits();
77  if (BitSize == 0)
78  return ~0U;
79 
80  // Sign-extend all constants to a multiple of 64-bit.
81  APInt ImmVal = Imm;
82  if (BitSize & 0x3f)
83  ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
84 
85  // Split the constant into 64-bit chunks and calculate the cost for each
86  // chunk.
87  InstructionCost Cost = 0;
88  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
89  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
90  int64_t Val = Tmp.getSExtValue();
91  Cost += getIntImmCost(Val);
92  }
93  // We need at least one instruction to materialize the constant.
94  return std::max<InstructionCost>(1, Cost);
95 }
96 
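// Returning TCC_Free below tells constant hoisting that the immediate can be
// folded into its user, e.g. the #42 in "add w0, w1, #42", so there is no
// benefit in hoisting it into a register.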
97 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
98  const APInt &Imm, Type *Ty,
99  TTI::TargetCostKind CostKind,
100  Instruction *Inst) {
101  assert(Ty->isIntegerTy());
102 
103  unsigned BitSize = Ty->getPrimitiveSizeInBits();
104  // There is no cost model for constants with a bit size of 0. Return TCC_Free
105  // here, so that constant hoisting will ignore this constant.
106  if (BitSize == 0)
107  return TTI::TCC_Free;
108 
109  unsigned ImmIdx = ~0U;
110  switch (Opcode) {
111  default:
112  return TTI::TCC_Free;
113  case Instruction::GetElementPtr:
114  // Always hoist the base address of a GetElementPtr.
115  if (Idx == 0)
116  return 2 * TTI::TCC_Basic;
117  return TTI::TCC_Free;
118  case Instruction::Store:
119  ImmIdx = 0;
120  break;
121  case Instruction::Add:
122  case Instruction::Sub:
123  case Instruction::Mul:
124  case Instruction::UDiv:
125  case Instruction::SDiv:
126  case Instruction::URem:
127  case Instruction::SRem:
128  case Instruction::And:
129  case Instruction::Or:
130  case Instruction::Xor:
131  case Instruction::ICmp:
132  ImmIdx = 1;
133  break;
134  // Always return TCC_Free for the shift value of a shift instruction.
135  case Instruction::Shl:
136  case Instruction::LShr:
137  case Instruction::AShr:
138  if (Idx == 1)
139  return TTI::TCC_Free;
140  break;
141  case Instruction::Trunc:
142  case Instruction::ZExt:
143  case Instruction::SExt:
144  case Instruction::IntToPtr:
145  case Instruction::PtrToInt:
146  case Instruction::BitCast:
147  case Instruction::PHI:
148  case Instruction::Call:
149  case Instruction::Select:
150  case Instruction::Ret:
151  case Instruction::Load:
152  break;
153  }
154 
155  if (Idx == ImmIdx) {
156  int NumConstants = (BitSize + 63) / 64;
157  InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
158  return (Cost <= NumConstants * TTI::TCC_Basic)
159  ? static_cast<int>(TTI::TCC_Free)
160  : Cost;
161  }
162  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
163 }
164 
165 InstructionCost
166 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
167  const APInt &Imm, Type *Ty,
168  TTI::TargetCostKind CostKind) {
169  assert(Ty->isIntegerTy());
170 
171  unsigned BitSize = Ty->getPrimitiveSizeInBits();
172  // There is no cost model for constants with a bit size of 0. Return TCC_Free
173  // here, so that constant hoisting will ignore this constant.
174  if (BitSize == 0)
175  return TTI::TCC_Free;
176 
177  // Most (all?) AArch64 intrinsics do not support folding immediates into the
178  // selected instruction, so we compute the materialization cost for the
179  // immediate directly.
180  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
181  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
182 
183  switch (IID) {
184  default:
185  return TTI::TCC_Free;
186  case Intrinsic::sadd_with_overflow:
187  case Intrinsic::uadd_with_overflow:
188  case Intrinsic::ssub_with_overflow:
189  case Intrinsic::usub_with_overflow:
190  case Intrinsic::smul_with_overflow:
191  case Intrinsic::umul_with_overflow:
192  if (Idx == 1) {
193  int NumConstants = (BitSize + 63) / 64;
194  InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
195  return (Cost <= NumConstants * TTI::TCC_Basic)
196  ? static_cast<int>(TTI::TCC_Free)
197  : Cost;
198  }
199  break;
200  case Intrinsic::experimental_stackmap:
201  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
202  return TTI::TCC_Free;
203  break;
204  case Intrinsic::experimental_patchpoint_void:
205  case Intrinsic::experimental_patchpoint_i64:
206  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
207  return TTI::TCC_Free;
208  break;
209  case Intrinsic::experimental_gc_statepoint:
210  if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
211  return TTI::TCC_Free;
212  break;
213  }
214  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
215 }
216 
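// On AArch64 a scalar popcount is typically lowered through the vector CNT
// instruction (roughly fmov + cnt + addv for i64), which is why 32- and 64-bit
// widths report fast hardware support below.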
217 TargetTransformInfo::PopcntSupportKind
218 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
219  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
220  if (TyWidth == 32 || TyWidth == 64)
221  return TTI::PSK_FastHardware;
222  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
223  return TTI::PSK_Software;
224 }
225 
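// Note: getTypeLegalizationCost returns a (cost, legal MVT) pair; LT.first is
// the legalization cost, i.e. the number of legal-type pieces the original
// type splits into. For example, a <8 x i32> smin legalizes to two v4i32
// operations, so its cost is 2.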
226 InstructionCost
227 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
228  TTI::TargetCostKind CostKind) {
229  auto *RetTy = ICA.getReturnType();
230  switch (ICA.getID()) {
231  case Intrinsic::umin:
232  case Intrinsic::umax:
233  case Intrinsic::smin:
234  case Intrinsic::smax: {
235  static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
236  MVT::v8i16, MVT::v2i32, MVT::v4i32};
237  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
238  // v2i64 types get converted to cmp+bif hence the cost of 2
239  if (LT.second == MVT::v2i64)
240  return LT.first * 2;
241  if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
242  return LT.first;
243  break;
244  }
245  case Intrinsic::sadd_sat:
246  case Intrinsic::ssub_sat:
247  case Intrinsic::uadd_sat:
248  case Intrinsic::usub_sat: {
249  static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
250  MVT::v8i16, MVT::v2i32, MVT::v4i32,
251  MVT::v2i64};
252  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
253  // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
254  // need to extend the type, as it uses shr(qadd(shl, shl)).
255  unsigned Instrs =
256  LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
257  if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
258  return LT.first * Instrs;
259  break;
260  }
261  case Intrinsic::abs: {
262  static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
263  MVT::v8i16, MVT::v2i32, MVT::v4i32,
264  MVT::v2i64};
265  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
266  if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
267  return LT.first;
268  break;
269  }
270  case Intrinsic::experimental_stepvector: {
271  InstructionCost Cost = 1; // Cost of the `index' instruction
272  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
273  // Legalisation of illegal vectors involves an `index' instruction plus
274  // (LT.first - 1) vector adds.
275  if (LT.first > 1) {
276  Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
277  InstructionCost AddCost =
278  getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
279  Cost += AddCost * (LT.first - 1);
280  }
281  return Cost;
282  }
283  case Intrinsic::bitreverse: {
284  static const CostTblEntry BitreverseTbl[] = {
285  {Intrinsic::bitreverse, MVT::i32, 1},
286  {Intrinsic::bitreverse, MVT::i64, 1},
287  {Intrinsic::bitreverse, MVT::v8i8, 1},
288  {Intrinsic::bitreverse, MVT::v16i8, 1},
289  {Intrinsic::bitreverse, MVT::v4i16, 2},
290  {Intrinsic::bitreverse, MVT::v8i16, 2},
291  {Intrinsic::bitreverse, MVT::v2i32, 2},
292  {Intrinsic::bitreverse, MVT::v4i32, 2},
293  {Intrinsic::bitreverse, MVT::v1i64, 2},
294  {Intrinsic::bitreverse, MVT::v2i64, 2},
295  };
296  const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
297  const auto *Entry =
298  CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
299  if (Entry) {
300  // The cost model uses the legal type (i32) that i8 and i16 are promoted
301  // to, plus 1, so that we match the actual lowering cost.
302  if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
303  TLI->getValueType(DL, RetTy, true) == MVT::i16)
304  return LegalisationCost.first * Entry->Cost + 1;
305 
306  return LegalisationCost.first * Entry->Cost;
307  }
308  break;
309  }
310  case Intrinsic::ctpop: {
311  static const CostTblEntry CtpopCostTbl[] = {
312  {ISD::CTPOP, MVT::v2i64, 4},
313  {ISD::CTPOP, MVT::v4i32, 3},
314  {ISD::CTPOP, MVT::v8i16, 2},
315  {ISD::CTPOP, MVT::v16i8, 1},
316  {ISD::CTPOP, MVT::i64, 4},
317  {ISD::CTPOP, MVT::v2i32, 3},
318  {ISD::CTPOP, MVT::v4i16, 2},
319  {ISD::CTPOP, MVT::v8i8, 1},
320  {ISD::CTPOP, MVT::i32, 5},
321  };
322  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
323  MVT MTy = LT.second;
324  if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
325  // Extra cost of +1 when illegal vector types are legalized by promoting
326  // the integer type.
327  int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
328  RetTy->getScalarSizeInBits()
329  ? 1
330  : 0;
331  return LT.first * Entry->Cost + ExtraCost;
332  }
333  break;
334  }
335  case Intrinsic::sadd_with_overflow:
336  case Intrinsic::uadd_with_overflow:
337  case Intrinsic::ssub_with_overflow:
338  case Intrinsic::usub_with_overflow:
339  case Intrinsic::smul_with_overflow:
340  case Intrinsic::umul_with_overflow: {
341  static const CostTblEntry WithOverflowCostTbl[] = {
342  {Intrinsic::sadd_with_overflow, MVT::i8, 3},
343  {Intrinsic::uadd_with_overflow, MVT::i8, 3},
344  {Intrinsic::sadd_with_overflow, MVT::i16, 3},
345  {Intrinsic::uadd_with_overflow, MVT::i16, 3},
346  {Intrinsic::sadd_with_overflow, MVT::i32, 1},
347  {Intrinsic::uadd_with_overflow, MVT::i32, 1},
348  {Intrinsic::sadd_with_overflow, MVT::i64, 1},
349  {Intrinsic::uadd_with_overflow, MVT::i64, 1},
350  {Intrinsic::ssub_with_overflow, MVT::i8, 3},
351  {Intrinsic::usub_with_overflow, MVT::i8, 3},
352  {Intrinsic::ssub_with_overflow, MVT::i16, 3},
353  {Intrinsic::usub_with_overflow, MVT::i16, 3},
354  {Intrinsic::ssub_with_overflow, MVT::i32, 1},
355  {Intrinsic::usub_with_overflow, MVT::i32, 1},
356  {Intrinsic::ssub_with_overflow, MVT::i64, 1},
357  {Intrinsic::usub_with_overflow, MVT::i64, 1},
358  {Intrinsic::smul_with_overflow, MVT::i8, 5},
359  {Intrinsic::umul_with_overflow, MVT::i8, 4},
360  {Intrinsic::smul_with_overflow, MVT::i16, 5},
361  {Intrinsic::umul_with_overflow, MVT::i16, 4},
362  {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
363  {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
364  {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
365  {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
366  };
367  EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
368  if (MTy.isSimple())
369  if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
370  MTy.getSimpleVT()))
371  return Entry->Cost;
372  break;
373  }
374  case Intrinsic::fptosi_sat:
375  case Intrinsic::fptoui_sat: {
376  if (ICA.getArgTypes().empty())
377  break;
378  bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
379  auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]);
380  EVT MTy = TLI->getValueType(DL, RetTy);
381  // Check for the legal types, which are where the size of the input and the
382  // output are the same, or we are using cvt f64->i32 or f32->i64.
383  if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
384  LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
385  LT.second == MVT::v2f64) &&
386  (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
387  (LT.second == MVT::f64 && MTy == MVT::i32) ||
388  (LT.second == MVT::f32 && MTy == MVT::i64)))
389  return LT.first;
390  // Similarly for fp16 sizes
391  if (ST->hasFullFP16() &&
392  ((LT.second == MVT::f16 && MTy == MVT::i32) ||
393  ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
394  (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
395  return LT.first;
396 
397  // Otherwise we use a legal convert followed by a min+max
398  if ((LT.second.getScalarType() == MVT::f32 ||
399  LT.second.getScalarType() == MVT::f64 ||
400  (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
401  LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
402  Type *LegalTy =
403  Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
404  if (LT.second.isVector())
405  LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
406  InstructionCost Cost = 1;
407  IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
408  LegalTy, {LegalTy, LegalTy});
409  Cost += getIntrinsicInstrCost(Attrs1, CostKind);
410  IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
411  LegalTy, {LegalTy, LegalTy});
412  Cost += getIntrinsicInstrCost(Attrs2, CostKind);
413  return LT.first * Cost;
414  }
415  break;
416  }
417  default:
418  break;
419  }
420  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
421 }
422 
423 /// The function will remove redundant reinterprets casting in the presence
424 /// of control flow.
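// Sketch of the rewrite (names illustrative):
//   %phi = phi <vscale x 16 x i1> [ %a.svbool, %bb0 ], [ %b.svbool, %bb1 ]
//   %r   = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(%phi)
// becomes a phi over the original <vscale x 4 x i1> values, so both the phi
// over svbool and the conversions can be dropped.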
425 static Optional<Instruction *> processPhiNode(InstCombiner &IC,
426  IntrinsicInst &II) {
427  SmallVector<Instruction *, 32> Worklist;
428  auto RequiredType = II.getType();
429 
430  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
431  assert(PN && "Expected Phi Node!");
432 
433  // Don't create a new Phi unless we can remove the old one.
434  if (!PN->hasOneUse())
435  return None;
436 
437  for (Value *IncValPhi : PN->incoming_values()) {
438  auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
439  if (!Reinterpret ||
440  Reinterpret->getIntrinsicID() !=
441  Intrinsic::aarch64_sve_convert_to_svbool ||
442  RequiredType != Reinterpret->getArgOperand(0)->getType())
443  return None;
444  }
445 
446  // Create the new Phi
447  LLVMContext &Ctx = PN->getContext();
448  IRBuilder<> Builder(Ctx);
449  Builder.SetInsertPoint(PN);
450  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
451  Worklist.push_back(PN);
452 
453  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
454  auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
455  NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
456  Worklist.push_back(Reinterpret);
457  }
458 
459  // Cleanup Phi Node and reinterprets
460  return IC.replaceInstUsesWith(II, NPN);
461 }
462 
463 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
464 // => (binop (pred) (from_svbool _) (from_svbool _))
465 //
466 // The above transformation eliminates a `to_svbool` in the predicate
467 // operand of bitwise operation `binop` by narrowing the vector width of
468 // the operation. For example, it would convert a `<vscale x 16 x i1>
469 // and` into a `<vscale x 4 x i1> and`. This is profitable because
470 // to_svbool must zero the new lanes during widening, whereas
471 // from_svbool is free.
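// For instance (illustrative widths), with %p : <vscale x 4 x i1>:
//   from_svbool(and_z(to_svbool(%p), %a, %b))
// becomes and_z(%p, from_svbool(%a), from_svbool(%b)), performed at
// <vscale x 4 x i1>.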
472 static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC,
473  IntrinsicInst &II) {
474  auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
475  if (!BinOp)
476  return None;
477 
478  auto IntrinsicID = BinOp->getIntrinsicID();
479  switch (IntrinsicID) {
480  case Intrinsic::aarch64_sve_and_z:
481  case Intrinsic::aarch64_sve_bic_z:
482  case Intrinsic::aarch64_sve_eor_z:
483  case Intrinsic::aarch64_sve_nand_z:
484  case Intrinsic::aarch64_sve_nor_z:
485  case Intrinsic::aarch64_sve_orn_z:
486  case Intrinsic::aarch64_sve_orr_z:
487  break;
488  default:
489  return None;
490  }
491 
492  auto BinOpPred = BinOp->getOperand(0);
493  auto BinOpOp1 = BinOp->getOperand(1);
494  auto BinOpOp2 = BinOp->getOperand(2);
495 
496  auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
497  if (!PredIntr ||
498  PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
499  return None;
500 
501  auto PredOp = PredIntr->getOperand(0);
502  auto PredOpTy = cast<VectorType>(PredOp->getType());
503  if (PredOpTy != II.getType())
504  return None;
505 
506  IRBuilder<> Builder(II.getContext());
507  Builder.SetInsertPoint(&II);
508 
509  SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
510  auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
511  Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
512  NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
513  if (BinOpOp1 == BinOpOp2)
514  NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
515  else
516  NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
517  Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
518 
519  auto NarrowedBinOp =
520  Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
521  return IC.replaceInstUsesWith(II, NarrowedBinOp);
522 }
523 
524 static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
525  IntrinsicInst &II) {
526  // If the reinterpret instruction operand is a PHI Node
527  if (isa<PHINode>(II.getArgOperand(0)))
528  return processPhiNode(IC, II);
529 
530  if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
531  return BinOpCombine;
532 
533  SmallVector<Instruction *, 32> CandidatesForRemoval;
534  Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
535 
536  const auto *IVTy = cast<VectorType>(II.getType());
537 
538  // Walk the chain of conversions.
539  while (Cursor) {
540  // If the type of the cursor has fewer lanes than the final result, zeroing
541  // must take place, which breaks the equivalence chain.
542  const auto *CursorVTy = cast<VectorType>(Cursor->getType());
543  if (CursorVTy->getElementCount().getKnownMinValue() <
544  IVTy->getElementCount().getKnownMinValue())
545  break;
546 
547  // If the cursor has the same type as I, it is a viable replacement.
548  if (Cursor->getType() == IVTy)
549  EarliestReplacement = Cursor;
550 
551  auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
552 
553  // If this is not an SVE conversion intrinsic, this is the end of the chain.
554  if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
555  Intrinsic::aarch64_sve_convert_to_svbool ||
556  IntrinsicCursor->getIntrinsicID() ==
557  Intrinsic::aarch64_sve_convert_from_svbool))
558  break;
559 
560  CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
561  Cursor = IntrinsicCursor->getOperand(0);
562  }
563 
564  // If no viable replacement in the conversion chain was found, there is
565  // nothing to do.
566  if (!EarliestReplacement)
567  return None;
568 
569  return IC.replaceInstUsesWith(II, EarliestReplacement);
570 }
571 
572 static Optional<Instruction *> instCombineSVESel(InstCombiner &IC,
573  IntrinsicInst &II) {
574  IRBuilder<> Builder(&II);
575  auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1),
576  II.getOperand(2));
577  return IC.replaceInstUsesWith(II, Select);
578 }
579 
580 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
581  IntrinsicInst &II) {
582  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
583  if (!Pg)
584  return None;
585 
586  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
587  return None;
588 
589  const auto PTruePattern =
590  cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
591  if (PTruePattern != AArch64SVEPredPattern::vl1)
592  return None;
593 
594  // The intrinsic is inserting into lane zero so use an insert instead.
595  auto *IdxTy = Type::getInt64Ty(II.getContext());
596  auto *Insert = InsertElementInst::Create(
597  II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
598  Insert->insertBefore(&II);
599  Insert->takeName(&II);
600 
601  return IC.replaceInstUsesWith(II, Insert);
602 }
603 
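// aarch64.sve.dup.x splats a scalar across all lanes, so it maps directly to a
// generic IR splat; e.g. dup.x(i32 7) simply becomes a vector splat of 7 that
// later passes can reason about.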
604 static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
605  IntrinsicInst &II) {
606  // Replace DupX with a regular IR splat.
607  IRBuilder<> Builder(II.getContext());
608  Builder.SetInsertPoint(&II);
609  auto *RetTy = cast<ScalableVectorType>(II.getType());
610  Value *Splat =
611  Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
612  Splat->takeName(&II);
613  return IC.replaceInstUsesWith(II, Splat);
614 }
615 
616 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
617  IntrinsicInst &II) {
618  LLVMContext &Ctx = II.getContext();
619  IRBuilder<> Builder(Ctx);
620  Builder.SetInsertPoint(&II);
621 
622  // Check that the predicate is all active
623  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
624  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
625  return None;
626 
627  const auto PTruePattern =
628  cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
629  if (PTruePattern != AArch64SVEPredPattern::all)
630  return None;
631 
632  // Check that we have a compare of zero..
633  auto *SplatValue =
634  dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
635  if (!SplatValue || !SplatValue->isZero())
636  return None;
637 
638  // ..against a dupq
639  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
640  if (!DupQLane ||
641  DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
642  return None;
643 
644  // Where the dupq is a lane 0 replicate of a vector insert
645  if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
646  return None;
647 
648  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
649  if (!VecIns ||
650  VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
651  return None;
652 
653  // Where the vector insert is a fixed constant vector insert into undef at
654  // index zero
655  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
656  return None;
657 
658  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
659  return None;
660 
661  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
662  if (!ConstVec)
663  return None;
664 
665  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
666  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
667  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
668  return None;
669 
670  unsigned NumElts = VecTy->getNumElements();
671  unsigned PredicateBits = 0;
672 
673  // Expand intrinsic operands to a 16-bit byte level predicate
674  for (unsigned I = 0; I < NumElts; ++I) {
675  auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
676  if (!Arg)
677  return None;
678  if (!Arg->isZero())
679  PredicateBits |= 1 << (I * (16 / NumElts));
680  }
681 
682  // If all bits are zero bail early with an empty predicate
683  if (PredicateBits == 0) {
684  auto *PFalse = Constant::getNullValue(II.getType());
685  PFalse->takeName(&II);
686  return IC.replaceInstUsesWith(II, PFalse);
687  }
688 
689  // Calculate largest predicate type used (where byte predicate is largest)
690  unsigned Mask = 8;
691  for (unsigned I = 0; I < 16; ++I)
692  if ((PredicateBits & (1 << I)) != 0)
693  Mask |= (I % 8);
694 
695  unsigned PredSize = Mask & -Mask;
696  auto *PredType = ScalableVectorType::get(
697  Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
698 
699  // Ensure all relevant bits are set
700  for (unsigned I = 0; I < 16; I += PredSize)
701  if ((PredicateBits & (1 << I)) == 0)
702  return None;
703 
704  auto *PTruePat =
705  ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
706  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
707  {PredType}, {PTruePat});
708  auto *ConvertToSVBool = Builder.CreateIntrinsic(
709  Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
710  auto *ConvertFromSVBool =
711  Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
712  {II.getType()}, {ConvertToSVBool});
713 
714  ConvertFromSVBool->takeName(&II);
715  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
716 }
717 
718 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
719  IntrinsicInst &II) {
720  IRBuilder<> Builder(II.getContext());
721  Builder.SetInsertPoint(&II);
722  Value *Pg = II.getArgOperand(0);
723  Value *Vec = II.getArgOperand(1);
724  auto IntrinsicID = II.getIntrinsicID();
725  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
726 
727  // lastX(splat(X)) --> X
728  if (auto *SplatVal = getSplatValue(Vec))
729  return IC.replaceInstUsesWith(II, SplatVal);
730 
731  // If x and/or y is a splat value then:
732  // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
733  Value *LHS, *RHS;
734  if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
735  if (isSplatValue(LHS) || isSplatValue(RHS)) {
736  auto *OldBinOp = cast<BinaryOperator>(Vec);
737  auto OpC = OldBinOp->getOpcode();
738  auto *NewLHS =
739  Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
740  auto *NewRHS =
741  Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
742  auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
743  OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
744  return IC.replaceInstUsesWith(II, NewBinOp);
745  }
746  }
747 
748  auto *C = dyn_cast<Constant>(Pg);
749  if (IsAfter && C && C->isNullValue()) {
750  // The intrinsic is extracting lane 0 so use an extract instead.
751  auto *IdxTy = Type::getInt64Ty(II.getContext());
752  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
753  Extract->insertBefore(&II);
754  Extract->takeName(&II);
755  return IC.replaceInstUsesWith(II, Extract);
756  }
757 
758  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
759  if (!IntrPG)
760  return None;
761 
762  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
763  return None;
764 
765  const auto PTruePattern =
766  cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
767 
768  // Can the intrinsic's predicate be converted to a known constant index?
769  unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
770  if (!MinNumElts)
771  return None;
772 
773  unsigned Idx = MinNumElts - 1;
774  // Increment the index if extracting the element after the last active
775  // predicate element.
776  if (IsAfter)
777  ++Idx;
778 
779  // Ignore extracts whose index is larger than the known minimum vector
780  // length. NOTE: This is an artificial constraint where we prefer to
781  // maintain what the user asked for until an alternative is proven faster.
782  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
783  if (Idx >= PgVTy->getMinNumElements())
784  return None;
785 
786  // The intrinsic is extracting a fixed lane so use an extract instead.
787  auto *IdxTy = Type::getInt64Ty(II.getContext());
788  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
789  Extract->insertBefore(&II);
790  Extract->takeName(&II);
791  return IC.replaceInstUsesWith(II, Extract);
792 }
793 
794 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
795  IntrinsicInst &II) {
796  LLVMContext &Ctx = II.getContext();
797  IRBuilder<> Builder(Ctx);
798  Builder.SetInsertPoint(&II);
799  // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
800  // can work with RDFFR_PP for ptest elimination.
801  auto *AllPat =
802  ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
803  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
804  {II.getType()}, {AllPat});
805  auto *RDFFR =
806  Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
807  RDFFR->takeName(&II);
808  return IC.replaceInstUsesWith(II, RDFFR);
809 }
810 
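// For example: cntd with the 'all' pattern is vscale * 2 (the number of 64-bit
// elements) and is emitted below via llvm.vscale, while cntw with pattern vl4
// folds to the constant 4, since every SVE register holds at least four words.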
811 static Optional<Instruction *>
812 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
813  const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
814 
815  if (Pattern == AArch64SVEPredPattern::all) {
816  LLVMContext &Ctx = II.getContext();
817  IRBuilder<> Builder(Ctx);
818  Builder.SetInsertPoint(&II);
819 
820  Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
821  auto *VScale = Builder.CreateVScale(StepVal);
822  VScale->takeName(&II);
823  return IC.replaceInstUsesWith(II, VScale);
824  }
825 
826  unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
827 
828  return MinNumElts && NumElts >= MinNumElts
829  ? Optional<Instruction *>(IC.replaceInstUsesWith(
830  II, ConstantInt::get(II.getType(), MinNumElts)))
831  : None;
832 }
833 
834 static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
835  IntrinsicInst &II) {
836  IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
837  IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
838 
839  if (Op1 && Op2 &&
840  Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
841  Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
842  Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
843 
844  IRBuilder<> Builder(II.getContext());
845  Builder.SetInsertPoint(&II);
846 
847  Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
848  Type *Tys[] = {Op1->getArgOperand(0)->getType()};
849 
850  auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
851 
852  PTest->takeName(&II);
853  return IC.replaceInstUsesWith(II, PTest);
854  }
855 
856  return None;
857 }
858 
859 static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC,
860  IntrinsicInst &II) {
861  // fold (fadd p a (fmul p b c)) -> (fma p a b c)
862  Value *P = II.getOperand(0);
863  Value *A = II.getOperand(1);
864  auto FMul = II.getOperand(2);
865  Value *B, *C;
866  if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>(
867  m_Specific(P), m_Value(B), m_Value(C))))
868  return None;
869 
870  if (!FMul->hasOneUse())
871  return None;
872 
873  llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
874  // Stop the combine when the flags on the inputs differ in case dropping flags
875  // would lead to us missing out on more beneficial optimizations.
876  if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags())
877  return None;
878  if (!FAddFlags.allowContract())
879  return None;
880 
881  IRBuilder<> Builder(II.getContext());
882  Builder.SetInsertPoint(&II);
883  auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla,
884  {II.getType()}, {P, A, B, C}, &II);
885  FMLA->setFastMathFlags(FAddFlags);
886  return IC.replaceInstUsesWith(II, FMLA);
887 }
888 
889 static bool isAllActivePredicate(Value *Pred) {
890  // Look through convert.from.svbool(convert.to.svbool(...) chain.
891  Value *UncastedPred;
892  if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
893  m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
894  m_Value(UncastedPred)))))
895  // If the predicate has the same or less lanes than the uncasted
896  // predicate then we know the casting has no effect.
897  if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
898  cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
899  Pred = UncastedPred;
900 
901  return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
902  m_ConstantInt<AArch64SVEPredPattern::all>()));
903 }
904 
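// An sve.ld1 whose predicate is known all-active is just an unmasked load, so
// it becomes a plain IR load below; otherwise it is rewritten as a
// llvm.masked.load with a zero passthru, which generic passes understand
// better than the target intrinsic.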
905 static Optional<Instruction *> instCombineSVELD1(InstCombiner &IC,
906  IntrinsicInst &II, const DataLayout &DL) {
907  IRBuilder<> Builder(II.getContext());
908  Builder.SetInsertPoint(&II);
909 
910  Value *Pred = II.getOperand(0);
911  Value *PtrOp = II.getOperand(1);
912  Type *VecTy = II.getType();
913  Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());
914 
915  if (isAllActivePredicate(Pred)) {
916  LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
917  Load->copyMetadata(II);
918  return IC.replaceInstUsesWith(II, Load);
919  }
920 
921  CallInst *MaskedLoad =
922  Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
923  Pred, ConstantAggregateZero::get(VecTy));
924  MaskedLoad->copyMetadata(II);
925  return IC.replaceInstUsesWith(II, MaskedLoad);
926 }
927 
928 static Optional<Instruction *> instCombineSVEST1(InstCombiner &IC,
929  IntrinsicInst &II, const DataLayout &DL) {
930  IRBuilder<> Builder(II.getContext());
931  Builder.SetInsertPoint(&II);
932 
933  Value *VecOp = II.getOperand(0);
934  Value *Pred = II.getOperand(1);
935  Value *PtrOp = II.getOperand(2);
936  Value *VecPtr =
937  Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());
938 
939  if (isAllActivePredicate(Pred)) {
940  StoreInst *Store = Builder.CreateStore(VecOp, VecPtr);
941  Store->copyMetadata(II);
942  return IC.eraseInstFromFunction(II);
943  }
944 
945  CallInst *MaskedStore = Builder.CreateMaskedStore(
946  VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred);
947  MaskedStore->copyMetadata(II);
948  return IC.eraseInstFromFunction(II);
949 }
950 
951 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
952  switch (Intrinsic) {
953  case Intrinsic::aarch64_sve_fmul:
954  return Instruction::BinaryOps::FMul;
955  case Intrinsic::aarch64_sve_fadd:
956  return Instruction::BinaryOps::FAdd;
957  case Intrinsic::aarch64_sve_fsub:
958  return Instruction::BinaryOps::FSub;
959  default:
960  return Instruction::BinaryOpsEnd;
961  }
962 }
963 
964 static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC,
965  IntrinsicInst &II) {
966  auto *OpPredicate = II.getOperand(0);
967  auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
968  if (BinOpCode == Instruction::BinaryOpsEnd ||
969  !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
970  m_ConstantInt<AArch64SVEPredPattern::all>())))
971  return None;
972  IRBuilder<> Builder(II.getContext());
973  Builder.SetInsertPoint(&II);
974  Builder.setFastMathFlags(II.getFastMathFlags());
975  auto BinOp =
976  Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
977  return IC.replaceInstUsesWith(II, BinOp);
978 }
979 
980 static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC,
981  IntrinsicInst &II) {
982  if (auto FMLA = instCombineSVEVectorFMLA(IC, II))
983  return FMLA;
984  return instCombineSVEVectorBinOp(IC, II);
985 }
986 
987 static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
988  IntrinsicInst &II) {
989  auto *OpPredicate = II.getOperand(0);
990  auto *OpMultiplicand = II.getOperand(1);
991  auto *OpMultiplier = II.getOperand(2);
992 
993  IRBuilder<> Builder(II.getContext());
994  Builder.SetInsertPoint(&II);
995 
996  // Return true if a given instruction is a unit splat value, false otherwise.
997  auto IsUnitSplat = [](auto *I) {
998  auto *SplatValue = getSplatValue(I);
999  if (!SplatValue)
1000  return false;
1001  return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1002  };
1003 
1004  // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1005  // with a unit splat value, false otherwise.
1006  auto IsUnitDup = [](auto *I) {
1007  auto *IntrI = dyn_cast<IntrinsicInst>(I);
1008  if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1009  return false;
1010 
1011  auto *SplatValue = IntrI->getOperand(2);
1012  return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1013  };
1014 
1015  if (IsUnitSplat(OpMultiplier)) {
1016  // [f]mul pg %n, (dupx 1) => %n
1017  OpMultiplicand->takeName(&II);
1018  return IC.replaceInstUsesWith(II, OpMultiplicand);
1019  } else if (IsUnitDup(OpMultiplier)) {
1020  // [f]mul pg %n, (dup pg 1) => %n
1021  auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1022  auto *DupPg = DupInst->getOperand(1);
1023  // TODO: this is naive. The optimization is still valid if DupPg
1024  // 'encompasses' OpPredicate, not only if they're the same predicate.
1025  if (OpPredicate == DupPg) {
1026  OpMultiplicand->takeName(&II);
1027  return IC.replaceInstUsesWith(II, OpMultiplicand);
1028  }
1029  }
1030 
1031  return instCombineSVEVectorBinOp(IC, II);
1032 }
1033 
1034 static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1035  IntrinsicInst &II) {
1036  IRBuilder<> Builder(II.getContext());
1037  Builder.SetInsertPoint(&II);
1038  Value *UnpackArg = II.getArgOperand(0);
1039  auto *RetTy = cast<ScalableVectorType>(II.getType());
1040  bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1041  II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1042 
1043  // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1044  // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1045  if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1046  ScalarArg =
1047  Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1048  Value *NewVal =
1049  Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1050  NewVal->takeName(&II);
1051  return IC.replaceInstUsesWith(II, NewVal);
1052  }
1053 
1054  return None;
1055 }
1056 static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1057  IntrinsicInst &II) {
1058  auto *OpVal = II.getOperand(0);
1059  auto *OpIndices = II.getOperand(1);
1060  VectorType *VTy = cast<VectorType>(II.getType());
1061 
1062  // Check whether OpIndices is a constant splat value < minimal element count
1063  // of result.
1064  auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1065  if (!SplatValue ||
1066  SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1067  return None;
1068 
1069  // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1070  // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1071  IRBuilder<> Builder(II.getContext());
1072  Builder.SetInsertPoint(&II);
1073  auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
1074  auto *VectorSplat =
1075  Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1076 
1077  VectorSplat->takeName(&II);
1078  return IC.replaceInstUsesWith(II, VectorSplat);
1079 }
1080 
1081 static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC,
1082  IntrinsicInst &II) {
1083  // Try to remove sequences of tuple get/set.
1084  Value *SetTuple, *SetIndex, *SetValue;
1085  auto *GetTuple = II.getArgOperand(0);
1086  auto *GetIndex = II.getArgOperand(1);
1087  // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a
1088  // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue).
1089  // Make sure that the types of the current intrinsic and SetValue match
1090  // in order to safely remove the sequence.
1091  if (!match(GetTuple,
1092  m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>(
1093  m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) ||
1094  SetValue->getType() != II.getType())
1095  return None;
1096  // Case where we get the same index right after setting it.
1097  // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue
1098  if (GetIndex == SetIndex)
1099  return IC.replaceInstUsesWith(II, SetValue);
1100  // If we are getting a different index than what was set in the tuple_set
1101  // intrinsic. We can just set the input tuple to the one up in the chain.
1102  // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex)
1103  // --> tuple_get(SetTuple, GetIndex)
1104  return IC.replaceOperand(II, 0, SetTuple);
1105 }
1106 
1107 static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1108  IntrinsicInst &II) {
1109  // zip1(uzp1(A, B), uzp2(A, B)) --> A
1110  // zip2(uzp1(A, B), uzp2(A, B)) --> B
1111  Value *A, *B;
1112  if (match(II.getArgOperand(0),
1113  m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1114  match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1115  m_Specific(A), m_Specific(B))))
1116  return IC.replaceInstUsesWith(
1117  II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1118 
1119  return None;
1120 }
1121 
1122 static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC,
1123  IntrinsicInst &II) {
1124  Value *Mask = II.getOperand(0);
1125  Value *BasePtr = II.getOperand(1);
1126  Value *Index = II.getOperand(2);
1127  Type *Ty = II.getType();
1128  Value *PassThru = ConstantAggregateZero::get(Ty);
1129 
1130  // Contiguous gather => masked load.
1131  // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1132  // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1133  Value *IndexBase;
1134  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1135  m_Value(IndexBase), m_SpecificInt(1)))) {
1136  IRBuilder<> Builder(II.getContext());
1137  Builder.SetInsertPoint(&II);
1138 
1139  Align Alignment =
1140  BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1141 
1142  Type *VecPtrTy = PointerType::getUnqual(Ty);
1143  Value *Ptr = Builder.CreateGEP(
1144  cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase);
1145  Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1146  CallInst *MaskedLoad =
1147  Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1148  MaskedLoad->takeName(&II);
1149  return IC.replaceInstUsesWith(II, MaskedLoad);
1150  }
1151 
1152  return None;
1153 }
1154 
1155 static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC,
1156  IntrinsicInst &II) {
1157  Value *Val = II.getOperand(0);
1158  Value *Mask = II.getOperand(1);
1159  Value *BasePtr = II.getOperand(2);
1160  Value *Index = II.getOperand(3);
1161  Type *Ty = Val->getType();
1162 
1163  // Contiguous scatter => masked store.
1164  // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1165  // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1166  Value *IndexBase;
1167  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1168  m_Value(IndexBase), m_SpecificInt(1)))) {
1169  IRBuilder<> Builder(II.getContext());
1170  Builder.SetInsertPoint(&II);
1171 
1172  Align Alignment =
1173  BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1174 
1175  Value *Ptr = Builder.CreateGEP(
1176  cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase);
1177  Type *VecPtrTy = PointerType::getUnqual(Ty);
1178  Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1179 
1180  (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1181 
1182  return IC.eraseInstFromFunction(II);
1183  }
1184 
1185  return None;
1186 }
1187 
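// Example of the fold below: sdiv(pg, %x, splat(8)) becomes asrd(pg, %x, 3),
// and for a negative power of two such as splat(-8) the asrd is followed by a
// predicated neg.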
1188 static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1189  IntrinsicInst &II) {
1190  IRBuilder<> Builder(II.getContext());
1191  Builder.SetInsertPoint(&II);
1192  Type *Int32Ty = Builder.getInt32Ty();
1193  Value *Pred = II.getOperand(0);
1194  Value *Vec = II.getOperand(1);
1195  Value *DivVec = II.getOperand(2);
1196 
1197  Value *SplatValue = getSplatValue(DivVec);
1198  ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1199  if (!SplatConstantInt)
1200  return None;
1201  APInt Divisor = SplatConstantInt->getValue();
1202 
1203  if (Divisor.isPowerOf2()) {
1204  Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1205  auto ASRD = Builder.CreateIntrinsic(
1206  Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1207  return IC.replaceInstUsesWith(II, ASRD);
1208  }
1209  if (Divisor.isNegatedPowerOf2()) {
1210  Divisor.negate();
1211  Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1212  auto ASRD = Builder.CreateIntrinsic(
1213  Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1214  auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
1215  {ASRD->getType()}, {ASRD, Pred, ASRD});
1216  return IC.replaceInstUsesWith(II, NEG);
1217  }
1218 
1219  return None;
1220 }
1221 
1222 static Optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1223  IntrinsicInst &II) {
1224  Value *A = II.getArgOperand(0);
1225  Value *B = II.getArgOperand(1);
1226  if (A == B)
1227  return IC.replaceInstUsesWith(II, A);
1228 
1229  return None;
1230 }
1231 
1232 static Optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1233  IntrinsicInst &II) {
1234  IRBuilder<> Builder(&II);
1235  Value *Pred = II.getOperand(0);
1236  Value *Vec = II.getOperand(1);
1237  Value *Shift = II.getOperand(2);
1238 
1239  // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1240  Value *AbsPred, *MergedValue;
1241  if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
1242  m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
1243  !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
1244  m_Value(MergedValue), m_Value(AbsPred), m_Value())))
1245 
1246  return None;
1247 
1248  // Transform is valid if any of the following are true:
1249  // * The ABS merge value is an undef or non-negative
1250  // * The ABS predicate is all active
1251  // * The ABS predicate and the SRSHL predicates are the same
1252  if (!isa<UndefValue>(MergedValue) &&
1253  !match(MergedValue, m_NonNegative()) &&
1254  AbsPred != Pred && !isAllActivePredicate(AbsPred))
1255  return None;
1256 
1257  // Only valid when the shift amount is non-negative, otherwise the rounding
1258  // behaviour of SRSHL cannot be ignored.
1259  if (!match(Shift, m_NonNegative()))
1260  return None;
1261 
1262  auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()},
1263  {Pred, Vec, Shift});
1264 
1265  return IC.replaceInstUsesWith(II, LSL);
1266 }
1267 
1268 Optional<Instruction *>
1269 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1270  IntrinsicInst &II) const {
1271  Intrinsic::ID IID = II.getIntrinsicID();
1272  switch (IID) {
1273  default:
1274  break;
1275  case Intrinsic::aarch64_neon_fmaxnm:
1276  case Intrinsic::aarch64_neon_fminnm:
1277  return instCombineMaxMinNM(IC, II);
1278  case Intrinsic::aarch64_sve_convert_from_svbool:
1279  return instCombineConvertFromSVBool(IC, II);
1280  case Intrinsic::aarch64_sve_dup:
1281  return instCombineSVEDup(IC, II);
1282  case Intrinsic::aarch64_sve_dup_x:
1283  return instCombineSVEDupX(IC, II);
1284  case Intrinsic::aarch64_sve_cmpne:
1285  case Intrinsic::aarch64_sve_cmpne_wide:
1286  return instCombineSVECmpNE(IC, II);
1287  case Intrinsic::aarch64_sve_rdffr:
1288  return instCombineRDFFR(IC, II);
1289  case Intrinsic::aarch64_sve_lasta:
1290  case Intrinsic::aarch64_sve_lastb:
1291  return instCombineSVELast(IC, II);
1292  case Intrinsic::aarch64_sve_cntd:
1293  return instCombineSVECntElts(IC, II, 2);
1294  case Intrinsic::aarch64_sve_cntw:
1295  return instCombineSVECntElts(IC, II, 4);
1296  case Intrinsic::aarch64_sve_cnth:
1297  return instCombineSVECntElts(IC, II, 8);
1298  case Intrinsic::aarch64_sve_cntb:
1299  return instCombineSVECntElts(IC, II, 16);
1300  case Intrinsic::aarch64_sve_ptest_any:
1301  case Intrinsic::aarch64_sve_ptest_first:
1302  case Intrinsic::aarch64_sve_ptest_last:
1303  return instCombineSVEPTest(IC, II);
1304  case Intrinsic::aarch64_sve_mul:
1305  case Intrinsic::aarch64_sve_fmul:
1306  return instCombineSVEVectorMul(IC, II);
1307  case Intrinsic::aarch64_sve_fadd:
1308  return instCombineSVEVectorFAdd(IC, II);
1309  case Intrinsic::aarch64_sve_fsub:
1310  return instCombineSVEVectorBinOp(IC, II);
1311  case Intrinsic::aarch64_sve_tbl:
1312  return instCombineSVETBL(IC, II);
1313  case Intrinsic::aarch64_sve_uunpkhi:
1314  case Intrinsic::aarch64_sve_uunpklo:
1315  case Intrinsic::aarch64_sve_sunpkhi:
1316  case Intrinsic::aarch64_sve_sunpklo:
1317  return instCombineSVEUnpack(IC, II);
1318  case Intrinsic::aarch64_sve_tuple_get:
1319  return instCombineSVETupleGet(IC, II);
1320  case Intrinsic::aarch64_sve_zip1:
1321  case Intrinsic::aarch64_sve_zip2:
1322  return instCombineSVEZip(IC, II);
1323  case Intrinsic::aarch64_sve_ld1_gather_index:
1324  return instCombineLD1GatherIndex(IC, II);
1325  case Intrinsic::aarch64_sve_st1_scatter_index:
1326  return instCombineST1ScatterIndex(IC, II);
1327  case Intrinsic::aarch64_sve_ld1:
1328  return instCombineSVELD1(IC, II, DL);
1329  case Intrinsic::aarch64_sve_st1:
1330  return instCombineSVEST1(IC, II, DL);
1331  case Intrinsic::aarch64_sve_sdiv:
1332  return instCombineSVESDIV(IC, II);
1333  case Intrinsic::aarch64_sve_sel:
1334  return instCombineSVESel(IC, II);
1335  case Intrinsic::aarch64_sve_srshl:
1336  return instCombineSVESrshl(IC, II);
1337  }
1338 
1339  return None;
1340 }
1341 
1342 Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1343  InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
1344  APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
1345  std::function<void(Instruction *, unsigned, APInt, APInt &)>
1346  SimplifyAndSetOp) const {
1347  switch (II.getIntrinsicID()) {
1348  default:
1349  break;
1350  case Intrinsic::aarch64_neon_fcvtxn:
1351  case Intrinsic::aarch64_neon_rshrn:
1352  case Intrinsic::aarch64_neon_sqrshrn:
1353  case Intrinsic::aarch64_neon_sqrshrun:
1354  case Intrinsic::aarch64_neon_sqshrn:
1355  case Intrinsic::aarch64_neon_sqshrun:
1356  case Intrinsic::aarch64_neon_sqxtn:
1357  case Intrinsic::aarch64_neon_sqxtun:
1358  case Intrinsic::aarch64_neon_uqrshrn:
1359  case Intrinsic::aarch64_neon_uqshrn:
1360  case Intrinsic::aarch64_neon_uqxtn:
1361  SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
1362  break;
1363  }
1364 
1365  return None;
1366 }
1367 
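// Example of a widening pattern (illustrative): add(%a, zext <8 x i8> %b to
// <8 x i16>) can be selected as uaddw, so the zero-extend reported to the cost
// model is effectively free.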
1368 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
1369  ArrayRef<const Value *> Args) {
1370 
1371  // A helper that returns a vector type from the given type. The number of
1372  // elements in type Ty determines the vector width.
1373  auto toVectorTy = [&](Type *ArgTy) {
1374  return VectorType::get(ArgTy->getScalarType(),
1375  cast<VectorType>(DstTy)->getElementCount());
1376  };
1377 
1378  // Exit early if DstTy is not a vector type whose elements are at least
1379  // 16-bits wide.
1380  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
1381  return false;
1382 
1383  // Determine if the operation has a widening variant. We consider both the
1384  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
1385  // instructions.
1386  //
1387  // TODO: Add additional widening operations (e.g., shl, etc.) once we
1388  // verify that their extending operands are eliminated during code
1389  // generation.
1390  switch (Opcode) {
1391  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
1392  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
1393  case Instruction::Mul: // SMULL(2), UMULL(2)
1394  break;
1395  default:
1396  return false;
1397  }
1398 
1399  // To be a widening instruction (either the "wide" or "long" versions), the
1400  // second operand must be a sign- or zero extend.
1401  if (Args.size() != 2 ||
1402  (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])))
1403  return false;
1404  auto *Extend = cast<CastInst>(Args[1]);
1405  auto *Arg0 = dyn_cast<CastInst>(Args[0]);
1406 
1407  // A mul only has a mull version (not like addw). Both operands need to be
1408  // extending and the same type.
1409  if (Opcode == Instruction::Mul &&
1410  (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() ||
1411  Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType()))
1412  return false;
1413 
1414  // Legalize the destination type and ensure it can be used in a widening
1415  // operation.
1416  auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
1417  unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
1418  if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
1419  return false;
1420 
1421  // Legalize the source type and ensure it can be used in a widening
1422  // operation.
1423  auto *SrcTy = toVectorTy(Extend->getSrcTy());
1424  auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
1425  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
1426  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
1427  return false;
1428 
1429  // Get the total number of vector elements in the legalized types.
1430  InstructionCost NumDstEls =
1431  DstTyL.first * DstTyL.second.getVectorMinNumElements();
1432  InstructionCost NumSrcEls =
1433  SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
1434 
1435  // Return true if the legalized types have the same number of vector elements
1436  // and the destination element type size is twice that of the source type.
1437  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
1438 }
1439 
1440 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1441  Type *Src,
1442  TTI::CastContextHint CCH,
1443  TTI::TargetCostKind CostKind,
1444  const Instruction *I) {
1445  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1446  assert(ISD && "Invalid opcode");
1447 
1448  // If the cast is observable, and it is used by a widening instruction (e.g.,
1449  // uaddl, saddw, etc.), it may be free.
1450  if (I && I->hasOneUser()) {
1451  auto *SingleUser = cast<Instruction>(*I->user_begin());
1452  SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
1453  if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
1454  // If the cast is the second operand, it is free. We will generate either
1455  // a "wide" or "long" version of the widening instruction.
1456  if (I == SingleUser->getOperand(1))
1457  return 0;
1458  // If the cast is not the second operand, it will be free if it looks the
1459  // same as the second operand. In this case, we will generate a "long"
1460  // version of the widening instruction.
1461  if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
1462  if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
1463  cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
1464  return 0;
1465  }
1466  }
1467 
1468  // TODO: Allow non-throughput costs that aren't binary.
1469  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1470  if (CostKind != TTI::TCK_RecipThroughput)
1471  return Cost == 0 ? 0 : 1;
1472  return Cost;
1473  };
1474 
1475  EVT SrcTy = TLI->getValueType(DL, Src);
1476  EVT DstTy = TLI->getValueType(DL, Dst);
1477 
1478  if (!SrcTy.isSimple() || !DstTy.isSimple())
1479  return AdjustCost(
1480  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1481 
1482  static const TypeConversionCostTblEntry
1483  ConversionTbl[] = {
1488 
1489  // Truncations on nxvmiN
1506 
1507  // The number of shll instructions for the extension.
1524 
1525  // LowerVectorINT_TO_FP:
1532 
1533  // Complex: to v2f32
1540 
1541  // Complex: to v4f32
1546 
1547  // Complex: to v8f32
1552 
1553  // Complex: to v16f32
1556 
1557  // Complex: to v2f64
1564 
1565 
1566  // LowerVectorFP_TO_INT
1573 
1574  // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
1581 
1582  // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
1587 
1588  // Complex, from nxv2f32.
1597 
1598  // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
1605 
1606  // Complex, from nxv2f64.
1615 
1616  // Complex, from nxv4f32.
1625 
1626  // Complex, from nxv8f64. Illegal -> illegal conversions not required.
1631 
1632  // Complex, from nxv4f64. Illegal -> illegal conversions not required.
1639 
1640  // Complex, from nxv8f32. Illegal -> illegal conversions not required.
1645 
1646  // Complex, from nxv8f16.
1655 
1656  // Complex, from nxv4f16.
1665 
1666  // Complex, from nxv2f16.
1675 
1676  // Truncate from nxvmf32 to nxvmf16.
1680 
1681  // Truncate from nxvmf64 to nxvmf16.
1685 
1686  // Truncate from nxvmf64 to nxvmf32.
1690 
1691  // Extend from nxvmf16 to nxvmf32.
1695 
1696  // Extend from nxvmf16 to nxvmf64.
1700 
1701  // Extend from nxvmf32 to nxvmf64.
1705 
1706  // Bitcasts from float to integer
1710 
1711  // Bitcasts from integer to float
1715  };
1716 
1717  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
1718  DstTy.getSimpleVT(),
1719  SrcTy.getSimpleVT()))
1720  return AdjustCost(Entry->Cost);
1721 
1722  static const TypeConversionCostTblEntry FP16Tbl[] = {
1723  {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
1724  {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzu
1725  {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
1726  {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzu
1727  {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
1728  {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzu
1729  {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
1730  {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzu+xtn
1731  {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
1732  {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzu
1733  {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
1734  {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzu
1735  {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
1736  {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzu+xtn
1737  {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
1738  {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzu
1739  {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
1740  {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzu
1741  {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
1742  {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
1743  {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
1744  {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
1745  };
1746 
1747  if (ST->hasFullFP16())
1748  if (const auto *Entry = ConvertCostTableLookup(
1749  FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
1750  return AdjustCost(Entry->Cost);
1751 
1752  return AdjustCost(
1753  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1754 }
1755 
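// Example: zext(extractelement <4 x i16> %v, i32 1) to i32 costs only the
// extract, because umov already zero-extends the lane into a W register; the
// same extract extended to i64 is still charged for an explicit extend.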
1756 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
1757  Type *Dst,
1758  VectorType *VecTy,
1759  unsigned Index) {
1760 
1761  // Make sure we were given a valid extend opcode.
1762  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
1763  "Invalid opcode");
1764 
1765  // We are extending an element we extract from a vector, so the source type
1766  // of the extend is the element type of the vector.
1767  auto *Src = VecTy->getElementType();
1768 
1769  // Sign- and zero-extends are for integer types only.
1770  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
1771 
1772  // Get the cost for the extract. We compute the cost (if any) for the extend
1773  // below.
1774  InstructionCost Cost =
1775  getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
1776 
1777  // Legalize the types.
1778  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
1779  auto DstVT = TLI->getValueType(DL, Dst);
1780  auto SrcVT = TLI->getValueType(DL, Src);
1781  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1782 
1783  // If the resulting type is still a vector and the destination type is legal,
1784  // we may get the extension for free. If not, get the default cost for the
1785  // extend.
1786  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
1787  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1788  CostKind);
1789 
1790  // The destination type should be larger than the element type. If not, get
1791  // the default cost for the extend.
1792  if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
1793  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1794  CostKind);
1795 
1796  switch (Opcode) {
1797  default:
1798  llvm_unreachable("Opcode should be either SExt or ZExt");
1799 
1800  // For sign-extends, we only need a smov, which performs the extension
1801  // automatically.
1802  case Instruction::SExt:
1803  return Cost;
1804 
1805  // For zero-extends, the extend is performed automatically by a umov unless
1806  // the destination type is i64 and the element type is i8 or i16.
1807  case Instruction::ZExt:
1808  if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
1809  return Cost;
1810  }
1811 
1812  // If we are unable to perform the extend for free, get the default cost.
1813  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1814  CostKind);
1815 }
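// --- Illustrative sketch (not part of the original source) -------------------
// A standalone model of the free-extension rule above, under the assumption
// that ExtractCost and ExtendCost stand in for the corresponding TTI queries:
// a sign-extend of an extracted lane folds into smov, and a zero-extend folds
// into umov unless the destination is i64 and the lane is narrower than 32
// bits.
namespace sketch {
inline unsigned extractWithExtendCost(bool IsSignExt, unsigned DstBits,
                                      unsigned SrcBits, unsigned ExtractCost,
                                      unsigned ExtendCost) {
  if (IsSignExt)
    return ExtractCost;            // smov performs the extension for free
  if (DstBits != 64 || SrcBits == 32)
    return ExtractCost;            // umov performs the extension for free
  return ExtractCost + ExtendCost; // i8/i16 -> i64 needs an explicit zext
}
} // namespace sketch
// -----------------------------------------------------------------------------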
1816 
1817 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
1818  TTI::TargetCostKind CostKind,
1819  const Instruction *I) {
1820  if (CostKind != TTI::TCK_RecipThroughput)
1821  return Opcode == Instruction::PHI ? 0 : 1;
1822  assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
1823  // Branches are assumed to be predicted.
1824  return 0;
1825 }
1826 
1827 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1828  unsigned Index) {
1829  assert(Val->isVectorTy() && "This must be a vector type");
1830 
1831  if (Index != -1U) {
1832  // Legalize the type.
1833  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1834 
1835  // This type is legalized to a scalar type.
1836  if (!LT.second.isVector())
1837  return 0;
1838 
1839  // The type may be split. For fixed-width vectors we can normalize the
1840  // index to the new type.
1841  if (LT.second.isFixedLengthVector()) {
1842  unsigned Width = LT.second.getVectorNumElements();
1843  Index = Index % Width;
1844  }
1845 
1846  // The element at index zero is already inside the vector.
1847  if (Index == 0)
1848  return 0;
1849  }
1850 
1851  // All other insert/extracts cost this much.
1852  return ST->getVectorInsertExtractBaseCost();
1853 }
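// --- Illustrative sketch (not part of the original source) -------------------
// A standalone model of the insert/extract rule above: lane 0 of a legal
// vector is free, every other lane pays a flat per-subtarget cost, and for a
// split fixed-width vector the index is first reduced modulo the legalized
// width. BaseCost stands in for getVectorInsertExtractBaseCost(); a
// LegalizedWidth of 0 means "no normalization" (e.g. scalable vectors).
namespace sketch {
inline unsigned vectorInstrCost(unsigned Index, unsigned LegalizedWidth,
                                unsigned BaseCost) {
  unsigned Lane = LegalizedWidth ? Index % LegalizedWidth : Index;
  return Lane == 0 ? 0 : BaseCost; // element 0 is already in the vector
}
} // namespace sketch
// -----------------------------------------------------------------------------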
1854 
1855 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
1856  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1857  TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
1858  TTI::OperandValueProperties Opd1PropInfo,
1859  TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1860  const Instruction *CxtI) {
1861  // TODO: Handle more cost kinds.
1862  if (CostKind != TTI::TCK_RecipThroughput)
1863  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1864  Opd2Info, Opd1PropInfo,
1865  Opd2PropInfo, Args, CxtI);
1866 
1867  // Legalize the type.
1868  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1869  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1870 
1871  switch (ISD) {
1872  default:
1873  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1874  Opd2Info, Opd1PropInfo, Opd2PropInfo);
1875  case ISD::SDIV:
1876  if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
1877  Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
1878  // On AArch64, scalar signed division by a power-of-two constant is
1879  // normally expanded to the sequence ADD + CMP + SELECT + SRA.
1880  // The OperandValue properties may not be the same as those of the previous
1881  // operation; conservatively assume OP_None.
1882  InstructionCost Cost = getArithmeticInstrCost(
1883  Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1884  TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1885  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Opd1Info,
1886  Opd2Info, TargetTransformInfo::OP_None,
1887  TargetTransformInfo::OP_None);
1888  Cost += getArithmeticInstrCost(
1889  Instruction::Select, Ty, CostKind, Opd1Info, Opd2Info,
1890  TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1891  Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Opd1Info,
1892  Opd2Info, TargetTransformInfo::OP_None,
1893  TargetTransformInfo::OP_None);
1894  return Cost;
1895  }
1897  case ISD::UDIV: {
1898  if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
1899  auto VT = TLI->getValueType(DL, Ty);
1900  if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
1901  // Vector signed division by a constant is expanded to the
1902  // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
1903  // to MULHU + SUB + SRL + ADD + SRL.
1904  InstructionCost MulCost = getArithmeticInstrCost(
1905  Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
1906  TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1907  InstructionCost AddCost = getArithmeticInstrCost(
1908  Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1909  TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1910  InstructionCost ShrCost = getArithmeticInstrCost(
1911  Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
1912  TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1913  return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
1914  }
1915  }
1916 
1917  InstructionCost Cost = BaseT::getArithmeticInstrCost(
1918  Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
1919  if (Ty->isVectorTy()) {
1920  // On AArch64, vector divisions are not supported natively and are
1921  // expanded into scalar divisions of each pair of elements.
1922  Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
1923  Opd1Info, Opd2Info, Opd1PropInfo,
1924  Opd2PropInfo);
1925  Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
1926  Opd1Info, Opd2Info, Opd1PropInfo,
1927  Opd2PropInfo);
1928  // TODO: if one of the arguments is scalar, then it's not necessary to
1929  // double the cost of handling the vector elements.
1930  Cost += Cost;
1931  }
1932  return Cost;
1933  }
1934  case ISD::MUL:
1935  // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
1936  // as elements are extracted from the vectors and the muls scalarized.
1937  // As getScalarizationOverhead is a bit too pessimistic, we estimate the
1938  // cost for a i64 vector directly here, which is:
1939  // - four 2-cost i64 extracts,
1940  // - two 2-cost i64 inserts, and
1941  // - two 1-cost muls.
1942  // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
1943  // LT.first = 2 the cost is 28. If both operands are extensions it will not
1944  // need to scalarize so the cost can be cheaper (smull or umull).
1945  if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
1946  return LT.first;
1947  return LT.first * 14;
1948  case ISD::ADD:
1949  case ISD::XOR:
1950  case ISD::OR:
1951  case ISD::AND:
1952  case ISD::SRL:
1953  case ISD::SRA:
1954  case ISD::SHL:
1955  // These nodes are marked as 'custom' for combining purposes only.
1956  // We know that they are legal. See LowerAdd in ISelLowering.
1957  return LT.first;
1958 
1959  case ISD::FADD:
1960  case ISD::FSUB:
1961  case ISD::FMUL:
1962  case ISD::FDIV:
1963  case ISD::FNEG:
1964  // These nodes are marked as 'custom' just to lower them to SVE.
1965  // We know said lowering will incur no additional cost.
1966  if (!Ty->getScalarType()->isFP128Ty())
1967  return 2 * LT.first;
1968 
1969  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1970  Opd2Info, Opd1PropInfo, Opd2PropInfo);
1971  }
1972 }
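// --- Illustrative sketch (not part of the original source) -------------------
// The hard-coded factor of 14 for a scalarized v2i64 multiply above is just
// the sum of the per-element operations listed in the comment. The constants
// here are assumptions chosen to reproduce that arithmetic, not values taken
// from a scheduling model.
namespace sketch {
inline unsigned scalarizedV2I64MulCost() {
  const unsigned NumLanes = 2;
  const unsigned ExtractCost = 2, InsertCost = 2, MulCost = 1;
  // Two extracts per lane (one per operand), one insert and one mul per lane:
  // 4*2 + 2*2 + 2*1 = 14.
  return NumLanes * (2 * ExtractCost + InsertCost + MulCost);
}
} // namespace sketch
// -----------------------------------------------------------------------------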
1973 
1974 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
1975  ScalarEvolution *SE,
1976  const SCEV *Ptr) {
1977  // Address computations in vectorized code with non-consecutive addresses will
1978  // likely result in more instructions compared to scalar code where the
1979  // computation can more often be merged into the index mode. The resulting
1980  // extra micro-ops can significantly decrease throughput.
1981  unsigned NumVectorInstToHideOverhead = 10;
1982  int MaxMergeDistance = 64;
1983 
1984  if (Ty->isVectorTy() && SE &&
1985  !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1986  return NumVectorInstToHideOverhead;
1987 
1988  // In many cases the address computation is not merged into the instruction
1989  // addressing mode.
1990  return 1;
1991 }
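// --- Illustrative sketch (not part of the original source) -------------------
// A standalone model of the heuristic above: vector address computations whose
// stride is unknown or larger than MaxMergeDistance are charged a high cost so
// the vectorizer accounts for the extra address arithmetic; everything else is
// assumed to fold into the addressing mode for roughly one micro-op.
namespace sketch {
inline unsigned addressComputationCost(bool IsVectorTy,
                                       bool StrideKnownSmall) {
  const unsigned NumVectorInstToHideOverhead = 10;
  if (IsVectorTy && !StrideKnownSmall)
    return NumVectorInstToHideOverhead;
  return 1;
}
} // namespace sketch
// -----------------------------------------------------------------------------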
1992 
1993 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1994  Type *CondTy,
1995  CmpInst::Predicate VecPred,
1996  TTI::TargetCostKind CostKind,
1997  const Instruction *I) {
1998  // TODO: Handle other cost kinds.
1999  if (CostKind != TTI::TCK_RecipThroughput)
2000  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2001  I);
2002 
2003  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2004  // Vector selects that are wider than the register width are not always
2005  // lowered well.
2006  if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
2007  // We would need this many instructions to hide the scalarization happening.
2008  const int AmortizationCost = 20;
2009 
2010  // If VecPred is not set, check if we can get a predicate from the context
2011  // instruction, if its type matches the requested ValTy.
2012  if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
2013  CmpInst::Predicate CurrentPred;
2014  if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
2015  m_Value())))
2016  VecPred = CurrentPred;
2017  }
2018  // Check if we have a compare/select chain that can be lowered using
2019  // a (F)CMxx & BFI pair.
2020  if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
2021  VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
2022  VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
2023  VecPred == CmpInst::FCMP_UNE) {
2024  static const auto ValidMinMaxTys = {
2027  static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
2028 
2029  auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
2030  if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
2031  (ST->hasFullFP16() &&
2032  any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
2033  return LT.first;
2034  }
2035 
2036  static const TypeConversionCostTblEntry
2037  VectorSelectTbl[] = {
2039  { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
2041  { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
2042  { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
2043  { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
2044  };
2045 
2046  EVT SelCondTy = TLI->getValueType(DL, CondTy);
2047  EVT SelValTy = TLI->getValueType(DL, ValTy);
2048  if (SelCondTy.isSimple() && SelValTy.isSimple()) {
2049  if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
2050  SelCondTy.getSimpleVT(),
2051  SelValTy.getSimpleVT()))
2052  return Entry->Cost;
2053  }
2054  }
2055  // The base case handles scalable vectors fine for now, since it treats the
2056  // cost as 1 * legalization cost.
2057  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2058 }
2059 
2060 TTI::MemCmpExpansionOptions
2061 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
2062  TTI::MemCmpExpansionOptions Options;
2063  if (ST->requiresStrictAlign()) {
2064  // TODO: Add cost modeling for strict align. Misaligned loads expand to
2065  // a bunch of instructions when strict align is enabled.
2066  return Options;
2067  }
2068  Options.AllowOverlappingLoads = true;
2069  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
2070  Options.NumLoadsPerBlock = Options.MaxNumLoads;
2071  // TODO: Though vector loads usually perform well on AArch64, in some targets
2072  // they may wake up the FP unit, which raises the power consumption. Perhaps
2073  // they could be used with no holds barred (-O3).
2074  Options.LoadSizes = {8, 4, 2, 1};
2075  return Options;
2076 }
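// --- Illustrative sketch (not part of the original source) -------------------
// A rough standalone model of what the options above enable: with load sizes
// {8, 4, 2, 1} and overlapping loads allowed, a small memcmp can be covered by
// a handful of wide loads (e.g. 13 bytes -> two possibly overlapping 8-byte
// loads per operand). This is only a counting sketch, not the real expansion
// algorithm.
#include <cstdint>

namespace sketch {
inline unsigned approxLoadsPerOperand(uint64_t Size) {
  const uint64_t Widest = 8;
  if (Size == 0)
    return 0;
  if (Size <= Widest)
    return 1; // a single (possibly overlapping) load covers it
  return static_cast<unsigned>((Size + Widest - 1) / Widest); // ceil(Size / 8)
}
} // namespace sketch
// -----------------------------------------------------------------------------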
2077 
2078 InstructionCost
2079 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
2080  Align Alignment, unsigned AddressSpace,
2081  TTI::TargetCostKind CostKind) {
2082  if (useNeonVector(Src))
2083  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2084  CostKind);
2085  auto LT = TLI->getTypeLegalizationCost(DL, Src);
2086  if (!LT.first.isValid())
2087  return InstructionCost::getInvalid();
2088 
2089  // The code-generator is currently not able to handle scalable vectors
2090  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2091  // it. This change will be removed when code-generation for these types is
2092  // sufficiently reliable.
2093  if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
2094  return InstructionCost::getInvalid();
2095 
2096  return LT.first * 2;
2097 }
2098 
2099 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
2100  return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
2101 }
2102 
2103 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
2104  unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
2105  Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
2106  if (useNeonVector(DataTy))
2107  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
2108  Alignment, CostKind, I);
2109  auto *VT = cast<VectorType>(DataTy);
2110  auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
2111  if (!LT.first.isValid())
2112  return InstructionCost::getInvalid();
2113 
2114  // The code-generator is currently not able to handle scalable vectors
2115  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2116  // it. This change will be removed when code-generation for these types is
2117  // sufficiently reliable.
2118  if (cast<VectorType>(DataTy)->getElementCount() ==
2119  ElementCount::getScalable(1))
2120  return InstructionCost::getInvalid();
2121 
2122  ElementCount LegalVF = LT.second.getVectorElementCount();
2123  InstructionCost MemOpCost =
2124  getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
2125  // Add on an overhead cost for using gathers/scatters.
2126  // TODO: At the moment this is applied unilaterally for all CPUs, but at some
2127  // point we may want a per-CPU overhead.
2128  MemOpCost *= getSVEGatherScatterOverhead(Opcode);
2129  return LT.first * MemOpCost * getMaxNumElements(LegalVF);
2130 }
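// --- Illustrative sketch (not part of the original source) -------------------
// A standalone model of the gather/scatter formula above: each lane of the
// legalized vector is charged a scalar memory-op cost scaled by the SVE
// gather/scatter overhead (10 by default, tunable via -sve-gather-overhead and
// -sve-scatter-overhead), and the result is multiplied by the number of
// legalization steps.
namespace sketch {
inline unsigned gatherScatterCost(unsigned LegalizationSteps,
                                  unsigned ScalarMemOpCost, unsigned Overhead,
                                  unsigned MaxElementsPerVector) {
  return LegalizationSteps * (ScalarMemOpCost * Overhead) *
         MaxElementsPerVector;
}
} // namespace sketch
// -----------------------------------------------------------------------------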
2131 
2132 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
2133  return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
2134 }
2135 
2136 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
2137  MaybeAlign Alignment,
2138  unsigned AddressSpace,
2139  TTI::TargetCostKind CostKind,
2140  const Instruction *I) {
2141  EVT VT = TLI->getValueType(DL, Ty, true);
2142  // Type legalization can't handle structs
2143  if (VT == MVT::Other)
2144  return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
2145  CostKind);
2146 
2147  auto LT = TLI->getTypeLegalizationCost(DL, Ty);
2148  if (!LT.first.isValid())
2149  return InstructionCost::getInvalid();
2150 
2151  // The code-generator is currently not able to handle scalable vectors
2152  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2153  // it. This change will be removed when code-generation for these types is
2154  // sufficiently reliable.
2155  if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
2156  if (VTy->getElementCount() == ElementCount::getScalable(1))
2157  return InstructionCost::getInvalid();
2158 
2159  // TODO: consider latency as well for TCK_SizeAndLatency.
2160  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
2161  return LT.first;
2162 
2163  if (CostKind != TTI::TCK_RecipThroughput)
2164  return 1;
2165 
2166  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
2167  LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
2168  // Unaligned stores are extremely inefficient. We don't split all
2169  // unaligned 128-bit stores because of the negative impact that has been
2170  // seen in practice on inlined block copy code.
2171  // We make such stores expensive so that we will only vectorize if there
2172  // are 6 other instructions getting vectorized.
2173  const int AmortizationCost = 6;
2174 
2175  return LT.first * 2 * AmortizationCost;
2176  }
2177 
2178  // Check truncating stores and extending loads.
2179  if (useNeonVector(Ty) &&
2180  Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
2181  // v4i8 types are lowered to a scalar load/store and sshll/xtn.
2182  if (VT == MVT::v4i8)
2183  return 2;
2184  // Otherwise we need to scalarize.
2185  return cast<FixedVectorType>(Ty)->getNumElements() * 2;
2186  }
2187 
2188  return LT.first;
2189 }
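// --- Illustrative sketch (not part of the original source) -------------------
// A standalone model of two special cases above: on subtargets where
// misaligned 128-bit stores are slow, the store is charged 2 * 6 per
// legalization step so vectorization only wins when enough other work is
// vectorized alongside it; NEON truncating stores and extending loads are
// charged as scalarized (two ops per element), except v4i8, which lowers to a
// scalar load/store plus sshll/xtn.
namespace sketch {
inline unsigned slowMisaligned128StoreCost(unsigned LegalizationSteps) {
  const unsigned AmortizationCost = 6;
  return LegalizationSteps * 2 * AmortizationCost;
}
inline unsigned truncOrExtMemOpCost(bool IsV4I8, unsigned NumElements) {
  return IsV4I8 ? 2 : NumElements * 2;
}
} // namespace sketch
// -----------------------------------------------------------------------------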
2190 
2191 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
2192  unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
2193  Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
2194  bool UseMaskForCond, bool UseMaskForGaps) {
2195  assert(Factor >= 2 && "Invalid interleave factor");
2196  auto *VecVTy = cast<FixedVectorType>(VecTy);
2197 
2198  if (!UseMaskForCond && !UseMaskForGaps &&
2199  Factor <= TLI->getMaxSupportedInterleaveFactor()) {
2200  unsigned NumElts = VecVTy->getNumElements();
2201  auto *SubVecTy =
2202  FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
2203 
2204  // ldN/stN only support legal vector types of size 64 or 128 in bits.
2205  // Accesses having vector types that are a multiple of 128 bits can be
2206  // matched to more than one ldN/stN instruction.
2207  bool UseScalable;
2208  if (NumElts % Factor == 0 &&
2209  TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
2210  return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
2211  }
2212 
2213  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2214  Alignment, AddressSpace, CostKind,
2215  UseMaskForCond, UseMaskForGaps);
2216 }
2217 
2218 InstructionCost
2219 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
2220  InstructionCost Cost = 0;
2221  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2222  for (auto *I : Tys) {
2223  if (!I->isVectorTy())
2224  continue;
2225  if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
2226  128)
2227  Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
2228  getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
2229  }
2230  return Cost;
2231 }
2232 
2234  return ST->getMaxInterleaveFactor();
2235 }
2236 
2237 // For Falkor, we want to avoid having too many strided loads in a loop since
2238 // that can exhaust the HW prefetcher resources. We adjust the unroller
2239 // MaxCount preference below to attempt to ensure unrolling doesn't create too
2240 // many strided loads.
2241 static void
2242 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2243  TargetTransformInfo::UnrollingPreferences &UP) {
2244  enum { MaxStridedLoads = 7 };
2245  auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
2246  int StridedLoads = 0;
2247  // FIXME? We could make this more precise by looking at the CFG and
2248  // e.g. not counting loads in each side of an if-then-else diamond.
2249  for (const auto BB : L->blocks()) {
2250  for (auto &I : *BB) {
2251  LoadInst *LMemI = dyn_cast<LoadInst>(&I);
2252  if (!LMemI)
2253  continue;
2254 
2255  Value *PtrValue = LMemI->getPointerOperand();
2256  if (L->isLoopInvariant(PtrValue))
2257  continue;
2258 
2259  const SCEV *LSCEV = SE.getSCEV(PtrValue);
2260  const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
2261  if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
2262  continue;
2263 
2264  // FIXME? We could take pairing of unrolled load copies into account
2265  // by looking at the AddRec, but we would probably have to limit this
2266  // to loops with no stores or other memory optimization barriers.
2267  ++StridedLoads;
2268  // We've seen enough strided loads that seeing more won't make a
2269  // difference.
2270  if (StridedLoads > MaxStridedLoads / 2)
2271  return StridedLoads;
2272  }
2273  }
2274  return StridedLoads;
2275  };
2276 
2277  int StridedLoads = countStridedLoads(L, SE);
2278  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
2279  << " strided loads\n");
2280  // Pick the largest power of 2 unroll count that won't result in too many
2281  // strided loads.
2282  if (StridedLoads) {
2283  UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
2284  LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
2285  << UP.MaxCount << '\n');
2286  }
2287 }
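// --- Illustrative sketch (not part of the original source) -------------------
// A standalone model of the MaxCount computation above: the unroll count is
// the largest power of two that keeps the unrolled strided-load count within
// the MaxStridedLoads budget, e.g. 3 strided loads -> 7 / 3 = 2 -> MaxCount 2.
namespace sketch {
inline unsigned falkorMaxUnrollCount(unsigned StridedLoads) {
  const unsigned MaxStridedLoads = 7;
  if (StridedLoads == 0)
    return 0; // the real code leaves UP.MaxCount untouched in this case
  unsigned Budget = MaxStridedLoads / StridedLoads;
  unsigned Count = 1;
  while (Count * 2 <= Budget) // floor to a power of two, like 1 << Log2_32(x)
    Count *= 2;
  return Count;
}
} // namespace sketch
// -----------------------------------------------------------------------------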
2288 
2289 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2290  TTI::UnrollingPreferences &UP,
2291  OptimizationRemarkEmitter *ORE) {
2292  // Enable partial unrolling and runtime unrolling.
2293  BaseT::getUnrollingPreferences(L, SE, UP, ORE);
2294 
2295  UP.UpperBound = true;
2296 
2297  // An inner loop is more likely to be hot, and the runtime check can be
2298  // hoisted out by the LICM pass, so the overhead is smaller; try a larger
2299  // threshold to unroll more loops.
2300  if (L->getLoopDepth() > 1)
2301  UP.PartialThreshold *= 2;
2302 
2303  // Disable partial & runtime unrolling on -Os.
2304  UP.PartialOptSizeThreshold = 0;
2305 
2306  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
2307  EnableFalkorHWPFUnrollFix)
2308  getFalkorUnrollingPreferences(L, SE, UP);
2309 
2310  // Scan the loop: don't unroll loops with calls as this could prevent
2311  // inlining. Don't unroll vector loops either, as they don't benefit much from
2312  // unrolling.
2313  for (auto *BB : L->getBlocks()) {
2314  for (auto &I : *BB) {
2315  // Don't unroll vectorized loops.
2316  if (I.getType()->isVectorTy())
2317  return;
2318 
2319  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2320  if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2321  if (!isLoweredToCall(F))
2322  continue;
2323  }
2324  return;
2325  }
2326  }
2327  }
2328 
2329  // Enable runtime unrolling for in-order models.
2330  // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
2331  // checking for that case, we can ensure that the default behaviour is
2332  // unchanged.
2333  if (ST->getProcFamily() != AArch64Subtarget::Others &&
2334  !ST->getSchedModel().isOutOfOrder()) {
2335  UP.Runtime = true;
2336  UP.Partial = true;
2337  UP.UnrollRemainder = true;
2339 
2340  UP.UnrollAndJam = true;
2342  }
2343 }
2344 
2345 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2346  TTI::PeelingPreferences &PP) {
2347  BaseT::getPeelingPreferences(L, SE, PP);
2348 }
2349 
2350 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
2351  Type *ExpectedType) {
2352  switch (Inst->getIntrinsicID()) {
2353  default:
2354  return nullptr;
2355  case Intrinsic::aarch64_neon_st2:
2356  case Intrinsic::aarch64_neon_st3:
2357  case Intrinsic::aarch64_neon_st4: {
2358  // Create a struct type
2359  StructType *ST = dyn_cast<StructType>(ExpectedType);
2360  if (!ST)
2361  return nullptr;
2362  unsigned NumElts = Inst->arg_size() - 1;
2363  if (ST->getNumElements() != NumElts)
2364  return nullptr;
2365  for (unsigned i = 0, e = NumElts; i != e; ++i) {
2366  if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
2367  return nullptr;
2368  }
2369  Value *Res = UndefValue::get(ExpectedType);
2370  IRBuilder<> Builder(Inst);
2371  for (unsigned i = 0, e = NumElts; i != e; ++i) {
2372  Value *L = Inst->getArgOperand(i);
2373  Res = Builder.CreateInsertValue(Res, L, i);
2374  }
2375  return Res;
2376  }
2377  case Intrinsic::aarch64_neon_ld2:
2378  case Intrinsic::aarch64_neon_ld3:
2379  case Intrinsic::aarch64_neon_ld4:
2380  if (Inst->getType() == ExpectedType)
2381  return Inst;
2382  return nullptr;
2383  }
2384 }
2385 
2386 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2387  MemIntrinsicInfo &Info) {
2388  switch (Inst->getIntrinsicID()) {
2389  default:
2390  break;
2391  case Intrinsic::aarch64_neon_ld2:
2392  case Intrinsic::aarch64_neon_ld3:
2393  case Intrinsic::aarch64_neon_ld4:
2394  Info.ReadMem = true;
2395  Info.WriteMem = false;
2396  Info.PtrVal = Inst->getArgOperand(0);
2397  break;
2398  case Intrinsic::aarch64_neon_st2:
2399  case Intrinsic::aarch64_neon_st3:
2400  case Intrinsic::aarch64_neon_st4:
2401  Info.ReadMem = false;
2402  Info.WriteMem = true;
2403  Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
2404  break;
2405  }
2406 
2407  switch (Inst->getIntrinsicID()) {
2408  default:
2409  return false;
2410  case Intrinsic::aarch64_neon_ld2:
2411  case Intrinsic::aarch64_neon_st2:
2412  Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
2413  break;
2414  case Intrinsic::aarch64_neon_ld3:
2415  case Intrinsic::aarch64_neon_st3:
2416  Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
2417  break;
2418  case Intrinsic::aarch64_neon_ld4:
2419  case Intrinsic::aarch64_neon_st4:
2420  Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
2421  break;
2422  }
2423  return true;
2424 }
2425 
2426 /// See if \p I should be considered for address type promotion. We check if \p
2427 /// I is a sext with the right type and is used in memory accesses. If it is used in a
2428 /// "complex" getelementptr, we allow it to be promoted without finding other
2429 /// sext instructions that sign extended the same initial value. A getelementptr
2430 /// is considered as "complex" if it has more than 2 operands.
2431 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
2432  const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2433  bool Considerable = false;
2434  AllowPromotionWithoutCommonHeader = false;
2435  if (!isa<SExtInst>(&I))
2436  return false;
2437  Type *ConsideredSExtType =
2438  Type::getInt64Ty(I.getParent()->getParent()->getContext());
2439  if (I.getType() != ConsideredSExtType)
2440  return false;
2441  // See if the sext is the one with the right type and used in at least one
2442  // GetElementPtrInst.
2443  for (const User *U : I.users()) {
2444  if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2445  Considerable = true;
2446  // A getelementptr is considered "complex" if it has more than 2
2447  // operands. We will promote a SExt used in such a complex GEP, as we
2448  // expect some of the computation to be merged if it is done on 64 bits.
2449  if (GEPInst->getNumOperands() > 2) {
2450  AllowPromotionWithoutCommonHeader = true;
2451  break;
2452  }
2453  }
2454  }
2455  return Considerable;
2456 }
2457 
2458 bool AArch64TTIImpl::isLegalToVectorizeReduction(
2459  const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
2460  if (!VF.isScalable())
2461  return true;
2462 
2463  Type *Ty = RdxDesc.getRecurrenceType();
2464  if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
2465  return false;
2466 
2467  switch (RdxDesc.getRecurrenceKind()) {
2468  case RecurKind::Add:
2469  case RecurKind::FAdd:
2470  case RecurKind::And:
2471  case RecurKind::Or:
2472  case RecurKind::Xor:
2473  case RecurKind::SMin:
2474  case RecurKind::SMax:
2475  case RecurKind::UMin:
2476  case RecurKind::UMax:
2477  case RecurKind::FMin:
2478  case RecurKind::FMax:
2479  case RecurKind::SelectICmp:
2480  case RecurKind::SelectFCmp:
2481  case RecurKind::FMulAdd:
2482  return true;
2483  default:
2484  return false;
2485  }
2486 }
2487 
2488 InstructionCost
2489 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2490  bool IsUnsigned,
2491  TTI::TargetCostKind CostKind) {
2492  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
2493 
2494  if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
2495  return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
2496 
2497  assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
2498  "Both vectors need to be equally scalable");
2499 
2500  InstructionCost LegalizationCost = 0;
2501  if (LT.first > 1) {
2502  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2503  unsigned MinMaxOpcode =
2504  Ty->isFPOrFPVectorTy()
2506  : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
2507  IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
2508  LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
2509  }
2510 
2511  return LegalizationCost + /*Cost of horizontal reduction*/ 2;
2512 }
2513 
2514 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
2515  unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
2516  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2517  InstructionCost LegalizationCost = 0;
2518  if (LT.first > 1) {
2519  Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
2520  LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
2521  LegalizationCost *= LT.first - 1;
2522  }
2523 
2524  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2525  assert(ISD && "Invalid opcode");
2526  // Add the final reduction cost for the legal horizontal reduction
2527  switch (ISD) {
2528  case ISD::ADD:
2529  case ISD::AND:
2530  case ISD::OR:
2531  case ISD::XOR:
2532  case ISD::FADD:
2533  return LegalizationCost + 2;
2534  default:
2535  return InstructionCost::getInvalid();
2536  }
2537 }
2538 
2539 InstructionCost
2540 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
2541  Optional<FastMathFlags> FMF,
2542  TTI::TargetCostKind CostKind) {
2543  if (TTI::requiresOrderedReduction(FMF)) {
2544  if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
2545  InstructionCost BaseCost =
2546  BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2547  // Add on extra cost to reflect the extra overhead on some CPUs. We still
2548  // end up vectorizing for more computationally intensive loops.
2549  return BaseCost + FixedVTy->getNumElements();
2550  }
2551 
2552  if (Opcode != Instruction::FAdd)
2553  return InstructionCost::getInvalid();
2554 
2555  auto *VTy = cast<ScalableVectorType>(ValTy);
2556  InstructionCost Cost =
2557  getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
2558  Cost *= getMaxNumElements(VTy->getElementCount());
2559  return Cost;
2560  }
2561 
2562  if (isa<ScalableVectorType>(ValTy))
2563  return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
2564 
2565  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
2566  MVT MTy = LT.second;
2567  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2568  assert(ISD && "Invalid opcode");
2569 
2570  // Horizontal adds can use the 'addv' instruction. We model the cost of these
2571  // instructions as twice a normal vector add, plus 1 for each legalization
2572  // step (LT.first). This is the only arithmetic vector reduction operation for
2573  // which we have an instruction.
2574  // OR, XOR and AND costs should match the codegen from:
2575  // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
2576  // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
2577  // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
2578  static const CostTblEntry CostTblNoPairwise[]{
2579  {ISD::ADD, MVT::v8i8, 2},
2580  {ISD::ADD, MVT::v16i8, 2},
2581  {ISD::ADD, MVT::v4i16, 2},
2582  {ISD::ADD, MVT::v8i16, 2},
2583  {ISD::ADD, MVT::v4i32, 2},
2584  {ISD::OR, MVT::v8i8, 15},
2585  {ISD::OR, MVT::v16i8, 17},
2586  {ISD::OR, MVT::v4i16, 7},
2587  {ISD::OR, MVT::v8i16, 9},
2588  {ISD::OR, MVT::v2i32, 3},
2589  {ISD::OR, MVT::v4i32, 5},
2590  {ISD::OR, MVT::v2i64, 3},
2591  {ISD::XOR, MVT::v8i8, 15},
2592  {ISD::XOR, MVT::v16i8, 17},
2593  {ISD::XOR, MVT::v4i16, 7},
2594  {ISD::XOR, MVT::v8i16, 9},
2595  {ISD::XOR, MVT::v2i32, 3},
2596  {ISD::XOR, MVT::v4i32, 5},
2597  {ISD::XOR, MVT::v2i64, 3},
2598  {ISD::AND, MVT::v8i8, 15},
2599  {ISD::AND, MVT::v16i8, 17},
2600  {ISD::AND, MVT::v4i16, 7},
2601  {ISD::AND, MVT::v8i16, 9},
2602  {ISD::AND, MVT::v2i32, 3},
2603  {ISD::AND, MVT::v4i32, 5},
2604  {ISD::AND, MVT::v2i64, 3},
2605  };
2606  switch (ISD) {
2607  default:
2608  break;
2609  case ISD::ADD:
2610  if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
2611  return (LT.first - 1) + Entry->Cost;
2612  break;
2613  case ISD::XOR:
2614  case ISD::AND:
2615  case ISD::OR:
2616  const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
2617  if (!Entry)
2618  break;
2619  auto *ValVTy = cast<FixedVectorType>(ValTy);
2620  if (!ValVTy->getElementType()->isIntegerTy(1) &&
2621  MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
2622  isPowerOf2_32(ValVTy->getNumElements())) {
2623  InstructionCost ExtraCost = 0;
2624  if (LT.first != 1) {
2625  // Type needs to be split, so there is an extra cost of LT.first - 1
2626  // arithmetic ops.
2627  auto *Ty = FixedVectorType::get(ValTy->getElementType(),
2628  MTy.getVectorNumElements());
2629  ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
2630  ExtraCost *= LT.first - 1;
2631  }
2632  return Entry->Cost + ExtraCost;
2633  }
2634  break;
2635  }
2636  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2637 }
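// --- Illustrative sketch (not part of the original source) -------------------
// A standalone model of the integer reduction costing above: a legal reduction
// costs its flat table value (e.g. 2 for the listed ADD types), plus one extra
// step for each additional legalization split that has to be combined first.
namespace sketch {
inline unsigned addReductionCost(unsigned LegalizationSteps,
                                 unsigned TableCost) {
  return (LegalizationSteps - 1) + TableCost; // e.g. v4i32 add: (1-1) + 2 = 2
}
} // namespace sketch
// -----------------------------------------------------------------------------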
2638 
2639 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
2640  static const CostTblEntry ShuffleTbl[] = {
2641  { TTI::SK_Splice, MVT::nxv16i8, 1 },
2642  { TTI::SK_Splice, MVT::nxv8i16, 1 },
2643  { TTI::SK_Splice, MVT::nxv4i32, 1 },
2644  { TTI::SK_Splice, MVT::nxv2i64, 1 },
2645  { TTI::SK_Splice, MVT::nxv2f16, 1 },
2646  { TTI::SK_Splice, MVT::nxv4f16, 1 },
2647  { TTI::SK_Splice, MVT::nxv8f16, 1 },
2648  { TTI::SK_Splice, MVT::nxv2bf16, 1 },
2649  { TTI::SK_Splice, MVT::nxv4bf16, 1 },
2650  { TTI::SK_Splice, MVT::nxv8bf16, 1 },
2651  { TTI::SK_Splice, MVT::nxv2f32, 1 },
2652  { TTI::SK_Splice, MVT::nxv4f32, 1 },
2653  { TTI::SK_Splice, MVT::nxv2f64, 1 },
2654  };
2655 
2656  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2657  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
2659  EVT PromotedVT = LT.second.getScalarType() == MVT::i1
2660  ? TLI->getPromotedVTForPredicate(EVT(LT.second))
2661  : LT.second;
2662  Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
2663  InstructionCost LegalizationCost = 0;
2664  if (Index < 0) {
2665  LegalizationCost =
2666  getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
2668  getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
2670  }
2671 
2672  // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
2673  // The cost is computed on the promoted type.
2674  if (LT.second.getScalarType() == MVT::i1) {
2675  LegalizationCost +=
2676  getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
2678  getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
2680  }
2681  const auto *Entry =
2682  CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
2683  assert(Entry && "Illegal Type for Splice");
2684  LegalizationCost += Entry->Cost;
2685  return LegalizationCost * LT.first;
2686 }
2687 
2688 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
2689  VectorType *Tp,
2690  ArrayRef<int> Mask, int Index,
2691  VectorType *SubTp,
2692  ArrayRef<const Value *> Args) {
2693  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2694  // If we have a Mask, and the LT is being legalized somehow, split the Mask
2695  // into smaller vectors and sum the cost of each shuffle.
2696  if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
2697  Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
2698  cast<FixedVectorType>(Tp)->getNumElements() >
2699  LT.second.getVectorNumElements() &&
2700  !Index && !SubTp) {
2701  unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
2702  assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
2703  unsigned LTNumElts = LT.second.getVectorNumElements();
2704  unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
2705  VectorType *NTp =
2706  VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
2707  InstructionCost Cost;
2708  for (unsigned N = 0; N < NumVecs; N++) {
2709  SmallVector<int> NMask;
2710  // Split the existing mask into chunks of size LTNumElts. Track the source
2711  // sub-vectors to ensure the result has at most 2 inputs.
2712  unsigned Source1, Source2;
2713  unsigned NumSources = 0;
2714  for (unsigned E = 0; E < LTNumElts; E++) {
2715  int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
2716  : UndefMaskElem;
2717  if (MaskElt < 0) {
2718  NMask.push_back(UndefMaskElem);
2719  continue;
2720  }
2721 
2722  // Calculate which source from the input this comes from and whether it
2723  // is new to us.
2724  unsigned Source = MaskElt / LTNumElts;
2725  if (NumSources == 0) {
2726  Source1 = Source;
2727  NumSources = 1;
2728  } else if (NumSources == 1 && Source != Source1) {
2729  Source2 = Source;
2730  NumSources = 2;
2731  } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
2732  NumSources++;
2733  }
2734 
2735  // Add to the new mask. For the NumSources>2 case these are not correct,
2736  // but are only used for the modular lane number.
2737  if (Source == Source1)
2738  NMask.push_back(MaskElt % LTNumElts);
2739  else if (Source == Source2)
2740  NMask.push_back(MaskElt % LTNumElts + LTNumElts);
2741  else
2742  NMask.push_back(MaskElt % LTNumElts);
2743  }
2744  // If the sub-mask has at most 2 input sub-vectors then re-cost it using
2745  // getShuffleCost. If not then cost it using the worst case.
2746  if (NumSources <= 2)
2747  Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
2749  NTp, NMask, 0, nullptr, Args);
2750  else if (any_of(enumerate(NMask), [&](const auto &ME) {
2751  return ME.value() % LTNumElts == ME.index();
2752  }))
2753  Cost += LTNumElts - 1;
2754  else
2755  Cost += LTNumElts;
2756  }
2757  return Cost;
2758  }
2759 
2760  Kind = improveShuffleKindFromMask(Kind, Mask);
2761 
2762  // Check for broadcast loads.
2763  if (Kind == TTI::SK_Broadcast) {
2764  bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
2765  if (IsLoad && LT.second.isVector() &&
2766  isLegalBroadcastLoad(Tp->getElementType(),
2767  LT.second.getVectorElementCount()))
2768  return 0; // broadcast is handled by ld1r
2769  }
2770 
2771  // If we have 4 elements for the shuffle and a Mask, get the cost straight
2772  // from the perfect shuffle tables.
2773  if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
2774  (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
2775  all_of(Mask, [](int E) { return E < 8; }))
2776  return getPerfectShuffleCost(Mask);
2777 
2778  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
2779  Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
2780  Kind == TTI::SK_Reverse) {
2781 
2782  static const CostTblEntry ShuffleTbl[] = {
2783  // Broadcast shuffle kinds can be performed with 'dup'.
2784  { TTI::SK_Broadcast, MVT::v8i8, 1 },
2785  { TTI::SK_Broadcast, MVT::v16i8, 1 },
2786  { TTI::SK_Broadcast, MVT::v4i16, 1 },
2787  { TTI::SK_Broadcast, MVT::v8i16, 1 },
2788  { TTI::SK_Broadcast, MVT::v2i32, 1 },
2789  { TTI::SK_Broadcast, MVT::v4i32, 1 },
2790  { TTI::SK_Broadcast, MVT::v2i64, 1 },
2791  { TTI::SK_Broadcast, MVT::v2f32, 1 },
2792  { TTI::SK_Broadcast, MVT::v4f32, 1 },
2793  { TTI::SK_Broadcast, MVT::v2f64, 1 },
2794  // Transpose shuffle kinds can be performed with 'trn1/trn2' and
2795  // 'zip1/zip2' instructions.
2796  { TTI::SK_Transpose, MVT::v8i8, 1 },
2797  { TTI::SK_Transpose, MVT::v16i8, 1 },
2798  { TTI::SK_Transpose, MVT::v4i16, 1 },
2799  { TTI::SK_Transpose, MVT::v8i16, 1 },
2800  { TTI::SK_Transpose, MVT::v2i32, 1 },
2801  { TTI::SK_Transpose, MVT::v4i32, 1 },
2802  { TTI::SK_Transpose, MVT::v2i64, 1 },
2803  { TTI::SK_Transpose, MVT::v2f32, 1 },
2804  { TTI::SK_Transpose, MVT::v4f32, 1 },
2805  { TTI::SK_Transpose, MVT::v2f64, 1 },
2806  // Select shuffle kinds.
2807  // TODO: handle vXi8/vXi16.
2808  { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
2809  { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
2810  { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
2811  { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
2812  { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
2813  { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
2814  // PermuteSingleSrc shuffle kinds.
2815  { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
2816  { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
2817  { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
2818  { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
2819  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
2820  { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
2821  { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
2822  { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
2823  { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
2824  { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
2825  { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
2826  { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
2827  { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
2828  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
2829  // Reverse can be lowered with `rev`.
2830  { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
2831  { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
2832  { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
2833  { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
2834  { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
2835  { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
2836  { TTI::SK_Reverse, MVT::v8f16, 2 }, // REV64; EXT
2837  { TTI::SK_Reverse, MVT::v8i16, 2 }, // REV64; EXT
2838  { TTI::SK_Reverse, MVT::v16i8, 2 }, // REV64; EXT
2839  { TTI::SK_Reverse, MVT::v4f16, 1 }, // REV64
2840  { TTI::SK_Reverse, MVT::v4i16, 1 }, // REV64
2841  { TTI::SK_Reverse, MVT::v8i8, 1 }, // REV64
2842  // Broadcast shuffle kinds for scalable vectors
2860  // Handle the cases for vector.reverse with scalable vectors
2861  { TTI::SK_Reverse, MVT::nxv16i8, 1 },
2862  { TTI::SK_Reverse, MVT::nxv8i16, 1 },
2863  { TTI::SK_Reverse, MVT::nxv4i32, 1 },
2864  { TTI::SK_Reverse, MVT::nxv2i64, 1 },
2865  { TTI::SK_Reverse, MVT::nxv2f16, 1 },
2866  { TTI::SK_Reverse, MVT::nxv4f16, 1 },
2867  { TTI::SK_Reverse, MVT::nxv8f16, 1 },
2871  { TTI::SK_Reverse, MVT::nxv2f32, 1 },
2872  { TTI::SK_Reverse, MVT::nxv4f32, 1 },
2873  { TTI::SK_Reverse, MVT::nxv2f64, 1 },
2874  { TTI::SK_Reverse, MVT::nxv16i1, 1 },
2875  { TTI::SK_Reverse, MVT::nxv8i1, 1 },
2876  { TTI::SK_Reverse, MVT::nxv4i1, 1 },
2877  { TTI::SK_Reverse, MVT::nxv2i1, 1 },
2878  };
2879  if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
2880  return LT.first * Entry->Cost;
2881  }
2882 
2883  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
2884  return getSpliceCost(Tp, Index);
2885 
2886  // Inserting a subvector can often be done with either a D, S or H register
2887  // move, so long as the inserted vector is "aligned".
2888  if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
2889  LT.second.getSizeInBits() <= 128 && SubTp) {
2890  std::pair<InstructionCost, MVT> SubLT =
2891  TLI->getTypeLegalizationCost(DL, SubTp);
2892  if (SubLT.second.isVector()) {
2893  int NumElts = LT.second.getVectorNumElements();
2894  int NumSubElts = SubLT.second.getVectorNumElements();
2895  if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
2896  return SubLT.first;
2897  }
2898  }
2899 
2900  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
2901 }
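// --- Illustrative sketch (not part of the original source) -------------------
// A standalone model of the mask-splitting step above: a wide fixed-width
// shuffle mask is cut into chunks of the legalized width and, for each chunk,
// the number of distinct source sub-vectors decides whether it can be
// re-costed as a single- or two-input shuffle or has to be treated as a worst
// case. This reproduces only the source counting, not the re-costing.
#include <set>
#include <vector>

namespace sketch {
inline unsigned numSourcesPerChunk(const std::vector<int> &Mask,
                                   unsigned ChunkBegin, unsigned ChunkWidth) {
  std::set<unsigned> Sources;
  for (unsigned E = 0; E != ChunkWidth; ++E) {
    unsigned Idx = ChunkBegin + E;
    if (Idx >= Mask.size() || Mask[Idx] < 0)
      continue; // out-of-range or undef lanes pick no source
    Sources.insert(static_cast<unsigned>(Mask[Idx]) / ChunkWidth);
  }
  return static_cast<unsigned>(Sources.size());
}
} // namespace sketch
// -----------------------------------------------------------------------------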
i
i
Definition: README.txt:29
llvm::AArch64TTIImpl::getIntImmCostIntrin
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:166
llvm::MVT::nxv4i1
@ nxv4i1
Definition: MachineValueType.h:190
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::TargetTransformInfo::PSK_FastHardware
@ PSK_FastHardware
Definition: TargetTransformInfo.h:594
Int32Ty
IntegerType * Int32Ty
Definition: NVVMIntrRange.cpp:67
llvm::AArch64TTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: AArch64TargetTransformInfo.cpp:2289
llvm::MVT::nxv4i64
@ nxv4i64
Definition: MachineValueType.h:220
llvm::TargetTransformInfo::UnrollingPreferences::PartialOptSizeThreshold
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
Definition: TargetTransformInfo.h:459
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition: TargetTransformInfo.h:874
Attrs
Function Attrs
Definition: README_ALTIVEC.txt:215
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:487
llvm::MVT::nxv2i1
@ nxv2i1
Definition: MachineValueType.h:189
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:210
llvm::Loop::isLoopInvariant
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
llvm::MVT::v4f16
@ v4f16
Definition: MachineValueType.h:136
llvm::PatternMatch::m_NonNegative
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
Definition: PatternMatch.h:479
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:455
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::ConvertCostTableLookup
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType >> Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
llvm::Type::getInt1Ty
static IntegerType * getInt1Ty(LLVMContext &C)
Definition: Type.cpp:236
llvm::CostTblEntryT
Cost Table Entry.
Definition: CostTable.h:25
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:65
getFalkorUnrollingPreferences
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
Definition: AArch64TargetTransformInfo.cpp:2242
llvm::Value::getPointerAlignment
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:915
llvm::MVT::nxv2f64
@ nxv2f64
Definition: MachineValueType.h:247
llvm::AArch64TTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: AArch64TargetTransformInfo.cpp:1827
llvm::RecurKind::Or
@ Or
Bitwise or logical OR of integers.
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:667
InstCombiner.h
instCombineSVEPTest
static Optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:834
llvm::AArch64_AM::LSL
@ LSL
Definition: AArch64AddressingModes.h:35
llvm::ISD::BITCAST
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:886
llvm::RecurKind::FMul
@ FMul
Product of floats.
Insert
Vector Rotate Left Mask Mask Insert
Definition: README_P9.txt:112
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:719
instCombineSVEST1
static Optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
Definition: AArch64TargetTransformInfo.cpp:929
llvm::SCEVAddRecExpr::isAffine
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
Definition: ScalarEvolutionExpressions.h:370
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:471
llvm::AArch64TTIImpl::getIntImmCost
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
Definition: AArch64TargetTransformInfo.cpp:57
IntrinsicInst.h
llvm::ElementCount
Definition: TypeSize.h:390
llvm::AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic
Optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
Definition: AArch64TargetTransformInfo.cpp:1342
llvm::AArch64TTIImpl::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:2136
llvm::Function
Definition: Function.h:60
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
instCombineSVECmpNE
static Optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:616
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
llvm::BinaryOperator::CreateWithCopiedFlags
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Instruction *CopyO, const Twine &Name="", Instruction *InsertBefore=nullptr)
Definition: InstrTypes.h:249
llvm::ISD::UDIV
@ UDIV
Definition: ISDOpcodes.h:243
llvm::TargetTransformInfo::PopcntSupportKind
PopcntSupportKind
Flags indicating the kind of support for population count.
Definition: TargetTransformInfo.h:594
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:53
llvm::MVT::nxv2f32
@ nxv2f32
Definition: MachineValueType.h:241
llvm::APInt::isPowerOf2
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:425
llvm::PatternMatch::m_FPOne
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:826
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:309
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:133
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
llvm::MVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition: MachineValueType.h:366
llvm::enumerate
detail::enumerator< R > enumerate(R &&TheRange)
Given an input range, returns a new range whose values are are pair (A,B) such that A is the 0-based ...
Definition: STLExtras.h:2047
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1490
llvm::getSplatValue
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
Definition: VectorUtils.cpp:371
llvm::IRBuilder<>
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:149
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJamInnerLoopThreshold
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
Definition: TargetTransformInfo.h:506
llvm::MVT::nxv2i64
@ nxv2i64
Definition: MachineValueType.h:219
llvm::TargetTransformInfo::UnrollingPreferences::UnrollRemainder
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
Definition: TargetTransformInfo.h:499
llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:819
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:179
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:213
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
llvm::InsertElementInst::Create
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Definition: Instructions.h:1950
Shift
bool Shift
Definition: README.txt:468
llvm::ExtractElementInst::Create
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Definition: Instructions.h:1885
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:483
llvm::MVT::nxv4f16
@ nxv4f16
Definition: MachineValueType.h:230
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1423
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:537
getPerfectShuffleCost
static unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
Definition: AArch64PerfectShuffle.h:6589
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
getSVEGatherScatterOverhead
static unsigned getSVEGatherScatterOverhead(unsigned Opcode)
Definition: AArch64TargetTransformInfo.cpp:2099
llvm::Optional
Definition: APInt.h:33
llvm::RecurKind::SelectFCmp
@ SelectFCmp
Integer select(fcmp(),x,y) where one of (x,y) is loop invariant.
llvm::PatternMatch::m_BinOp
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:84
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::MVT::nxv4f64
@ nxv4f64
Definition: MachineValueType.h:248
llvm::TargetTransformInfo::OP_PowerOf2
@ OP_PowerOf2
Definition: TargetTransformInfo.h:898
instCombineSVEVectorBinOp
static Optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:964
llvm::VectorType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:422
llvm::Instruction::copyMetadata
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Definition: Instruction.cpp:841
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:119
SVEScatterOverhead
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
RHS
Value * RHS
Definition: X86PartialReduction.cpp:76
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:268
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
tryCombineFromSVBoolBinOp
static Optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:472
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:21
llvm::RecurKind::SMin
@ SMin
Signed integer min implemented in terms of select(cmp()).
llvm::MVT::v2f64
@ v2f64
Definition: MachineValueType.h:172
llvm::CmpInst::FCMP_OGT
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:723
llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition: TargetTransformInfo.h:882
llvm::MVT::nxv8i16
@ nxv8i16
Definition: MachineValueType.h:207
llvm::LinearPolySize::isScalable
bool isScalable() const
Returns whether the size is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:298
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:239
llvm::APIntOps::umin
const APInt & umin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be unsigned.
Definition: APInt.h:2149
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:872
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::AArch64TTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: AArch64TargetTransformInfo.cpp:40
instCombineRDFFR
static Optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:794
llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(Optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1206
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:129
llvm::UndefMaskElem
constexpr int UndefMaskElem
Definition: Instructions.h:1996
llvm::MVT::nxv8bf16
@ nxv8bf16
Definition: MachineValueType.h:238
llvm::AArch64TTIImpl::getExtractWithExtendCost
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
Definition: AArch64TargetTransformInfo.cpp:1756
llvm::PatternMatch::m_OneUse
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::MVT::v4bf16
@ v4bf16
Definition: MachineValueType.h:147
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:186
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
instCombineMaxMinNM
static Optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1222
LHS
Value * LHS
Definition: X86PartialReduction.cpp:75
TargetLowering.h
llvm::IntrinsicCostAttributes::getArgTypes
const SmallVectorImpl< Type * > & getArgTypes() const
Definition: TargetTransformInfo.h:153
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::InstCombiner::replaceOperand
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:438
llvm::MVT::i1
@ i1
Definition: MachineValueType.h:43
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1607
llvm::TargetTransformInfo::SK_PermuteTwoSrc
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
Definition: TargetTransformInfo.h:880
llvm::MVT::v8f16
@ v8f16
Definition: MachineValueType.h:137
llvm::isSplatValue
bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
Definition: VectorUtils.cpp:386
llvm::SISrcMods::NEG
@ NEG
Definition: SIDefines.h:213
llvm::MVT::nxv2bf16
@ nxv2bf16
Definition: MachineValueType.h:236
llvm::RecurKind::And
@ And
Bitwise or logical AND of integers.
instCombineConvertFromSVBool
static Optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:524
llvm::TypeConversionCostTblEntryT
Type Conversion Cost Table.
Definition: CostTable.h:55
llvm::PatternMatch::m_Select
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
Definition: PatternMatch.h:1472
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:713
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:763
llvm::TargetTransformInfo::OP_None
@ OP_None
Definition: TargetTransformInfo.h:898
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:871
llvm::AArch64_AM::isLogicalImmediate
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
Definition: AArch64AddressingModes.h:276
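A small illustrative wrapper (hypothetical name; the include path is the one used inside the AArch64 backend) showing how encodability of a 64-bit logical immediate can be queried:
#include "MCTargetDesc/AArch64AddressingModes.h"
#include <cstdint>
// Hedged sketch: repeating bit patterns such as 0x5555555555555555 are
// encodable as AArch64 bitmask (logical) immediates, whereas 0 and all-ones
// are not.
static bool fitsLogicalImm64(uint64_t Imm) {
  return llvm::AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64);
}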
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
SVEGatherOverhead
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition: TargetTransformInfo.h:1091
llvm::User
Definition: User.h:44
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:34
Intrinsics.h
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:56
llvm::MVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: MachineValueType.h:1066
llvm::AArch64TTIImpl::getCostOfKeepingLiveOverCall
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
Definition: AArch64TargetTransformInfo.cpp:2219
llvm::RecurrenceDescriptor::getRecurrenceType
Type * getRecurrenceType() const
Returns the type of the recurrence.
Definition: IVDescriptors.h:245
instCombineSVEZip
static Optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1107
llvm::AArch64Subtarget::Others
@ Others
Definition: AArch64Subtarget.h:41
llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:769
llvm::ISD::SRA
@ SRA
Definition: ISDOpcodes.h:692
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:227
llvm::MVT::v16i1
@ v16i1
Definition: MachineValueType.h:68
llvm::MVT::nxv2f16
@ nxv2f16
Definition: MachineValueType.h:229
llvm::VectorType::getElementCount
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:627
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
llvm::Log2_32
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:623
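A tiny worked example (illustrative function name) tying isPowerOf2_32 and Log2_32 together:
#include "llvm/Support/MathExtras.h"
#include <cassert>
// 64 is a power of two and its floor log base 2 is 6.
static void mathExtrasExample() {
  assert(llvm::isPowerOf2_32(64u));
  assert(llvm::Log2_32(64u) == 6u);
}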
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::MVT::v8i1
@ v8i1
Definition: MachineValueType.h:67
llvm::TargetTransformInfo::UnrollingPreferences::UnrollAndJam
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
Definition: TargetTransformInfo.h:501
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::AArch64TTIImpl::getIntImmCostInst
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
Definition: AArch64TargetTransformInfo.cpp:97
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::RecurrenceDescriptor::getRecurrenceKind
RecurKind getRecurrenceKind() const
Definition: IVDescriptors.h:195
llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:773
llvm::InstCombiner::eraseInstFromFunction
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
llvm::AArch64TTIImpl::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:1440
Options
LLVMPassBuilderOptionsRef Options
Definition: PassBuilderBindings.cpp:48
llvm::MVT::v16f16
@ v16f16
Definition: MachineValueType.h:138
llvm::AArch64TTIImpl::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:2489
llvm::MVT::nxv4i8
@ nxv4i8
Definition: MachineValueType.h:198
llvm::MVT::nxv4f32
@ nxv4f32
Definition: MachineValueType.h:242
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1769
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:919
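A hedged sketch (helper name is illustrative) of the splat behavior described above when a vector type is passed:
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
// With a vector type, ConstantInt::get returns a splat constant, here
// <4 x i32> <i32 7, i32 7, i32 7, i32 7>.
static llvm::Constant *makeSplatOfSeven(llvm::LLVMContext &Ctx) {
  auto *VTy = llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
  return llvm::ConstantInt::get(VTy, 7);
}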
llvm::CmpInst::FCMP_OEQ
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:722
instCombineSVEVectorMul
static Optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:987
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
llvm::CmpInst::FCMP_OLT
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:725
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:666
Align
uint64_t Align
Definition: ELFObjHandler.cpp:81
PatternMatch.h
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
llvm::MVT::v1i64
@ v1i64
Definition: MachineValueType.h:117
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::MCID::Call
@ Call
Definition: MCInstrDesc.h:155
llvm::AddressSpace
AddressSpace
Definition: NVPTXBaseInfo.h:21
IVDescriptors.h
llvm::None
const NoneType None
Definition: None.h:24
llvm::LinearPolySize< ElementCount >::getFixed
static ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:283
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:91
llvm::EVT::getTypeForEVT
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:182
llvm::lltok::Kind
Kind
Definition: LLToken.h:18
llvm::MVT::v4i8
@ v4i8
Definition: MachineValueType.h:78
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:117
llvm::MVT::nxv4i16
@ nxv4i16
Definition: MachineValueType.h:206
instCombineSVETBL
static Optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1056
llvm::PatternMatch::m_One
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:513
llvm::RecurKind::UMin
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
llvm::TargetTransformInfo::PSK_Software
@ PSK_Software
Definition: TargetTransformInfo.h:594
llvm::maxnum
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1306
LoopInfo.h
instCombineSVEUnpack
static Optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1034
instCombineSVESDIV
static Optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1188
llvm::APInt::ashr
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:808
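A short worked example (illustrative only) of sign propagation under the arithmetic shift:
#include "llvm/ADT/APInt.h"
#include <cassert>
// 8-bit 0xF0 is -16 as a signed value; shifting right arithmetically by 4
// replicates the sign bit, giving 0xFF (-1).
static void ashrExample() {
  llvm::APInt X(8, 0xF0);
  assert(X.ashr(4) == llvm::APInt(8, 0xFF));
}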
llvm::AArch64TTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:227
llvm::AArch64TTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:2540
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4407
AArch64AddressingModes.h
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:191
llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:873
llvm::MVT::v2i8
@ v2i8
Definition: MachineValueType.h:77
llvm::AArch64TTIImpl::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
Definition: AArch64TargetTransformInfo.cpp:2191
llvm::MVT::v4i64
@ v4i64
Definition: MachineValueType.h:120
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::MVT::nxv16i8
@ nxv16i8
Definition: MachineValueType.h:200
llvm::cl::opt< bool >
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:75
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:305
llvm::TargetTransformInfo::SK_InsertSubvector
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
Definition: TargetTransformInfo.h:878
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::MVT::nxv8i64
@ nxv8i64
Definition: MachineValueType.h:221
llvm::MVT::v16i8
@ v16i8
Definition: MachineValueType.h:80
llvm::CostTableLookup
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType >> Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
llvm::AArch64TTIImpl::getMaskedMemoryOpCost
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:2079
llvm::PointerType::getUnqual
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:651
llvm::AArch64TTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: AArch64TargetTransformInfo.cpp:1269
llvm::MVT::v16i16
@ v16i16
Definition: MachineValueType.h:93
llvm::MVT::v2i64
@ v2i64
Definition: MachineValueType.h:118
intrinsicIDToBinOpCode
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
Definition: AArch64TargetTransformInfo.cpp:951
llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition: ISDOpcodes.h:820
llvm::AArch64::SVEBitsPerBlock
static constexpr unsigned SVEBitsPerBlock
Definition: AArch64BaseInfo.h:760
AArch64ExpandImm.h
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::MVT::v16f32
@ v16f32
Definition: MachineValueType.h:162
llvm::TargetTransformInfo::OK_UniformConstantValue
@ OK_UniformConstantValue
Definition: TargetTransformInfo.h:893
llvm::AArch64TTIImpl::getArithmeticReductionCostSVE
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
Definition: AArch64TargetTransformInfo.cpp:2514
llvm::LegalityPredicates::all
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
Definition: LegalizerInfo.h:228
llvm::APInt::logBase2
unsigned logBase2() const
Definition: APInt.h:1664
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:2814
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:57
llvm::APInt::negate
void negate()
Negate this APInt in place.
Definition: APInt.h:1405
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:430
instCombineSVETupleGet
static Optional< Instruction * > instCombineSVETupleGet(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1081
I
#define I(x, y, z)
Definition: MD5.cpp:58
getCalledFunction
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
Definition: MemoryBuiltins.cpp:160
instCombineSVESrshl
static Optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1232
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:898
llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:929
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
llvm::AArch64TTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: AArch64TargetTransformInfo.cpp:1855
AArch64PerfectShuffle.h
llvm::RecurKind::Add
@ Add
Sum of integers.
isAllActivePredicate
static bool isAllActivePredicate(Value *Pred)
Definition: AArch64TargetTransformInfo.cpp:889
llvm::MVT::v4f32
@ v4f32
Definition: MachineValueType.h:157
llvm::AArch64TTIImpl::isLegalToVectorizeReduction
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
Definition: AArch64TargetTransformInfo.cpp:2458
llvm::MVT::getVectorNumElements
unsigned getVectorNumElements() const
Definition: MachineValueType.h:850
llvm::MVT::i8
@ i8
Definition: MachineValueType.h:44
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
llvm::MVT::Other
@ Other
Definition: MachineValueType.h:42
llvm::LoopBase::getLoopDepth
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:96
llvm::CmpInst::FCMP_OGE
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:724
llvm::MVT::nxv4i32
@ nxv4i32
Definition: MachineValueType.h:213
llvm::CmpInst::BAD_ICMP_PREDICATE
@ BAD_ICMP_PREDICATE
Definition: InstrTypes.h:752
llvm::MVT
Machine Value Type.
Definition: MachineValueType.h:31
llvm::TargetTransformInfo::SK_Splice
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
Definition: TargetTransformInfo.h:884
llvm::MVT::nxv4bf16
@ nxv4bf16
Definition: MachineValueType.h:237
llvm::LinearPolySize::getKnownMinValue
ScalarTy getKnownMinValue() const
Returns the minimum value this size can represent.
Definition: TypeSize.h:296
llvm::AArch64TTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: AArch64TargetTransformInfo.cpp:2233
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:890
llvm::AArch64TTIImpl::getSpliceCost
InstructionCost getSpliceCost(VectorType *Tp, int Index)
Definition: AArch64TargetTransformInfo.cpp:2639
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
instCombineSVEDupX
static Optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:604
llvm::CmpInst::isIntPredicate
bool isIntPredicate() const
Definition: InstrTypes.h:827
llvm::TargetTransformInfo::MemCmpExpansionOptions
Returns options for expansion of memcmp; IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Definition: TargetTransformInfo.h:773
llvm::MVT::nxv2i32
@ nxv2i32
Definition: MachineValueType.h:212
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:261
llvm::APIntOps::smin
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition: APInt.h:2139
llvm::RecurKind::UMax
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
llvm::PatternMatch::m_SpecificInt
specific_intval< false > m_SpecificInt(APInt V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:863
llvm::Sched::Source
@ Source
Definition: TargetLowering.h:99
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::Instruction::getFastMathFlags
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
Definition: Instruction.cpp:289
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
Mul
BinaryOperator * Mul
Definition: X86PartialReduction.cpp:70
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1614
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:47
llvm::StructType
Class to represent struct types.
Definition: DerivedTypes.h:213
llvm::MVT::v2i32
@ v2i32
Definition: MachineValueType.h:101
llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:352
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:214
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::MVT::v2f32
@ v2f32
Definition: MachineValueType.h:155
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:991
DL
DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::TargetTransformInfo::UnrollingPreferences::DefaultUnrollRuntimeCount
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
Definition: TargetTransformInfo.h:466
llvm::AArch64TTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=None)
Definition: AArch64TargetTransformInfo.cpp:2688
instCombineSVELD1
static Optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
Definition: AArch64TargetTransformInfo.cpp:906
llvm::RecurKind::FMax
@ FMax
FP max implemented in terms of select(cmp()).
llvm::MVT::v4i32
@ v4i32
Definition: MachineValueType.h:103
llvm::AArch64TTIImpl::getPopcntSupport
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
Definition: AArch64TargetTransformInfo.cpp:218
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:280
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:176
llvm::ISD::FMUL
@ FMUL
Definition: ISDOpcodes.h:392
llvm::TargetTransformInfo::SK_Transpose
@ SK_Transpose
Transpose two vectors.
Definition: TargetTransformInfo.h:877
instCombineSVEDup
static Optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:580
llvm::MVT::v8i64
@ v8i64
Definition: MachineValueType.h:121
llvm::TargetTransformInfo::CastContextHint::None
@ None
The cast is not used with a load/store of any kind.
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:668
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:417
AArch64TargetTransformInfo.h
llvm::MVT::v16i32
@ v16i32
Definition: MachineValueType.h:108
instCombineSVEVectorFMLA
static Optional< Instruction * > instCombineSVEVectorFMLA(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:859
llvm::AArch64TTIImpl::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:1993
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:164
llvm::RecurKind::FMulAdd
@ FMulAdd
Fused multiply-add of floats (a * b + c).
llvm::MVT::nxv16i1
@ nxv16i1
Definition: MachineValueType.h:192
llvm::APIntOps::umax
const APInt & umax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be unsigned.
Definition: APInt.h:2154
llvm::MVT::v8bf16
@ v8bf16
Definition: MachineValueType.h:148
llvm::MVT::nxv2i16
@ nxv2i16
Definition: MachineValueType.h:205
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:240
Insn
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
Definition: AArch64MIPeepholeOpt.cpp:132
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:350
llvm::Type::getIntNTy
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:243
instCombineSVEVectorFAdd
static Optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:980
EnableFalkorHWPFUnrollFix
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
CostTable.h
llvm::TargetTransformInfo::UnrollingPreferences::UpperBound
bool UpperBound
Allow using trip count upper bound to unroll loops.
Definition: TargetTransformInfo.h:497
llvm::AArch64_IMM::expandMOVImm
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
Definition: AArch64ExpandImm.cpp:303
instCombineSVECntElts
static Optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
Definition: AArch64TargetTransformInfo.cpp:812
llvm::CallBase::arg_size
unsigned arg_size() const
Definition: InstrTypes.h:1339
llvm::APInt::sextOrTrunc
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1002
llvm::MVT::v8i16
@ v8i16
Definition: MachineValueType.h:92
llvm::APInt::isNegatedPowerOf2
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:434
llvm::SCEVAddRecExpr
This node represents a polynomial recurrence on the trip count of the specified loop.
Definition: ScalarEvolutionExpressions.h:342
llvm::AArch64TTIImpl::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
Definition: AArch64TargetTransformInfo.cpp:2386
llvm::LinearPolySize< ElementCount >::getScalable
static ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:286
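A hedged sketch (function name is hypothetical) showing how fixed and scalable element counts feed the same vector-type factory:
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/TypeSize.h"
// Builds <4 x float> and <vscale x 4 x float>; isScalable() distinguishes them.
static void elementCountExample(llvm::LLVMContext &Ctx) {
  llvm::Type *F32 = llvm::Type::getFloatTy(Ctx);
  auto *Fixed = llvm::VectorType::get(F32, llvm::ElementCount::getFixed(4));
  auto *Scalable = llvm::VectorType::get(F32, llvm::ElementCount::getScalable(4));
  (void)Fixed;
  bool IsScalable = Scalable->getElementCount().isScalable(); // true
  (void)IsScalable;
}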
llvm::MVT::i32
@ i32
Definition: MachineValueType.h:46
llvm::Type::getPointerTo
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:774
llvm::ISD::SDIV
@ SDIV
Definition: ISDOpcodes.h:242
llvm::AArch64TTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:1817
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:185
llvm::MVT::nxv8i8
@ nxv8i8
Definition: MachineValueType.h:199
llvm::MVT::v8i32
@ v8i32
Definition: MachineValueType.h:107
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::AMDGPU::Hwreg::Width
Width
Definition: SIDefines.h:432
llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition: ISDOpcodes.h:774
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::MVT::nxv8i32
@ nxv8i32
Definition: MachineValueType.h:214
llvm::APInt::sext
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:946
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:46
llvm::MVT::nxv8f16
@ nxv8f16
Definition: MachineValueType.h:231
llvm::InstructionCost::getInvalid
static InstructionCost getInvalid(CostType Val=0)
Definition: InstructionCost.h:73
llvm::Instruction::BinaryOps
BinaryOps
Definition: Instruction.h:786
llvm::RecurrenceDescriptor
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: IVDescriptors.h:69
llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:871
llvm::RecurKind::SelectICmp
@ SelectICmp
Integer select(icmp(),x,y) where one of (x,y) is loop invariant.
llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
llvm::getNumElementsFromSVEPredPattern
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
Definition: AArch64BaseInfo.h:459
llvm::ISD::FSUB
@ FSUB
Definition: ISDOpcodes.h:391
llvm::Pattern
Definition: FileCheckImpl.h:614
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:147
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:691
instCombineSVESel
static Optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:572
llvm::MVT::nxv8i1
@ nxv8i1
Definition: MachineValueType.h:191
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:780
llvm::Type::isBFloatTy
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
processPhiNode
static Optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
Definition: AArch64TargetTransformInfo.cpp:425
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:54
llvm::CmpInst::FCMP_UNE
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:735
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1341
N
#define N
llvm::ISD::SRL
@ SRL
Definition: ISDOpcodes.h:693
SetValue
static void SetValue(Value *V, GenericValue Val, ExecutionContext &SF)
Definition: Execution.cpp:41
TargetTransformInfo.h
llvm::AArch64TTIImpl::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AArch64TargetTransformInfo.cpp:2103
llvm::PHINode
Definition: Instructions.h:2664
llvm::PatternMatch
Definition: PatternMatch.h:47
llvm::CmpInst::FCMP_OLE
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:726
llvm::ISD::MULHU
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:637
llvm::MemIntrinsicInfo
Information about a load/store intrinsic defined by the target.
Definition: TargetTransformInfo.h:69
instCombineSVELast
static Optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:718
llvm::AArch64TTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AArch64TargetTransformInfo.cpp:2345
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
TM
LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:45
llvm::ScalableVectorType::get
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:705
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1474
llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:904
llvm::RecurKind::FMin
@ FMin
FP min implemented in terms of select(cmp()).
llvm::AArch64TTIImpl::shouldConsiderAddressTypePromotion
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
Definition: AArch64TargetTransformInfo.cpp:2431
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:262
llvm::Value::takeName
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:378
instCombineST1ScatterIndex
static Optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1155
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:394
llvm::FastMathFlags::allowContract
bool allowContract() const
Definition: FMF.h:71
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
BasicTTIImpl.h
llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:760
llvm::MVT::v8i8
@ v8i8
Definition: MachineValueType.h:79
llvm::MVT::nxv2i8
@ nxv2i8
Definition: MachineValueType.h:197
llvm::MVT::v8f32
@ v8f32
Definition: MachineValueType.h:161
llvm::ConstantAggregateZero::get
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1648
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:89
llvm::MVT::v16i64
@ v16i64
Definition: MachineValueType.h:122
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1281
llvm::PatternMatch::m_Cmp
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignores it.
Definition: PatternMatch.h:89
llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:852
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:55
llvm::RecurKind::SMax
@ SMax
Signed integer max implemented in terms of select(cmp()).
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::AArch64Subtarget::Falkor
@ Falkor
Definition: AArch64Subtarget.h:70
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:211
llvm::Type::isFP128Ty
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:157
Debug.h
llvm::VectorType::get
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:668
llvm::AArch64TTIImpl::useNeonVector
bool useNeonVector(const Type *Ty) const
Definition: AArch64TargetTransformInfo.cpp:2132
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2144
llvm::MVT::nxv8f64
@ nxv8f64
Definition: MachineValueType.h:249
llvm::ISD::CTPOP
@ CTPOP
Definition: ISDOpcodes.h:703
llvm::RecurKind::Xor
@ Xor
Bitwise or logical XOR of integers.
instCombineLD1GatherIndex
static Optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
Definition: AArch64TargetTransformInfo.cpp:1122
llvm::AArch64TTIImpl::getOrCreateResultFromMemIntrinsic
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
Definition: AArch64TargetTransformInfo.cpp:2350
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:164
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:288
llvm::MVT::v4i1
@ v4i1
Definition: MachineValueType.h:66
llvm::AArch64TTIImpl::enableMemCmpExpansion
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
Definition: AArch64TargetTransformInfo.cpp:2061
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37
llvm::AArch64TTIImpl::getAddressComputationCost
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
Definition: AArch64TargetTransformInfo.cpp:1974
llvm::MVT::nxv8f32
@ nxv8f32
Definition: MachineValueType.h:243
llvm::SmallVectorImpl::insert
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:792
llvm::ISD::FDIV
@ FDIV
Definition: ISDOpcodes.h:393