1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "AArch64TargetTransformInfo.h"
10 #include "AArch64ExpandImm.h"
11 #include "MCTargetDesc/AArch64AddressingModes.h"
12 #include "llvm/Analysis/LoopInfo.h"
13 #include "llvm/Analysis/TargetTransformInfo.h"
14 #include "llvm/CodeGen/BasicTTIImpl.h"
15 #include "llvm/CodeGen/CostTable.h"
16 #include "llvm/CodeGen/TargetLowering.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsAArch64.h"
19 #include "llvm/IR/PatternMatch.h"
20 #include "llvm/Support/Debug.h"
21 #include "llvm/Transforms/InstCombine/InstCombiner.h"
22 #include <algorithm>
23 using namespace llvm;
24 using namespace llvm::PatternMatch;
25 
26 #define DEBUG_TYPE "aarch64tti"
27 
28 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
29  cl::init(true), cl::Hidden);
30 
31 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
32  const Function *Callee) const {
33  const TargetMachine &TM = getTLI()->getTargetMachine();
34 
35  const FeatureBitset &CallerBits =
36  TM.getSubtargetImpl(*Caller)->getFeatureBits();
37  const FeatureBitset &CalleeBits =
38  TM.getSubtargetImpl(*Callee)->getFeatureBits();
39 
40  // Inline a callee if its target-features are a subset of the caller's
41  // target-features.
42  return (CallerBits & CalleeBits) == CalleeBits;
43 }
44 
45 /// Calculate the cost of materializing a 64-bit value. This helper
46 /// method might only calculate a fraction of a larger immediate. Therefore it
47 /// is valid to return a cost of ZERO.
48 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
49  // Check if the immediate can be encoded within an instruction.
50  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
51  return 0;
52 
53  if (Val < 0)
54  Val = ~Val;
55 
56  // Calculate how many moves we will need to materialize this constant.
57  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
58  AArch64_IMM::expandMOVImm(Val, 64, Insn);
59  return Insn.size();
60 }
61 
62 /// Calculate the cost of materializing the given constant.
63 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
64  TTI::TargetCostKind CostKind) {
65  assert(Ty->isIntegerTy());
66 
67  unsigned BitSize = Ty->getPrimitiveSizeInBits();
68  if (BitSize == 0)
69  return ~0U;
70 
71  // Sign-extend all constants to a multiple of 64-bit.
72  APInt ImmVal = Imm;
73  if (BitSize & 0x3f)
74  ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
75 
76  // Split the constant into 64-bit chunks and calculate the cost for each
77  // chunk.
78  InstructionCost Cost = 0;
79  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
80  APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
81  int64_t Val = Tmp.getSExtValue();
82  Cost += getIntImmCost(Val);
83  }
84  // We need at least one instruction to materialize the constant.
85  return std::max<InstructionCost>(1, Cost);
86 }
87 
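/// Calculate the cost of materializing the immediate Imm when it is used as
/// operand Idx of an instruction with the given Opcode, returning TCC_Free
/// when the operand can typically be encoded directly in the instruction.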
88 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
89  const APInt &Imm, Type *Ty,
90  TTI::TargetCostKind CostKind,
91  Instruction *Inst) {
92  assert(Ty->isIntegerTy());
93 
94  unsigned BitSize = Ty->getPrimitiveSizeInBits();
95  // There is no cost model for constants with a bit size of 0. Return TCC_Free
96  // here, so that constant hoisting will ignore this constant.
97  if (BitSize == 0)
98  return TTI::TCC_Free;
99 
100  unsigned ImmIdx = ~0U;
101  switch (Opcode) {
102  default:
103  return TTI::TCC_Free;
104  case Instruction::GetElementPtr:
105  // Always hoist the base address of a GetElementPtr.
106  if (Idx == 0)
107  return 2 * TTI::TCC_Basic;
108  return TTI::TCC_Free;
109  case Instruction::Store:
110  ImmIdx = 0;
111  break;
112  case Instruction::Add:
113  case Instruction::Sub:
114  case Instruction::Mul:
115  case Instruction::UDiv:
116  case Instruction::SDiv:
117  case Instruction::URem:
118  case Instruction::SRem:
119  case Instruction::And:
120  case Instruction::Or:
121  case Instruction::Xor:
122  case Instruction::ICmp:
123  ImmIdx = 1;
124  break;
125  // Always return TCC_Free for the shift value of a shift instruction.
126  case Instruction::Shl:
127  case Instruction::LShr:
128  case Instruction::AShr:
129  if (Idx == 1)
130  return TTI::TCC_Free;
131  break;
132  case Instruction::Trunc:
133  case Instruction::ZExt:
134  case Instruction::SExt:
135  case Instruction::IntToPtr:
136  case Instruction::PtrToInt:
137  case Instruction::BitCast:
138  case Instruction::PHI:
139  case Instruction::Call:
140  case Instruction::Select:
141  case Instruction::Ret:
142  case Instruction::Load:
143  break;
144  }
145 
146  if (Idx == ImmIdx) {
147  int NumConstants = (BitSize + 63) / 64;
148  InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
149  return (Cost <= NumConstants * TTI::TCC_Basic)
150  ? static_cast<int>(TTI::TCC_Free)
151  : Cost;
152  }
153  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
154 }
155 
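/// Calculate the cost of materializing the immediate Imm used as operand Idx
/// of the intrinsic IID, returning TCC_Free for operands the intrinsic can
/// absorb (e.g. stackmap, patchpoint and statepoint arguments).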
156 InstructionCost
157 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
158  const APInt &Imm, Type *Ty,
159  TTI::TargetCostKind CostKind) {
160  assert(Ty->isIntegerTy());
161 
162  unsigned BitSize = Ty->getPrimitiveSizeInBits();
163  // There is no cost model for constants with a bit size of 0. Return TCC_Free
164  // here, so that constant hoisting will ignore this constant.
165  if (BitSize == 0)
166  return TTI::TCC_Free;
167 
168  // Most (all?) AArch64 intrinsics do not support folding immediates into the
169  // selected instruction, so we compute the materialization cost for the
170  // immediate directly.
171  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
172  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
173 
174  switch (IID) {
175  default:
176  return TTI::TCC_Free;
177  case Intrinsic::sadd_with_overflow:
178  case Intrinsic::uadd_with_overflow:
179  case Intrinsic::ssub_with_overflow:
180  case Intrinsic::usub_with_overflow:
181  case Intrinsic::smul_with_overflow:
182  case Intrinsic::umul_with_overflow:
183  if (Idx == 1) {
184  int NumConstants = (BitSize + 63) / 64;
185  InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
186  return (Cost <= NumConstants * TTI::TCC_Basic)
187  ? static_cast<int>(TTI::TCC_Free)
188  : Cost;
189  }
190  break;
191  case Intrinsic::experimental_stackmap:
192  if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
193  return TTI::TCC_Free;
194  break;
195  case Intrinsic::experimental_patchpoint_void:
196  case Intrinsic::experimental_patchpoint_i64:
197  if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
198  return TTI::TCC_Free;
199  break;
200  case Intrinsic::experimental_gc_statepoint:
201  if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
202  return TTI::TCC_Free;
203  break;
204  }
205  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
206 }
207 
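/// Report the level of popcount support: 32- and 64-bit scalars get fast
/// hardware lowering, everything else is treated as a software expansion.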
208 TargetTransformInfo::PopcntSupportKind
209 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
210  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
211  if (TyWidth == 32 || TyWidth == 64)
212  return TTI::PSK_FastHardware;
213  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
214  return TTI::PSK_Software;
215 }
216 
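/// Cost model for intrinsic calls: match the legalized return type against
/// per-intrinsic cost tables (min/max, saturating arithmetic, abs, stepvector,
/// bitreverse, ctpop) and fall back to the base implementation otherwise.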
217 InstructionCost
218 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
219  TTI::TargetCostKind CostKind) {
220  auto *RetTy = ICA.getReturnType();
221  switch (ICA.getID()) {
222  case Intrinsic::umin:
223  case Intrinsic::umax:
224  case Intrinsic::smin:
225  case Intrinsic::smax: {
226  static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
227  MVT::v8i16, MVT::v2i32, MVT::v4i32};
228  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
229  // v2i64 types get converted to cmp+bif hence the cost of 2
230  if (LT.second == MVT::v2i64)
231  return LT.first * 2;
232  if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
233  return LT.first;
234  break;
235  }
236  case Intrinsic::sadd_sat:
237  case Intrinsic::ssub_sat:
238  case Intrinsic::uadd_sat:
239  case Intrinsic::usub_sat: {
240  static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
241  MVT::v8i16, MVT::v2i32, MVT::v4i32,
242  MVT::v2i64};
243  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
244  // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
245  // need to extend the type, as it uses shr(qadd(shl, shl)).
246  unsigned Instrs =
247  LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
248  if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
249  return LT.first * Instrs;
250  break;
251  }
252  case Intrinsic::abs: {
253  static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
254  MVT::v8i16, MVT::v2i32, MVT::v4i32,
255  MVT::v2i64};
256  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
257  if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
258  return LT.first;
259  break;
260  }
261  case Intrinsic::experimental_stepvector: {
262  InstructionCost Cost = 1; // Cost of the `index' instruction
263  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
264  // Legalisation of illegal vectors involves an `index' instruction plus
265  // (LT.first - 1) vector adds.
266  if (LT.first > 1) {
267  Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
268  InstructionCost AddCost =
269  getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
270  Cost += AddCost * (LT.first - 1);
271  }
272  return Cost;
273  }
274  case Intrinsic::bitreverse: {
275  static const CostTblEntry BitreverseTbl[] = {
276  {Intrinsic::bitreverse, MVT::i32, 1},
277  {Intrinsic::bitreverse, MVT::i64, 1},
278  {Intrinsic::bitreverse, MVT::v8i8, 1},
279  {Intrinsic::bitreverse, MVT::v16i8, 1},
280  {Intrinsic::bitreverse, MVT::v4i16, 2},
281  {Intrinsic::bitreverse, MVT::v8i16, 2},
282  {Intrinsic::bitreverse, MVT::v2i32, 2},
283  {Intrinsic::bitreverse, MVT::v4i32, 2},
284  {Intrinsic::bitreverse, MVT::v1i64, 2},
285  {Intrinsic::bitreverse, MVT::v2i64, 2},
286  };
287  const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
288  const auto *Entry =
289  CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
290  // The cost model uses the legal type (i32) that i8 and i16 are promoted
291  // to, plus 1 so that we match the actual lowering cost.
292  if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
293  TLI->getValueType(DL, RetTy, true) == MVT::i16)
294  return LegalisationCost.first * Entry->Cost + 1;
295  if (Entry)
296  return LegalisationCost.first * Entry->Cost;
297  break;
298  }
299  case Intrinsic::ctpop: {
300  static const CostTblEntry CtpopCostTbl[] = {
301  {ISD::CTPOP, MVT::v2i64, 4},
302  {ISD::CTPOP, MVT::v4i32, 3},
303  {ISD::CTPOP, MVT::v8i16, 2},
304  {ISD::CTPOP, MVT::v16i8, 1},
305  {ISD::CTPOP, MVT::i64, 4},
306  {ISD::CTPOP, MVT::v2i32, 3},
307  {ISD::CTPOP, MVT::v4i16, 2},
308  {ISD::CTPOP, MVT::v8i8, 1},
309  {ISD::CTPOP, MVT::i32, 5},
310  };
311  auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
312  MVT MTy = LT.second;
313  if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
314  // Extra cost of +1 when illegal vector types are legalized by promoting
315  // the integer type.
316  int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
317  RetTy->getScalarSizeInBits()
318  ? 1
319  : 0;
320  return LT.first * Entry->Cost + ExtraCost;
321  }
322  break;
323  }
324  default:
325  break;
326  }
327  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
328 }
329 
330 /// Remove redundant svbool reinterpret (convert.to.svbool) casts feeding a
331 /// PHI node in the presence of control flow.
332 static Optional<Instruction *> processPhiNode(InstCombiner &IC,
333  IntrinsicInst &II) {
334  SmallVector<Instruction *, 32> Worklist;
335  auto RequiredType = II.getType();
336 
337  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
338  assert(PN && "Expected Phi Node!");
339 
340  // Don't create a new Phi unless we can remove the old one.
341  if (!PN->hasOneUse())
342  return None;
343 
344  for (Value *IncValPhi : PN->incoming_values()) {
345  auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
346  if (!Reinterpret ||
347  Reinterpret->getIntrinsicID() !=
348  Intrinsic::aarch64_sve_convert_to_svbool ||
349  RequiredType != Reinterpret->getArgOperand(0)->getType())
350  return None;
351  }
352 
353  // Create the new Phi
354  LLVMContext &Ctx = PN->getContext();
355  IRBuilder<> Builder(Ctx);
356  Builder.SetInsertPoint(PN);
357  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
358  Worklist.push_back(PN);
359 
360  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
361  auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
362  NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
363  Worklist.push_back(Reinterpret);
364  }
365 
366  // Cleanup Phi Node and reinterprets
367  return IC.replaceInstUsesWith(II, NPN);
368 }
369 
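/// Fold a chain of svbool conversion intrinsics: replace the
/// convert.from.svbool with the earliest value in the chain that already has
/// the required type, so the redundant conversions can be removed.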
370 static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
371  IntrinsicInst &II) {
372  // If the reinterpret instruction operand is a PHI Node
373  if (isa<PHINode>(II.getArgOperand(0)))
374  return processPhiNode(IC, II);
375 
376  SmallVector<Instruction *, 32> CandidatesForRemoval;
377  Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
378 
379  const auto *IVTy = cast<VectorType>(II.getType());
380 
381  // Walk the chain of conversions.
382  while (Cursor) {
383  // If the type of the cursor has fewer lanes than the final result, zeroing
384  // must take place, which breaks the equivalence chain.
385  const auto *CursorVTy = cast<VectorType>(Cursor->getType());
386  if (CursorVTy->getElementCount().getKnownMinValue() <
387  IVTy->getElementCount().getKnownMinValue())
388  break;
389 
390  // If the cursor has the same type as I, it is a viable replacement.
391  if (Cursor->getType() == IVTy)
392  EarliestReplacement = Cursor;
393 
394  auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
395 
396  // If this is not an SVE conversion intrinsic, this is the end of the chain.
397  if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
398  Intrinsic::aarch64_sve_convert_to_svbool ||
399  IntrinsicCursor->getIntrinsicID() ==
400  Intrinsic::aarch64_sve_convert_from_svbool))
401  break;
402 
403  CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
404  Cursor = IntrinsicCursor->getOperand(0);
405  }
406 
407  // If no viable replacement in the conversion chain was found, there is
408  // nothing to do.
409  if (!EarliestReplacement)
410  return None;
411 
412  return IC.replaceInstUsesWith(II, EarliestReplacement);
413 }
414 
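/// Fold an sve.dup governed by a single-lane (vl1) ptrue into an ordinary
/// insertelement into lane zero of the passthru operand.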
415 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
416  IntrinsicInst &II) {
417  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
418  if (!Pg)
419  return None;
420 
421  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
422  return None;
423 
424  const auto PTruePattern =
425  cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
426  if (PTruePattern != AArch64SVEPredPattern::vl1)
427  return None;
428 
429  // The intrinsic is inserting into lane zero so use an insert instead.
430  auto *IdxTy = Type::getInt64Ty(II.getContext());
431  auto *Insert = InsertElementInst::Create(
432  II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
433  Insert->insertBefore(&II);
434  Insert->takeName(&II);
435 
436  return IC.replaceInstUsesWith(II, Insert);
437 }
438 
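/// Fold an sve.cmpne of a dupq-replicated constant vector against a zero
/// splat, under an all-active predicate, into a constant predicate built from
/// ptrue (or an all-false predicate when every element is zero).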
439 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
440  IntrinsicInst &II) {
441  LLVMContext &Ctx = II.getContext();
442  IRBuilder<> Builder(Ctx);
443  Builder.SetInsertPoint(&II);
444 
445  // Check that the predicate is all active
446  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
447  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
448  return None;
449 
450  const auto PTruePattern =
451  cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
452  if (PTruePattern != AArch64SVEPredPattern::all)
453  return None;
454 
455  // Check that we have a compare of zero..
456  auto *DupX = dyn_cast<IntrinsicInst>(II.getArgOperand(2));
457  if (!DupX || DupX->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
458  return None;
459 
460  auto *DupXArg = dyn_cast<ConstantInt>(DupX->getArgOperand(0));
461  if (!DupXArg || !DupXArg->isZero())
462  return None;
463 
464  // ..against a dupq
465  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
466  if (!DupQLane ||
467  DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
468  return None;
469 
470  // Where the dupq is a lane 0 replicate of a vector insert
471  if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
472  return None;
473 
474  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
475  if (!VecIns ||
476  VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
477  return None;
478 
479  // Where the vector insert is a fixed constant vector insert into undef at
480  // index zero
481  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
482  return None;
483 
484  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
485  return None;
486 
487  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
488  if (!ConstVec)
489  return None;
490 
491  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
492  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
493  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
494  return None;
495 
496  unsigned NumElts = VecTy->getNumElements();
497  unsigned PredicateBits = 0;
498 
499  // Expand intrinsic operands to a 16-bit byte level predicate
500  for (unsigned I = 0; I < NumElts; ++I) {
501  auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
502  if (!Arg)
503  return None;
504  if (!Arg->isZero())
505  PredicateBits |= 1 << (I * (16 / NumElts));
506  }
507 
508  // If all bits are zero bail early with an empty predicate
509  if (PredicateBits == 0) {
510  auto *PFalse = Constant::getNullValue(II.getType());
511  PFalse->takeName(&II);
512  return IC.replaceInstUsesWith(II, PFalse);
513  }
514 
515  // Calculate largest predicate type used (where byte predicate is largest)
516  unsigned Mask = 8;
517  for (unsigned I = 0; I < 16; ++I)
518  if ((PredicateBits & (1 << I)) != 0)
519  Mask |= (I % 8);
520 
521  unsigned PredSize = Mask & -Mask;
522  auto *PredType = ScalableVectorType::get(
523  Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
524 
525  // Ensure all relevant bits are set
526  for (unsigned I = 0; I < 16; I += PredSize)
527  if ((PredicateBits & (1 << I)) == 0)
528  return None;
529 
530  auto *PTruePat =
531  ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
532  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
533  {PredType}, {PTruePat});
534  auto *ConvertToSVBool = Builder.CreateIntrinsic(
535  Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
536  auto *ConvertFromSVBool =
537  Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
538  {II.getType()}, {ConvertToSVBool});
539 
540  ConvertFromSVBool->takeName(&II);
541  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
542 }
543 
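/// Fold sve.lasta/sve.lastb: a splat operand yields the splat value, an
/// all-false predicate (for lasta) yields lane zero, and a ptrue(vlN)
/// predicate yields a fixed-index extractelement.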
544 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
545  IntrinsicInst &II) {
546  Value *Pg = II.getArgOperand(0);
547  Value *Vec = II.getArgOperand(1);
548  bool IsAfter = II.getIntrinsicID() == Intrinsic::aarch64_sve_lasta;
549 
550  // lastX(splat(X)) --> X
551  if (auto *SplatVal = getSplatValue(Vec))
552  return IC.replaceInstUsesWith(II, SplatVal);
553 
554  auto *C = dyn_cast<Constant>(Pg);
555  if (IsAfter && C && C->isNullValue()) {
556  // The intrinsic is extracting lane 0 so use an extract instead.
557  auto *IdxTy = Type::getInt64Ty(II.getContext());
558  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
559  Extract->insertBefore(&II);
560  Extract->takeName(&II);
561  return IC.replaceInstUsesWith(II, Extract);
562  }
563 
564  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
565  if (!IntrPG)
566  return None;
567 
568  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
569  return None;
570 
571  const auto PTruePattern =
572  cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
573 
574  // Can the intrinsic's predicate be converted to a known constant index?
575  unsigned Idx;
576  switch (PTruePattern) {
577  default:
578  return None;
579  case AArch64SVEPredPattern::vl1:
580  Idx = 0;
581  break;
582  case AArch64SVEPredPattern::vl2:
583  Idx = 1;
584  break;
585  case AArch64SVEPredPattern::vl3:
586  Idx = 2;
587  break;
588  case AArch64SVEPredPattern::vl4:
589  Idx = 3;
590  break;
591  case AArch64SVEPredPattern::vl5:
592  Idx = 4;
593  break;
594  case AArch64SVEPredPattern::vl6:
595  Idx = 5;
596  break;
597  case AArch64SVEPredPattern::vl7:
598  Idx = 6;
599  break;
600  case AArch64SVEPredPattern::vl8:
601  Idx = 7;
602  break;
603  case AArch64SVEPredPattern::vl16:
604  Idx = 15;
605  break;
606  }
607 
608  // Increment the index if extracting the element after the last active
609  // predicate element.
610  if (IsAfter)
611  ++Idx;
612 
613  // Ignore extracts whose index is larger than the known minimum vector
614  // length. NOTE: This is an artificial constraint where we prefer to
615  // maintain what the user asked for until an alternative is proven faster.
616  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
617  if (Idx >= PgVTy->getMinNumElements())
618  return None;
619 
620  // The intrinsic is extracting a fixed lane so use an extract instead.
621  auto *IdxTy = Type::getInt64Ty(II.getContext());
622  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
623  Extract->insertBefore(&II);
624  Extract->takeName(&II);
625  return IC.replaceInstUsesWith(II, Extract);
626 }
627 
628 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
629  IntrinsicInst &II) {
630  LLVMContext &Ctx = II.getContext();
631  IRBuilder<> Builder(Ctx);
632  Builder.SetInsertPoint(&II);
633  // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
634  // can work with RDFFR_PP for ptest elimination.
635  auto *AllPat =
636  ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
637  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
638  {II.getType()}, {AllPat});
639  auto *RDFFR =
640  Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
641  RDFFR->takeName(&II);
642  return IC.replaceInstUsesWith(II, RDFFR);
643 }
644 
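/// Fold the sve.cntb/cnth/cntw/cntd intrinsics, where NumElts is the element
/// count per 128-bit granule: the 'all' pattern becomes a vscale expression
/// and small fixed vlN patterns become constants when they are known to fit.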
645 static Optional<Instruction *>
646 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
647  const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
648 
649  if (Pattern == AArch64SVEPredPattern::all) {
650  LLVMContext &Ctx = II.getContext();
651  IRBuilder<> Builder(Ctx);
652  Builder.SetInsertPoint(&II);
653 
654  Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
655  auto *VScale = Builder.CreateVScale(StepVal);
656  VScale->takeName(&II);
657  return IC.replaceInstUsesWith(II, VScale);
658  }
659 
660  unsigned MinNumElts = 0;
661  switch (Pattern) {
662  default:
663  return None;
664  case AArch64SVEPredPattern::vl1:
665  case AArch64SVEPredPattern::vl2:
666  case AArch64SVEPredPattern::vl3:
667  case AArch64SVEPredPattern::vl4:
668  case AArch64SVEPredPattern::vl5:
669  case AArch64SVEPredPattern::vl6:
670  case AArch64SVEPredPattern::vl7:
671  case AArch64SVEPredPattern::vl8:
672  MinNumElts = Pattern;
673  break;
674  case AArch64SVEPredPattern::vl16:
675  MinNumElts = 16;
676  break;
677  }
678 
679  return NumElts >= MinNumElts
680  ? Optional<Instruction *>(IC.replaceInstUsesWith(
681  II, ConstantInt::get(II.getType(), MinNumElts)))
682  : None;
683 }
684 
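/// If both ptest operands are convert.to.svbool calls on predicates of the
/// same type, perform the ptest directly on the unconverted operands.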
685 static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
686  IntrinsicInst &II) {
687  IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
688  IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
689 
690  if (Op1 && Op2 &&
691  Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
692  Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
693  Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
694 
695  IRBuilder<> Builder(II.getContext());
696  Builder.SetInsertPoint(&II);
697 
698  Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
699  Type *Tys[] = {Op1->getArgOperand(0)->getType()};
700 
701  auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
702 
703  PTest->takeName(&II);
704  return IC.replaceInstUsesWith(II, PTest);
705  }
706 
707  return None;
708 }
709 
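/// Fold sve.mul/sve.fmul where one operand is a splat of one (via sve.dup_x,
/// or sve.dup under the same predicate) into the other operand.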
710 static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
711  IntrinsicInst &II) {
712  auto *OpPredicate = II.getOperand(0);
713  auto *OpMultiplicand = II.getOperand(1);
714  auto *OpMultiplier = II.getOperand(2);
715 
716  IRBuilder<> Builder(II.getContext());
717  Builder.SetInsertPoint(&II);
718 
719  // Return true if a given instruction is an aarch64_sve_dup_x intrinsic call
720  // with a unit splat value, false otherwise.
721  auto IsUnitDupX = [](auto *I) {
722  auto *IntrI = dyn_cast<IntrinsicInst>(I);
723  if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
724  return false;
725 
726  auto *SplatValue = IntrI->getOperand(0);
727  return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
728  };
729 
730  // Return true if a given instruction is an aarch64_sve_dup intrinsic call
731  // with a unit splat value, false otherwise.
732  auto IsUnitDup = [](auto *I) {
733  auto *IntrI = dyn_cast<IntrinsicInst>(I);
734  if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
735  return false;
736 
737  auto *SplatValue = IntrI->getOperand(2);
738  return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
739  };
740 
741  // The OpMultiplier variable should always point to the dup (if any), so
742  // swap if necessary.
743  if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand))
744  std::swap(OpMultiplier, OpMultiplicand);
745 
746  if (IsUnitDupX(OpMultiplier)) {
747  // [f]mul pg (dupx 1) %n => %n
748  OpMultiplicand->takeName(&II);
749  return IC.replaceInstUsesWith(II, OpMultiplicand);
750  } else if (IsUnitDup(OpMultiplier)) {
751  // [f]mul pg (dup pg 1) %n => %n
752  auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
753  auto *DupPg = DupInst->getOperand(1);
754  // TODO: this is naive. The optimization is still valid if DupPg
755  // 'encompasses' OpPredicate, not only if they're the same predicate.
756  if (OpPredicate == DupPg) {
757  OpMultiplicand->takeName(&II);
758  return IC.replaceInstUsesWith(II, OpMultiplicand);
759  }
760  }
761 
762  return None;
763 }
764 
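/// Fold an sve.tbl whose index vector is a constant splat with an in-range
/// value into an extractelement plus a splat of the selected lane.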
765 static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
766  IntrinsicInst &II) {
767  auto *OpVal = II.getOperand(0);
768  auto *OpIndices = II.getOperand(1);
769  VectorType *VTy = cast<VectorType>(II.getType());
770 
771  // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with
772  // constant splat value < minimal element count of result.
773  auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices);
774  if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
775  return None;
776 
777  auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0));
778  if (!SplatValue ||
779  SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
780  return None;
781 
782  // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
783  // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
784  IRBuilder<> Builder(II.getContext());
785  Builder.SetInsertPoint(&II);
786  auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
787  auto *VectorSplat =
788  Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
789 
790  VectorSplat->takeName(&II);
791  return IC.replaceInstUsesWith(II, VectorSplat);
792 }
793 
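/// Target hook for InstCombine: dispatch AArch64 SVE intrinsics to the
/// dedicated combines above; returning None leaves the intrinsic unchanged.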
794 Optional<Instruction *>
795 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
796  IntrinsicInst &II) const {
797  Intrinsic::ID IID = II.getIntrinsicID();
798  switch (IID) {
799  default:
800  break;
801  case Intrinsic::aarch64_sve_convert_from_svbool:
802  return instCombineConvertFromSVBool(IC, II);
803  case Intrinsic::aarch64_sve_dup:
804  return instCombineSVEDup(IC, II);
805  case Intrinsic::aarch64_sve_cmpne:
806  case Intrinsic::aarch64_sve_cmpne_wide:
807  return instCombineSVECmpNE(IC, II);
808  case Intrinsic::aarch64_sve_rdffr:
809  return instCombineRDFFR(IC, II);
810  case Intrinsic::aarch64_sve_lasta:
811  case Intrinsic::aarch64_sve_lastb:
812  return instCombineSVELast(IC, II);
813  case Intrinsic::aarch64_sve_cntd:
814  return instCombineSVECntElts(IC, II, 2);
815  case Intrinsic::aarch64_sve_cntw:
816  return instCombineSVECntElts(IC, II, 4);
817  case Intrinsic::aarch64_sve_cnth:
818  return instCombineSVECntElts(IC, II, 8);
819  case Intrinsic::aarch64_sve_cntb:
820  return instCombineSVECntElts(IC, II, 16);
821  case Intrinsic::aarch64_sve_ptest_any:
822  case Intrinsic::aarch64_sve_ptest_first:
823  case Intrinsic::aarch64_sve_ptest_last:
824  return instCombineSVEPTest(IC, II);
825  case Intrinsic::aarch64_sve_mul:
826  case Intrinsic::aarch64_sve_fmul:
827  return instCombineSVEVectorMul(IC, II);
828  case Intrinsic::aarch64_sve_tbl:
829  return instCombineSVETBL(IC, II);
830  }
831 
832  return None;
833 }
834 
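/// Return true if an operation with the given opcode and arguments on DstTy
/// can be lowered to a NEON widening instruction ("long" or "wide" form, e.g.
/// uaddl/saddw), in which case the feeding extend is folded away for free.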
835 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
836  ArrayRef<const Value *> Args) {
837 
838  // A helper that returns a vector type from the given type. The number of
839  // elements in type Ty determine the vector width.
840  auto toVectorTy = [&](Type *ArgTy) {
841  return VectorType::get(ArgTy->getScalarType(),
842  cast<VectorType>(DstTy)->getElementCount());
843  };
844 
845  // Exit early if DstTy is not a vector type whose elements are at least
846  // 16-bits wide.
847  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
848  return false;
849 
850  // Determine if the operation has a widening variant. We consider both the
851  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
852  // instructions.
853  //
854  // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
855  // verify that their extending operands are eliminated during code
856  // generation.
857  switch (Opcode) {
858  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
859  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
860  break;
861  default:
862  return false;
863  }
864 
865  // To be a widening instruction (either the "wide" or "long" versions), the
866  // second operand must be a sign- or zero extend having a single user. We
867  // only consider extends having a single user because they may otherwise not
868  // be eliminated.
869  if (Args.size() != 2 ||
870  (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
871  !Args[1]->hasOneUse())
872  return false;
873  auto *Extend = cast<CastInst>(Args[1]);
874 
875  // Legalize the destination type and ensure it can be used in a widening
876  // operation.
877  auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
878  unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
879  if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
880  return false;
881 
882  // Legalize the source type and ensure it can be used in a widening
883  // operation.
884  auto *SrcTy = toVectorTy(Extend->getSrcTy());
885  auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
886  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
887  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
888  return false;
889 
890  // Get the total number of vector elements in the legalized types.
891  InstructionCost NumDstEls =
892  DstTyL.first * DstTyL.second.getVectorMinNumElements();
893  InstructionCost NumSrcEls =
894  SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
895 
896  // Return true if the legalized types have the same number of vector elements
897  // and the destination element type size is twice that of the source type.
898  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
899 }
900 
901 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
902  Type *Src,
903  TTI::CastContextHint CCH,
904  TTI::TargetCostKind CostKind,
905  const Instruction *I) {
906  int ISD = TLI->InstructionOpcodeToISD(Opcode);
907  assert(ISD && "Invalid opcode");
908 
909  // If the cast is observable, and it is used by a widening instruction (e.g.,
910  // uaddl, saddw, etc.), it may be free.
911  if (I && I->hasOneUse()) {
912  auto *SingleUser = cast<Instruction>(*I->user_begin());
913  SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
914  if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
915  // If the cast is the second operand, it is free. We will generate either
916  // a "wide" or "long" version of the widening instruction.
917  if (I == SingleUser->getOperand(1))
918  return 0;
919  // If the cast is not the second operand, it will be free if it looks the
920  // same as the second operand. In this case, we will generate a "long"
921  // version of the widening instruction.
922  if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
923  if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
924  cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
925  return 0;
926  }
927  }
928 
929  // TODO: Allow non-throughput costs that aren't binary.
930  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
931  if (CostKind != TTI::TCK_RecipThroughput)
932  return Cost == 0 ? 0 : 1;
933  return Cost;
934  };
935 
936  EVT SrcTy = TLI->getValueType(DL, Src);
937  EVT DstTy = TLI->getValueType(DL, Dst);
938 
939  if (!SrcTy.isSimple() || !DstTy.isSimple())
940  return AdjustCost(
941  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
942 
943  static const TypeConversionCostTblEntry
944  ConversionTbl[] = {
949 
950  // Truncations on nxvmiN
967 
968  // The number of shll instructions for the extension.
985 
986  // LowerVectorINT_TO_FP:
993 
994  // Complex: to v2f32
1001 
1002  // Complex: to v4f32
1007 
1008  // Complex: to v8f32
1013 
1014  // Complex: to v16f32
1017 
1018  // Complex: to v2f64
1025 
1026 
1027  // LowerVectorFP_TO_INT
1034 
1035  // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
1042 
1043  // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
1048 
1049  // Complex, from nxv2f32.
1058 
1059  // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
1066 
1067  // Complex, from nxv2f64.
1076 
1077  // Complex, from nxv4f32.
1086 
1087  // Complex, from nxv8f64. Illegal -> illegal conversions not required.
1092 
1093  // Complex, from nxv4f64. Illegal -> illegal conversions not required.
1100 
1101  // Complex, from nxv8f32. Illegal -> illegal conversions not required.
1106 
1107  // Complex, from nxv8f16.
1116 
1117  // Complex, from nxv4f16.
1126 
1127  // Complex, from nxv2f16.
1136 
1137  // Truncate from nxvmf32 to nxvmf16.
1141 
1142  // Truncate from nxvmf64 to nxvmf16.
1146 
1147  // Truncate from nxvmf64 to nxvmf32.
1151 
1152  // Extend from nxvmf16 to nxvmf32.
1156 
1157  // Extend from nxvmf16 to nxvmf64.
1161 
1162  // Extend from nxvmf32 to nxvmf64.
1166 
1167  };
1168 
1169  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
1170  DstTy.getSimpleVT(),
1171  SrcTy.getSimpleVT()))
1172  return AdjustCost(Entry->Cost);
1173 
1174  return AdjustCost(
1175  BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1176 }
1177 
1178 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
1179  Type *Dst,
1180  VectorType *VecTy,
1181  unsigned Index) {
1182 
1183  // Make sure we were given a valid extend opcode.
1184  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
1185  "Invalid opcode");
1186 
1187  // We are extending an element we extract from a vector, so the source type
1188  // of the extend is the element type of the vector.
1189  auto *Src = VecTy->getElementType();
1190 
1191  // Sign- and zero-extends are for integer types only.
1192  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
1193 
1194  // Get the cost for the extract. We compute the cost (if any) for the extend
1195  // below.
1196  InstructionCost Cost =
1197  getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
1198 
1199  // Legalize the types.
1200  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
1201  auto DstVT = TLI->getValueType(DL, Dst);
1202  auto SrcVT = TLI->getValueType(DL, Src);
1203  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1204 
1205  // If the resulting type is still a vector and the destination type is legal,
1206  // we may get the extension for free. If not, get the default cost for the
1207  // extend.
1208  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
1209  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1210  CostKind);
1211 
1212  // The destination type should be larger than the element type. If not, get
1213  // the default cost for the extend.
1214  if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
1215  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1216  CostKind);
1217 
1218  switch (Opcode) {
1219  default:
1220  llvm_unreachable("Opcode should be either SExt or ZExt");
1221 
1222  // For sign-extends, we only need a smov, which performs the extension
1223  // automatically.
1224  case Instruction::SExt:
1225  return Cost;
1226 
1227  // For zero-extends, the extend is performed automatically by a umov unless
1228  // the destination type is i64 and the element type is i8 or i16.
1229  case Instruction::ZExt:
1230  if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
1231  return Cost;
1232  }
1233 
1234  // If we are unable to perform the extend for free, get the default cost.
1235  return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
1236  CostKind);
1237 }
1238 
1239 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
1240  TTI::TargetCostKind CostKind,
1241  const Instruction *I) {
1242  if (CostKind != TTI::TCK_RecipThroughput)
1243  return Opcode == Instruction::PHI ? 0 : 1;
1244  assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
1245  // Branches are assumed to be predicted.
1246  return 0;
1247 }
1248 
1249 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1250  unsigned Index) {
1251  assert(Val->isVectorTy() && "This must be a vector type");
1252 
1253  if (Index != -1U) {
1254  // Legalize the type.
1255  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
1256 
1257  // This type is legalized to a scalar type.
1258  if (!LT.second.isVector())
1259  return 0;
1260 
1261  // The type may be split. Normalize the index to the new type.
1262  unsigned Width = LT.second.getVectorNumElements();
1263  Index = Index % Width;
1264 
1265  // The element at index zero is already inside the vector.
1266  if (Index == 0)
1267  return 0;
1268  }
1269 
1270  // All other insert/extracts cost this much.
1271  return ST->getVectorInsertExtractBaseCost();
1272 }
1273 
1274 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
1275  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1276  TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
1277  TTI::OperandValueProperties Opd1PropInfo,
1278  TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1279  const Instruction *CxtI) {
1280  // TODO: Handle more cost kinds.
1281  if (CostKind != TTI::TCK_RecipThroughput)
1282  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1283  Opd2Info, Opd1PropInfo,
1284  Opd2PropInfo, Args, CxtI);
1285 
1286  // Legalize the type.
1287  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1288 
1289  // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
1290  // add in the widening overhead specified by the sub-target. Since the
1291  // extends feeding widening instructions are performed automatically, they
1292  // aren't present in the generated code and have a zero cost. By adding a
1293  // widening overhead here, we attach the total cost of the combined operation
1294  // to the widening instruction.
1295  InstructionCost Cost = 0;
1296  if (isWideningInstruction(Ty, Opcode, Args))
1297  Cost += ST->getWideningBaseCost();
1298 
1299  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1300 
1301  switch (ISD) {
1302  default:
1303  return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1304  Opd2Info,
1305  Opd1PropInfo, Opd2PropInfo);
1306  case ISD::SDIV:
1307  if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
1308  Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
1309  // On AArch64, scalar signed division by a power-of-two constant is
1310  // normally expanded to the sequence ADD + CMP + SELECT + SRA.
1311  // The OperandValue properties may not be the same as those of the
1312  // previous operation; conservatively assume OP_None.
1313  Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
1314  Opd1Info, Opd2Info,
1315  TargetTransformInfo::OP_None,
1316  TargetTransformInfo::OP_None);
1317  Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
1318  Opd1Info, Opd2Info,
1319  TargetTransformInfo::OP_None,
1320  TargetTransformInfo::OP_None);
1321  Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
1322  Opd1Info, Opd2Info,
1323  TargetTransformInfo::OP_None,
1324  TargetTransformInfo::OP_None);
1325  Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
1326  Opd1Info, Opd2Info,
1327  TargetTransformInfo::OP_None,
1328  TargetTransformInfo::OP_None);
1329  return Cost;
1330  }
1331  LLVM_FALLTHROUGH;
1332  case ISD::UDIV:
1333  if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
1334  auto VT = TLI->getValueType(DL, Ty);
1335  if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
1336  // Vector signed division by a constant is expanded to the
1337  // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
1338  // to MULHS + SUB + SRL + ADD + SRL.
1339  InstructionCost MulCost = getArithmeticInstrCost(
1340  Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info,
1341  TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1342  InstructionCost AddCost = getArithmeticInstrCost(
1343  Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info,
1344  TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1345  InstructionCost ShrCost = getArithmeticInstrCost(
1346  Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info,
1347  TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
1348  return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
1349  }
1350  }
1351 
1352  Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1353  Opd2Info,
1354  Opd1PropInfo, Opd2PropInfo);
1355  if (Ty->isVectorTy()) {
1356  // On AArch64, vector divisions are not supported natively and are
1357  // expanded into scalar divisions of each pair of elements.
1358  Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
1359  Opd1Info, Opd2Info, Opd1PropInfo,
1360  Opd2PropInfo);
1361  Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
1362  Opd1Info, Opd2Info, Opd1PropInfo,
1363  Opd2PropInfo);
1364  // TODO: if one of the arguments is scalar, then it's not necessary to
1365  // double the cost of handling the vector elements.
1366  Cost += Cost;
1367  }
1368  return Cost;
1369 
1370  case ISD::MUL:
1371  if (LT.second != MVT::v2i64)
1372  return (Cost + 1) * LT.first;
1373  // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
1374  // as elements are extracted from the vectors and the muls scalarized.
1375  // As getScalarizationOverhead is a bit too pessimistic, we estimate the
1376  // cost for a i64 vector directly here, which is:
1377  // - four i64 extracts,
1378  // - two i64 inserts, and
1379  // - two muls.
1380  // So, for a v2i64 with LT.First = 1 the cost is 8, and for a v4i64 with
1381  // LT.first = 2 the cost is 16.
1382  return LT.first * 8;
1383  case ISD::ADD:
1384  case ISD::XOR:
1385  case ISD::OR:
1386  case ISD::AND:
1387  // These nodes are marked as 'custom' for combining purposes only.
1388  // We know that they are legal. See LowerAdd in ISelLowering.
1389  return (Cost + 1) * LT.first;
1390 
1391  case ISD::FADD:
1392  // These nodes are marked as 'custom' just to lower them to SVE.
1393  // We know said lowering will incur no additional cost.
1394  if (isa<FixedVectorType>(Ty) && !Ty->getScalarType()->isFP128Ty())
1395  return (Cost + 2) * LT.first;
1396 
1397  return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
1398  Opd2Info,
1399  Opd1PropInfo, Opd2PropInfo);
1400  }
1401 }
1402 
1403 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
1404  ScalarEvolution *SE,
1405  const SCEV *Ptr) {
1406  // Address computations in vectorized code with non-consecutive addresses will
1407  // likely result in more instructions compared to scalar code where the
1408  // computation can more often be merged into the index mode. The resulting
1409  // extra micro-ops can significantly decrease throughput.
1410  unsigned NumVectorInstToHideOverhead = 10;
1411  int MaxMergeDistance = 64;
1412 
1413  if (Ty->isVectorTy() && SE &&
1414  !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1415  return NumVectorInstToHideOverhead;
1416 
1417  // In many cases the address computation is not merged into the instruction
1418  // addressing mode.
1419  return 1;
1420 }
1421 
1422 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1423  Type *CondTy,
1424  CmpInst::Predicate VecPred,
1425  TTI::TargetCostKind CostKind,
1426  const Instruction *I) {
1427  // TODO: Handle other cost kinds.
1428  if (CostKind != TTI::TCK_RecipThroughput)
1429  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1430  I);
1431 
1432  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1433  // We don't lower some vector selects well when they are wider than the
1434  // register width.
1435  if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
1436  // We would need this many instructions to hide the scalarization happening.
1437  const int AmortizationCost = 20;
1438 
1439  // If VecPred is not set, check if we can get a predicate from the context
1440  // instruction, if its type matches the requested ValTy.
1441  if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
1442  CmpInst::Predicate CurrentPred;
1443  if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
1444  m_Value())))
1445  VecPred = CurrentPred;
1446  }
1447  // Check if we have a compare/select chain that can be lowered using CMxx &
1448  // BFI pair.
1449  if (CmpInst::isIntPredicate(VecPred)) {
1450  static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
1451  MVT::v8i16, MVT::v2i32, MVT::v4i32,
1452  MVT::v2i64};
1453  auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
1454  if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
1455  return LT.first;
1456  }
1457 
1458  static const TypeConversionCostTblEntry
1459  VectorSelectTbl[] = {
1461  { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
1463  { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
1464  { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
1465  { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
1466  };
1467 
1468  EVT SelCondTy = TLI->getValueType(DL, CondTy);
1469  EVT SelValTy = TLI->getValueType(DL, ValTy);
1470  if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1471  if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
1472  SelCondTy.getSimpleVT(),
1473  SelValTy.getSimpleVT()))
1474  return Entry->Cost;
1475  }
1476  }
1477  // The base case handles scalable vectors fine for now, since it treats the
1478  // cost as 1 * legalization cost.
1479  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1480 }
1481 
1482 TTI::MemCmpExpansionOptions
1483 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
1484  TTI::MemCmpExpansionOptions Options;
1485  if (ST->requiresStrictAlign()) {
1486  // TODO: Add cost modeling for strict align. Misaligned loads expand to
1487  // a bunch of instructions when strict align is enabled.
1488  return Options;
1489  }
1490  Options.AllowOverlappingLoads = true;
1491  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
1492  Options.NumLoadsPerBlock = Options.MaxNumLoads;
1493  // TODO: Though vector loads usually perform well on AArch64, in some targets
1494  // they may wake up the FP unit, which raises the power consumption. Perhaps
1495  // they could be used with no holds barred (-O3).
1496  Options.LoadSizes = {8, 4, 2, 1};
1497  return Options;
1498 }
1499 
1500 InstructionCost
1501 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1502  Align Alignment, unsigned AddressSpace,
1503  TTI::TargetCostKind CostKind) {
1504  if (!isa<ScalableVectorType>(Src))
1505  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1506  CostKind);
1507  auto LT = TLI->getTypeLegalizationCost(DL, Src);
1508  if (!LT.first.isValid())
1509  return InstructionCost::getInvalid();
1510 
1511  // The code-generator is currently not able to handle scalable vectors
1512  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1513  // it. This change will be removed when code-generation for these types is
1514  // sufficiently reliable.
1515  if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
1516  return InstructionCost::getInvalid();
1517 
1518  return LT.first * 2;
1519 }
1520 
1521 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
1522  unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1523  Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1524 
1525  if (!isa<ScalableVectorType>(DataTy))
1526  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1527  Alignment, CostKind, I);
1528  auto *VT = cast<VectorType>(DataTy);
1529  auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
1530  if (!LT.first.isValid())
1531  return InstructionCost::getInvalid();
1532 
1533  // The code-generator is currently not able to handle scalable vectors
1534  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1535  // it. This change will be removed when code-generation for these types is
1536  // sufficiently reliable.
1537  if (cast<VectorType>(DataTy)->getElementCount() ==
1538  ElementCount::getScalable(1))
1539  return InstructionCost::getInvalid();
1540 
1541  ElementCount LegalVF = LT.second.getVectorElementCount();
1542  InstructionCost MemOpCost =
1543  getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
1544  return LT.first * MemOpCost * getMaxNumElements(LegalVF);
1545 }
1546 
1547 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
1548  return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
1549 }
1550 
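/// Cost of a plain load or store of type Ty, accounting for slow misaligned
/// 128-bit stores, truncating stores and extending loads on NEON vectors, and
/// scalable types the code generator cannot handle yet.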
1551 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
1552  MaybeAlign Alignment,
1553  unsigned AddressSpace,
1554  TTI::TargetCostKind CostKind,
1555  const Instruction *I) {
1556  EVT VT = TLI->getValueType(DL, Ty, true);
1557  // Type legalization can't handle structs
1558  if (VT == MVT::Other)
1559  return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
1560  CostKind);
1561 
1562  auto LT = TLI->getTypeLegalizationCost(DL, Ty);
1563  if (!LT.first.isValid())
1564  return InstructionCost::getInvalid();
1565 
1566  // The code-generator is currently not able to handle scalable vectors
1567  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
1568  // it. This change will be removed when code-generation for these types is
1569  // sufficiently reliable.
1570  if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
1571  if (VTy->getElementCount() == ElementCount::getScalable(1))
1572  return InstructionCost::getInvalid();
1573 
1574  // TODO: consider latency as well for TCK_SizeAndLatency.
1575  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
1576  return LT.first;
1577 
1578  if (CostKind != TTI::TCK_RecipThroughput)
1579  return 1;
1580 
1581  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
1582  LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
1583  // Unaligned stores are extremely inefficient. We don't split all
1584  // unaligned 128-bit stores because of the negative impact that has been
1585  // shown in practice on inlined block copy code.
1586  // We make such stores expensive so that we will only vectorize if there
1587  // are 6 other instructions getting vectorized.
1588  const int AmortizationCost = 6;
1589 
1590  return LT.first * 2 * AmortizationCost;
1591  }
1592 
1593  // Check truncating stores and extending loads.
1594  if (useNeonVector(Ty) &&
1595  Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
1596  // v4i8 types are lowered to a scalar load/store and sshll/xtn.
1597  if (VT == MVT::v4i8)
1598  return 2;
1599  // Otherwise we need to scalarize.
1600  return cast<FixedVectorType>(Ty)->getNumElements() * 2;
1601  }
1602 
1603  return LT.first;
1604 }
1605 
1606 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
1607  unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1608  Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1609  bool UseMaskForCond, bool UseMaskForGaps) {
1610  assert(Factor >= 2 && "Invalid interleave factor");
1611  auto *VecVTy = cast<FixedVectorType>(VecTy);
1612 
1613  if (!UseMaskForCond && !UseMaskForGaps &&
1614  Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1615  unsigned NumElts = VecVTy->getNumElements();
1616  auto *SubVecTy =
1617  FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1618 
1619  // ldN/stN only support legal vector types of size 64 or 128 in bits.
1620  // Accesses having vector types that are a multiple of 128 bits can be
1621  // matched to more than one ldN/stN instruction.
1622  if (NumElts % Factor == 0 &&
1623  TLI->isLegalInterleavedAccessType(SubVecTy, DL))
1624  return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1625  }
1626 
1627  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1628  Alignment, AddressSpace, CostKind,
1629  UseMaskForCond, UseMaskForGaps);
1630 }
1631 
1632 InstructionCost
1633 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
1634  InstructionCost Cost = 0;
1635  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1636  for (auto *I : Tys) {
1637  if (!I->isVectorTy())
1638  continue;
1639  if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
1640  128)
1641  Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
1642  getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
1643  }
1644  return Cost;
1645 }
1646 
1647 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
1648  return ST->getMaxInterleaveFactor();
1649 }
1650 
1651 // For Falkor, we want to avoid having too many strided loads in a loop since
1652 // that can exhaust the HW prefetcher resources. We adjust the unroller
1653 // MaxCount preference below to attempt to ensure unrolling doesn't create too
1654 // many strided loads.
1655 static void
1656 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1657  TargetTransformInfo::UnrollingPreferences &UP) {
1658  enum { MaxStridedLoads = 7 };
1659  auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
1660  int StridedLoads = 0;
1661  // FIXME? We could make this more precise by looking at the CFG and
1662  // e.g. not counting loads in each side of an if-then-else diamond.
1663  for (const auto BB : L->blocks()) {
1664  for (auto &I : *BB) {
1665  LoadInst *LMemI = dyn_cast<LoadInst>(&I);
1666  if (!LMemI)
1667  continue;
1668 
1669  Value *PtrValue = LMemI->getPointerOperand();
1670  if (L->isLoopInvariant(PtrValue))
1671  continue;
1672 
1673  const SCEV *LSCEV = SE.getSCEV(PtrValue);
1674  const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
1675  if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
1676  continue;
1677 
1678  // FIXME? We could take pairing of unrolled load copies into account
1679  // by looking at the AddRec, but we would probably have to limit this
1680  // to loops with no stores or other memory optimization barriers.
1681  ++StridedLoads;
1682  // We've seen enough strided loads that seeing more won't make a
1683  // difference.
1684  if (StridedLoads > MaxStridedLoads / 2)
1685  return StridedLoads;
1686  }
1687  }
1688  return StridedLoads;
1689  };
1690 
1691  int StridedLoads = countStridedLoads(L, SE);
1692  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
1693  << " strided loads\n");
1694  // Pick the largest power of 2 unroll count that won't result in too many
1695  // strided loads.
1696  if (StridedLoads) {
1697  UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
1698  LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
1699  << UP.MaxCount << '\n');
1700  }
1701 }
1702 
1703 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1704  TTI::UnrollingPreferences &UP,
1705  OptimizationRemarkEmitter *ORE) {
1706  // Enable partial unrolling and runtime unrolling.
1707  BaseT::getUnrollingPreferences(L, SE, UP, ORE);
1708 
1709  // An inner loop is more likely to be hot, and the runtime check can be
1710  // hoisted out of it by the LICM pass, so the overhead is smaller; try a
1711  // larger threshold to unroll more loops.
1712  if (L->getLoopDepth() > 1)
1713  UP.PartialThreshold *= 2;
1714 
1715  // Disable partial & runtime unrolling on -Os.
1716  UP.PartialOptSizeThreshold = 0;
1717 
1718  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
1719  EnableFalkorHWPFUnrollFix)
1720  getFalkorUnrollingPreferences(L, SE, UP);
1721 
1722  // Scan the loop: don't unroll loops with calls as this could prevent
1723  // inlining. Don't unroll vector loops either, as they don't benefit much from
1724  // unrolling.
1725  for (auto *BB : L->getBlocks()) {
1726  for (auto &I : *BB) {
1727  // Don't unroll vectorised loop.
1728  if (I.getType()->isVectorTy())
1729  return;
1730 
1731  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1732  if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1733  if (!isLoweredToCall(F))
1734  continue;
1735  }
1736  return;
1737  }
1738  }
1739  }
1740 
1741  // Enable runtime unrolling for in-order models
1742  // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
1743  // checking for that case, we can ensure that the default behaviour is
1744  // unchanged
1745  if (ST->getProcFamily() != AArch64Subtarget::Others &&
1746  !ST->getSchedModel().isOutOfOrder()) {
1747  UP.Runtime = true;
1748  UP.Partial = true;
1749  UP.UpperBound = true;
1750  UP.UnrollRemainder = true;
1752 
1753  UP.UnrollAndJam = true;
1755  }
1756 }
1757 
1758 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1759  TTI::PeelingPreferences &PP) {
1760  BaseT::getPeelingPreferences(L, SE, PP);
1761 }
1762 
1763 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1764  Type *ExpectedType) {
1765  switch (Inst->getIntrinsicID()) {
1766  default:
1767  return nullptr;
1768  case Intrinsic::aarch64_neon_st2:
1769  case Intrinsic::aarch64_neon_st3:
1770  case Intrinsic::aarch64_neon_st4: {
1771  // Create a struct type
1772  StructType *ST = dyn_cast<StructType>(ExpectedType);
1773  if (!ST)
1774  return nullptr;
1775  unsigned NumElts = Inst->getNumArgOperands() - 1;
1776  if (ST->getNumElements() != NumElts)
1777  return nullptr;
1778  for (unsigned i = 0, e = NumElts; i != e; ++i) {
1779  if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
1780  return nullptr;
1781  }
1782  Value *Res = UndefValue::get(ExpectedType);
1783  IRBuilder<> Builder(Inst);
1784  for (unsigned i = 0, e = NumElts; i != e; ++i) {
1785  Value *L = Inst->getArgOperand(i);
1786  Res = Builder.CreateInsertValue(Res, L, i);
1787  }
1788  return Res;
1789  }
1790  case Intrinsic::aarch64_neon_ld2:
1791  case Intrinsic::aarch64_neon_ld3:
1792  case Intrinsic::aarch64_neon_ld4:
1793  if (Inst->getType() == ExpectedType)
1794  return Inst;
1795  return nullptr;
1796  }
1797 }
1798 
1799 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
1800  MemIntrinsicInfo &Info) {
1801  switch (Inst->getIntrinsicID()) {
1802  default:
1803  break;
1804  case Intrinsic::aarch64_neon_ld2:
1805  case Intrinsic::aarch64_neon_ld3:
1806  case Intrinsic::aarch64_neon_ld4:
1807  Info.ReadMem = true;
1808  Info.WriteMem = false;
1809  Info.PtrVal = Inst->getArgOperand(0);
1810  break;
1811  case Intrinsic::aarch64_neon_st2:
1812  case Intrinsic::aarch64_neon_st3:
1813  case Intrinsic::aarch64_neon_st4:
1814  Info.ReadMem = false;
1815  Info.WriteMem = true;
1816  Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
1817  break;
1818  }
1819 
1820  switch (Inst->getIntrinsicID()) {
1821  default:
1822  return false;
1823  case Intrinsic::aarch64_neon_ld2:
1824  case Intrinsic::aarch64_neon_st2:
1825  Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
1826  break;
1827  case Intrinsic::aarch64_neon_ld3:
1828  case Intrinsic::aarch64_neon_st3:
1829  Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
1830  break;
1831  case Intrinsic::aarch64_neon_ld4:
1832  case Intrinsic::aarch64_neon_st4:
1833  Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
1834  break;
1835  }
1836  return true;
1837 }
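// Illustrative usage sketch (assumed caller; the TTI and II variables below
// are hypothetical, while TargetTransformInfo::getTgtMemIntrinsic and
// MemIntrinsicInfo are the interfaces this hook feeds):
//
//   MemIntrinsicInfo Info;
//   if (TTI.getTgtMemIntrinsic(II, Info)) {
//     Value *Addr = Info.PtrVal; // ldN: operand 0, stN: the last operand.
//     // Info.MatchingId pairs an ldN with the stN of the same element count,
//     // e.g. ld2 and st2 both report VECTOR_LDST_TWO_ELEMENTS.
//   }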
1838 
1839 /// See if \p I should be considered for address type promotion. We check if \p
1840 /// I is a sext with the right type that is used in memory accesses. If it is
1841 /// used in a "complex" getelementptr, we allow it to be promoted without
1842 /// finding other sext instructions that sign extended the same initial value.
1843 /// A getelementptr is considered "complex" if it has more than 2 operands.
1844 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
1845  const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
1846  bool Considerable = false;
1847  AllowPromotionWithoutCommonHeader = false;
1848  if (!isa<SExtInst>(&I))
1849  return false;
1850  Type *ConsideredSExtType =
1851  Type::getInt64Ty(I.getParent()->getParent()->getContext());
1852  if (I.getType() != ConsideredSExtType)
1853  return false;
1854  // See if the sext is the one with the right type and used in at least one
1855  // GetElementPtrInst.
1856  for (const User *U : I.users()) {
1857  if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
1858  Considerable = true;
1859  // A getelementptr is considered "complex" if it has more than 2
1860  // operands. We will promote a SExt used in such a complex GEP, as we
1861  // expect some of the computation to be merged if it is done on 64 bits.
1862  if (GEPInst->getNumOperands() > 2) {
1863  AllowPromotionWithoutCommonHeader = true;
1864  break;
1865  }
1866  }
1867  }
1868  return Considerable;
1869 }
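// Illustrative IR example (names hypothetical): the sext below feeds a
// "complex" GEP, i.e. one with more than two operands, so it is allowed to be
// promoted without a common header:
//
//   %idxprom = sext i32 %i to i64
//   %p = getelementptr inbounds [64 x i32], [64 x i32]* %a, i64 0, i64 %idxprom
//
// A plain two-operand GEP such as "getelementptr i32, i32* %a, i64 %idxprom"
// would only mark the sext as considerable.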
1870 
1871 bool AArch64TTIImpl::isLegalToVectorizeReduction(
1872  const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
1873  if (!VF.isScalable())
1874  return true;
1875 
1876  Type *Ty = RdxDesc.getRecurrenceType();
1877  if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
1878  return false;
1879 
1880  switch (RdxDesc.getRecurrenceKind()) {
1881  case RecurKind::Add:
1882  case RecurKind::FAdd:
1883  case RecurKind::And:
1884  case RecurKind::Or:
1885  case RecurKind::Xor:
1886  case RecurKind::SMin:
1887  case RecurKind::SMax:
1888  case RecurKind::UMin:
1889  case RecurKind::UMax:
1890  case RecurKind::FMin:
1891  case RecurKind::FMax:
1892  return true;
1893  default:
1894  return false;
1895  }
1896 }
1897 
1898 InstructionCost
1899 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
1900  bool IsUnsigned,
1901  TTI::TargetCostKind CostKind) {
1902  if (!isa<ScalableVectorType>(Ty))
1903  return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
1904  assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) &&
1905  "Both vectors need to be scalable");
1906 
1907  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
1908  InstructionCost LegalizationCost = 0;
1909  if (LT.first > 1) {
1910  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
1911  unsigned CmpOpcode =
1912  Ty->isFPOrFPVectorTy() ? Instruction::FCmp : Instruction::ICmp;
1913  LegalizationCost =
1914  getCmpSelInstrCost(CmpOpcode, LegalVTy, LegalVTy,
1915  CmpInst::BAD_ICMP_PREDICATE, CostKind) +
1916  getCmpSelInstrCost(Instruction::Select, LegalVTy, LegalVTy,
1917  CmpInst::BAD_ICMP_PREDICATE, CostKind);
1918  LegalizationCost *= LT.first - 1;
1919  }
1920 
1921  return LegalizationCost + /*Cost of horizontal reduction*/ 2;
1922 }
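// Worked example, reading the costs straight from the code above: an smin
// reduction over <vscale x 4 x i32> uses a legal SVE type (LT.first == 1), so
// no legalization cost is added and the function returns 2. Assuming
// <vscale x 8 x i32> legalizes by splitting once (LT.first == 2), one extra
// cmp+select pair on the legal type is charged before the final 2.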
1923 
1924 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
1925  unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
1926  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1927  InstructionCost LegalizationCost = 0;
1928  if (LT.first > 1) {
1929  Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
1930  LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
1931  LegalizationCost *= LT.first - 1;
1932  }
1933 
1934  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1935  assert(ISD && "Invalid opcode");
1936  // Add the final reduction cost for the legal horizontal reduction
1937  switch (ISD) {
1938  case ISD::ADD:
1939  case ISD::AND:
1940  case ISD::OR:
1941  case ISD::XOR:
1942  case ISD::FADD:
1943  return LegalizationCost + 2;
1944  default:
1945  return InstructionCost::getInvalid();
1946  }
1947 }
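// Worked example, following the code above: an add reduction over a legal
// <vscale x 4 x i32> (LT.first == 1) costs 0 + 2. Assuming <vscale x 8 x i32>
// splits once, the cost of one add on the legal type is charged first, then
// the final reduction cost of 2. Opcodes outside ADD/AND/OR/XOR/FADD report
// an invalid cost.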
1948 
1949 InstructionCost
1950 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1951  Optional<FastMathFlags> FMF,
1952  TTI::TargetCostKind CostKind) {
1953  if (TTI::requiresOrderedReduction(FMF)) {
1954  if (!isa<ScalableVectorType>(ValTy))
1955  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1956 
1957  if (Opcode != Instruction::FAdd)
1958  return InstructionCost::getInvalid();
1959 
1960  auto *VTy = cast<ScalableVectorType>(ValTy);
1961  InstructionCost Cost =
1962  getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
1963  Cost *= getMaxNumElements(VTy->getElementCount());
1964  return Cost;
1965  }
1966 
1967  if (isa<ScalableVectorType>(ValTy))
1968  return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
1969 
1970  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1971  MVT MTy = LT.second;
1972  int ISD = TLI->InstructionOpcodeToISD(Opcode);
1973  assert(ISD && "Invalid opcode");
1974 
1975  // Horizontal adds can use the 'addv' instruction. We model the cost of these
1976  // instructions as twice a normal vector add, plus 1 for each additional
1977  // legalization step (LT.first - 1). This is the only arithmetic vector
1978  // reduction operation for which we have an instruction.
1979  // OR, XOR and AND costs should match the codegen from:
1980  // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
1981  // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
1982  // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
1983  static const CostTblEntry CostTblNoPairwise[]{
1984  {ISD::ADD, MVT::v8i8, 2},
1985  {ISD::ADD, MVT::v16i8, 2},
1986  {ISD::ADD, MVT::v4i16, 2},
1987  {ISD::ADD, MVT::v8i16, 2},
1988  {ISD::ADD, MVT::v4i32, 2},
1989  {ISD::OR, MVT::v8i8, 15},
1990  {ISD::OR, MVT::v16i8, 17},
1991  {ISD::OR, MVT::v4i16, 7},
1992  {ISD::OR, MVT::v8i16, 9},
1993  {ISD::OR, MVT::v2i32, 3},
1994  {ISD::OR, MVT::v4i32, 5},
1995  {ISD::OR, MVT::v2i64, 3},
1996  {ISD::XOR, MVT::v8i8, 15},
1997  {ISD::XOR, MVT::v16i8, 17},
1998  {ISD::XOR, MVT::v4i16, 7},
1999  {ISD::XOR, MVT::v8i16, 9},
2000  {ISD::XOR, MVT::v2i32, 3},
2001  {ISD::XOR, MVT::v4i32, 5},
2002  {ISD::XOR, MVT::v2i64, 3},
2003  {ISD::AND, MVT::v8i8, 15},
2004  {ISD::AND, MVT::v16i8, 17},
2005  {ISD::AND, MVT::v4i16, 7},
2006  {ISD::AND, MVT::v8i16, 9},
2007  {ISD::AND, MVT::v2i32, 3},
2008  {ISD::AND, MVT::v4i32, 5},
2009  {ISD::AND, MVT::v2i64, 3},
2010  };
2011  switch (ISD) {
2012  default:
2013  break;
2014  case ISD::ADD:
2015  if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
2016  return (LT.first - 1) + Entry->Cost;
2017  break;
2018  case ISD::XOR:
2019  case ISD::AND:
2020  case ISD::OR:
2021  const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
2022  if (!Entry)
2023  break;
2024  auto *ValVTy = cast<FixedVectorType>(ValTy);
2025  if (!ValVTy->getElementType()->isIntegerTy(1) &&
2026  MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
2027  isPowerOf2_32(ValVTy->getNumElements())) {
2028  InstructionCost ExtraCost = 0;
2029  if (LT.first != 1) {
2030  // Type needs to be split, so there is an extra cost of LT.first - 1
2031  // arithmetic ops.
2032  auto *Ty = FixedVectorType::get(ValTy->getElementType(),
2033  MTy.getVectorNumElements());
2034  ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
2035  ExtraCost *= LT.first - 1;
2036  }
2037  return Entry->Cost + ExtraCost;
2038  }
2039  break;
2040  }
2041  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2042 }
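// Worked example, using CostTblNoPairwise above (legalization counts assumed
// for 128-bit NEON): an add reduction of <8 x i16> is a single addv with
// LT.first == 1, so the cost is 2; an add reduction of <16 x i32>, assumed to
// split into four v4i32 parts, costs (4 - 1) + 2 == 5; an or reduction of
// <4 x i32> takes its table cost of 5 directly.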
2043 
2044 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
2045  static const CostTblEntry ShuffleTbl[] = {
2046  { TTI::SK_Splice, MVT::nxv16i8, 1 },
2047  { TTI::SK_Splice, MVT::nxv8i16, 1 },
2048  { TTI::SK_Splice, MVT::nxv4i32, 1 },
2049  { TTI::SK_Splice, MVT::nxv2i64, 1 },
2050  { TTI::SK_Splice, MVT::nxv2f16, 1 },
2051  { TTI::SK_Splice, MVT::nxv4f16, 1 },
2052  { TTI::SK_Splice, MVT::nxv8f16, 1 },
2053  { TTI::SK_Splice, MVT::nxv2bf16, 1 },
2054  { TTI::SK_Splice, MVT::nxv4bf16, 1 },
2055  { TTI::SK_Splice, MVT::nxv8bf16, 1 },
2056  { TTI::SK_Splice, MVT::nxv2f32, 1 },
2057  { TTI::SK_Splice, MVT::nxv4f32, 1 },
2058  { TTI::SK_Splice, MVT::nxv2f64, 1 },
2059  };
2060 
2061  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2062  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
2063  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2064  EVT PromotedVT = LT.second.getScalarType() == MVT::i1
2065  ? TLI->getPromotedVTForPredicate(EVT(LT.second))
2066  : LT.second;
2067  Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
2068  InstructionCost LegalizationCost = 0;
2069  if (Index < 0) {
2070  LegalizationCost =
2071  getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
2072  CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2073  getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
2074  CmpInst::BAD_ICMP_PREDICATE, CostKind);
2075  }
2076 
2077  // Predicated splices are promoted during lowering; see AArch64ISelLowering.cpp.
2078  // The cost is therefore computed on the promoted type.
2079  if (LT.second.getScalarType() == MVT::i1) {
2080  LegalizationCost +=
2081  getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
2082  TTI::CastContextHint::None, CostKind) +
2083  getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
2084  TTI::CastContextHint::None, CostKind);
2085  }
2086  const auto *Entry =
2087  CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
2088  assert(Entry && "Illegal Type for Splice");
2089  LegalizationCost += Entry->Cost;
2090  return LegalizationCost * LT.first;
2091 }
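// Worked example, reading ShuffleTbl above: a splice of two legal
// <vscale x 4 x i32> vectors with a non-negative index needs no predicate
// compare/select and costs 1 * LT.first == 1. Predicate splices such as
// <vscale x 16 x i1> are costed on the promoted integer type instead, adding
// the zext/trunc casts charged above.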
2092 
2093 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
2094  VectorType *Tp,
2095  ArrayRef<int> Mask, int Index,
2096  VectorType *SubTp) {
2097  Kind = improveShuffleKindFromMask(Kind, Mask);
2098  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
2099  Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
2100  Kind == TTI::SK_Reverse) {
2101  static const CostTblEntry ShuffleTbl[] = {
2102  // Broadcast shuffle kinds can be performed with 'dup'.
2103  { TTI::SK_Broadcast, MVT::v8i8, 1 },
2104  { TTI::SK_Broadcast, MVT::v16i8, 1 },
2105  { TTI::SK_Broadcast, MVT::v4i16, 1 },
2106  { TTI::SK_Broadcast, MVT::v8i16, 1 },
2107  { TTI::SK_Broadcast, MVT::v2i32, 1 },
2108  { TTI::SK_Broadcast, MVT::v4i32, 1 },
2109  { TTI::SK_Broadcast, MVT::v2i64, 1 },
2110  { TTI::SK_Broadcast, MVT::v2f32, 1 },
2111  { TTI::SK_Broadcast, MVT::v4f32, 1 },
2112  { TTI::SK_Broadcast, MVT::v2f64, 1 },
2113  // Transpose shuffle kinds can be performed with 'trn1/trn2' and
2114  // 'zip1/zip2' instructions.
2115  { TTI::SK_Transpose, MVT::v8i8, 1 },
2116  { TTI::SK_Transpose, MVT::v16i8, 1 },
2117  { TTI::SK_Transpose, MVT::v4i16, 1 },
2118  { TTI::SK_Transpose, MVT::v8i16, 1 },
2119  { TTI::SK_Transpose, MVT::v2i32, 1 },
2120  { TTI::SK_Transpose, MVT::v4i32, 1 },
2121  { TTI::SK_Transpose, MVT::v2i64, 1 },
2122  { TTI::SK_Transpose, MVT::v2f32, 1 },
2123  { TTI::SK_Transpose, MVT::v4f32, 1 },
2124  { TTI::SK_Transpose, MVT::v2f64, 1 },
2125  // Select shuffle kinds.
2126  // TODO: handle vXi8/vXi16.
2127  { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
2128  { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
2129  { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
2130  { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
2131  { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
2132  { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
2133  // PermuteSingleSrc shuffle kinds.
2134  { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
2135  { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
2136  { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
2137  { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
2138  { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
2139  { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
2140  { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case.
2141  { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case.
2142  { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case.
2143  { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl
2144  { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl
2145  { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl
2146  { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl
2147  { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl
2148  // Reverse can be lowered with `rev`.
2149  { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov.
2150  { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT
2151  { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov.
2152  { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov.
2153  { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT
2154  { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov.
2155  // Broadcast shuffle kinds for scalable vectors
2156  { TTI::SK_Broadcast, MVT::nxv16i8, 1 },
2157  { TTI::SK_Broadcast, MVT::nxv8i16, 1 },
2158  { TTI::SK_Broadcast, MVT::nxv4i32, 1 },
2159  { TTI::SK_Broadcast, MVT::nxv2i64, 1 },
2160  { TTI::SK_Broadcast, MVT::nxv2f16, 1 },
2161  { TTI::SK_Broadcast, MVT::nxv4f16, 1 },
2162  { TTI::SK_Broadcast, MVT::nxv8f16, 1 },
2163  { TTI::SK_Broadcast, MVT::nxv2bf16, 1 },
2164  { TTI::SK_Broadcast, MVT::nxv4bf16, 1 },
2165  { TTI::SK_Broadcast, MVT::nxv8bf16, 1 },
2166  { TTI::SK_Broadcast, MVT::nxv2f32, 1 },
2167  { TTI::SK_Broadcast, MVT::nxv4f32, 1 },
2168  { TTI::SK_Broadcast, MVT::nxv2f64, 1 },
2169  { TTI::SK_Broadcast, MVT::nxv16i1, 1 },
2170  { TTI::SK_Broadcast, MVT::nxv8i1, 1 },
2171  { TTI::SK_Broadcast, MVT::nxv4i1, 1 },
2172  { TTI::SK_Broadcast, MVT::nxv2i1, 1 },
2173  // Handle the cases for vector.reverse with scalable vectors
2174  { TTI::SK_Reverse, MVT::nxv16i8, 1 },
2175  { TTI::SK_Reverse, MVT::nxv8i16, 1 },
2176  { TTI::SK_Reverse, MVT::nxv4i32, 1 },
2177  { TTI::SK_Reverse, MVT::nxv2i64, 1 },
2178  { TTI::SK_Reverse, MVT::nxv2f16, 1 },
2179  { TTI::SK_Reverse, MVT::nxv4f16, 1 },
2180  { TTI::SK_Reverse, MVT::nxv8f16, 1 },
2181  { TTI::SK_Reverse, MVT::nxv2bf16, 1 },
2182  { TTI::SK_Reverse, MVT::nxv4bf16, 1 },
2183  { TTI::SK_Reverse, MVT::nxv8bf16, 1 },
2184  { TTI::SK_Reverse, MVT::nxv2f32, 1 },
2185  { TTI::SK_Reverse, MVT::nxv4f32, 1 },
2186  { TTI::SK_Reverse, MVT::nxv2f64, 1 },
2187  { TTI::SK_Reverse, MVT::nxv16i1, 1 },
2188  { TTI::SK_Reverse, MVT::nxv8i1, 1 },
2189  { TTI::SK_Reverse, MVT::nxv4i1, 1 },
2190  { TTI::SK_Reverse, MVT::nxv2i1, 1 },
2191  };
2192  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
2193  if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
2194  return LT.first * Entry->Cost;
2195  }
2196  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
2197  return getSpliceCost(Tp, Index);
2198  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
2199 }
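// Worked example, reading ShuffleTbl above: broadcasting lane 0 of a
// <4 x i32> vector is a single dup, so SK_Broadcast/v4i32 costs 1 * LT.first
// == 1, while SK_Reverse on <4 x i32> needs a REV64 plus an EXT and costs 2.
// Scalable SK_Splice shuffles are forwarded to getSpliceCost, and anything not
// covered by the table falls back to the base implementation.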