LLVM 23.0.0git
AArch64TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
// Boolean command-line option "enable-falkor-hwpf-unroll-fix". It is marked
// cl::Hidden and is initialised to true, so it is on unless overridden.
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
45
// Unsigned command-line option "sve-scatter-overhead"; cl::Hidden, initial
// value 10.
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
// Unsigned command-line option "sve-tail-folding-insn-threshold"; cl::Hidden,
// initial value 15.
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
// Boolean command-line option "enable-aarch64-or-like-select"; cl::Hidden,
// initialised to true.
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
// Boolean command-line option "enable-aarch64-lsr-cost-opt"; cl::Hidden,
// initialised to true.
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
// File-scope TailFoldingOption instance.
// NOTE(review): presumably the storage the -sve-tail-folding option declared
// below parses into — confirm against the option's full declaration.
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
215
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
249 return true;
250 }
251 }
252 return false;
253}
254
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.split(Features, ",");
261}
262
265 extractAttrFeatures(F, this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
271 extractAttrFeatures(F, this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
276 return F.hasFnAttribute("fmv-features");
277}
278
// Feature bits that the inlining-compatibility check later in this file XORs
// into both the caller's and the callee's feature sets before testing that
// the callee's features are a subset of the caller's. As explained there,
// these are features that express restrictions rather than capabilities;
// currently only FeatureExecuteOnly is listed.
279const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
280 AArch64::FeatureExecuteOnly,
281};
282
284 const Function *Callee) const {
285 SMECallAttrs CallAttrs(*Caller, *Callee);
286
287 // Never inline a function explicitly marked as being streaming,
288 // into a non-streaming function. Assume it was marked as streaming
289 // for a reason.
290 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
292 return false;
293
294 // When inlining, we should consider the body of the function, not the
295 // interface.
296 if (CallAttrs.callee().hasStreamingBody()) {
297 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
298 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
299 }
300
301 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
302 return false;
303
304 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
305 CallAttrs.requiresPreservingZT0() ||
306 CallAttrs.requiresPreservingAllZAState()) {
307 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
308 return false;
309 }
310
311 const TargetMachine &TM = getTLI()->getTargetMachine();
312 const FeatureBitset &CallerBits =
313 TM.getSubtargetImpl(*Caller)->getFeatureBits();
314 const FeatureBitset &CalleeBits =
315 TM.getSubtargetImpl(*Callee)->getFeatureBits();
316 // Adjust the feature bitsets by inverting some of the bits. This is needed
317 // for target features that represent restrictions rather than capabilities,
318 // for example a "+execute-only" callee can be inlined into a caller without
319 // "+execute-only", but not vice versa.
320 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
321 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
322
323 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
324}
325
327 const Function *Callee,
328 ArrayRef<Type *> Types) const {
329 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
330 return false;
331
332 // We need to ensure that argument promotion does not attempt to promote
333 // pointers to fixed-length vector types larger than 128 bits like
334 // <8 x float> (and pointers to aggregate types which have such fixed-length
335 // vector type members) into the values of the pointees. Such vector types
336 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
337 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
338 // types can be safely treated as 128-bit NEON types and they cannot be
339 // distinguished in IR.
340 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
341 auto FVTy = dyn_cast<FixedVectorType>(Ty);
342 return FVTy &&
343 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
344 }))
345 return false;
346
347 return true;
348}
349
350unsigned
352 unsigned DefaultCallPenalty) const {
353 // This function calculates a penalty for executing Call in F.
354 //
355 // There are two ways this function can be called:
356 // (1) F:
357 // call from F -> G (the call here is Call)
358 //
359 // For (1), Call.getCaller() == F, so it will always return a high cost if
360 // a streaming-mode change is required (thus promoting the need to inline the
361 // function)
362 //
363 // (2) F:
364 // call from F -> G (the call here is not Call)
365 // G:
366 // call from G -> H (the call here is Call)
367 //
368 // For (2), if after inlining the body of G into F the call to H requires a
369 // streaming-mode change, and the call to G from F would also require a
370 // streaming-mode change, then there is benefit to do the streaming-mode
371 // change only once and avoid inlining of G into F.
372
373 SMEAttrs FAttrs(*F);
374 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
375
376 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
377 if (F == Call.getCaller()) // (1)
378 return CallPenaltyChangeSM * DefaultCallPenalty;
379 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
380 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
381 }
382
383 return DefaultCallPenalty;
384}
385
389
390 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
391 return true;
392
394 ST->isSVEorStreamingSVEAvailable() &&
395 !ST->disableMaximizeScalableBandwidth();
396}
397
398/// Calculate the cost of materializing a 64-bit value. This helper
399/// method might only calculate a fraction of a larger immediate. Therefore it
400/// is valid to return a cost of ZERO.
402 // Check if the immediate can be encoded within an instruction.
403 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
404 return 0;
405
406 if (Val < 0)
407 Val = ~Val;
408
409 // Calculate how many moves we will need to materialize this constant.
411 AArch64_IMM::expandMOVImm(Val, 64, Insn);
412 return Insn.size();
413}
414
415/// Calculate the cost of materializing the given constant.
419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 if (BitSize == 0)
423 return ~0U;
424
425 // Sign-extend all constants to a multiple of 64-bit.
426 APInt ImmVal = Imm;
427 if (BitSize & 0x3f)
428 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
429
430 // Split the constant into 64-bit chunks and calculate the cost for each
431 // chunk.
433 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
434 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
435 int64_t Val = Tmp.getSExtValue();
436 Cost += getIntImmCost(Val);
437 }
438 // We need at least one instruction to materialize the constant.
439 return std::max<InstructionCost>(1, Cost);
440}
441
443 const APInt &Imm, Type *Ty,
445 Instruction *Inst) const {
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 unsigned ImmIdx = ~0U;
455 switch (Opcode) {
456 default:
457 return TTI::TCC_Free;
458 case Instruction::GetElementPtr:
459 // Always hoist the base address of a GetElementPtr.
460 if (Idx == 0)
461 return 2 * TTI::TCC_Basic;
462 return TTI::TCC_Free;
463 case Instruction::Store:
464 ImmIdx = 0;
465 break;
466 case Instruction::Add:
467 case Instruction::Sub:
468 case Instruction::Mul:
469 case Instruction::UDiv:
470 case Instruction::SDiv:
471 case Instruction::URem:
472 case Instruction::SRem:
473 case Instruction::And:
474 case Instruction::Or:
475 case Instruction::Xor:
476 case Instruction::ICmp:
477 ImmIdx = 1;
478 break;
479 // Always return TCC_Free for the shift value of a shift instruction.
480 case Instruction::Shl:
481 case Instruction::LShr:
482 case Instruction::AShr:
483 if (Idx == 1)
484 return TTI::TCC_Free;
485 break;
486 case Instruction::Trunc:
487 case Instruction::ZExt:
488 case Instruction::SExt:
489 case Instruction::IntToPtr:
490 case Instruction::PtrToInt:
491 case Instruction::BitCast:
492 case Instruction::PHI:
493 case Instruction::Call:
494 case Instruction::Select:
495 case Instruction::Ret:
496 case Instruction::Load:
497 break;
498 }
499
500 if (Idx == ImmIdx) {
501 int NumConstants = (BitSize + 63) / 64;
503 return (Cost <= NumConstants * TTI::TCC_Basic)
504 ? static_cast<int>(TTI::TCC_Free)
505 : Cost;
506 }
508}
509
512 const APInt &Imm, Type *Ty,
514 assert(Ty->isIntegerTy());
515
516 unsigned BitSize = Ty->getPrimitiveSizeInBits();
517 // There is no cost model for constants with a bit size of 0. Return TCC_Free
518 // here, so that constant hoisting will ignore this constant.
519 if (BitSize == 0)
520 return TTI::TCC_Free;
521
522 // Most (all?) AArch64 intrinsics do not support folding immediates into the
523 // selected instruction, so we compute the materialization cost for the
524 // immediate directly.
525 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
527
528 switch (IID) {
529 default:
530 return TTI::TCC_Free;
531 case Intrinsic::sadd_with_overflow:
532 case Intrinsic::uadd_with_overflow:
533 case Intrinsic::ssub_with_overflow:
534 case Intrinsic::usub_with_overflow:
535 case Intrinsic::smul_with_overflow:
536 case Intrinsic::umul_with_overflow:
537 if (Idx == 1) {
538 int NumConstants = (BitSize + 63) / 64;
540 return (Cost <= NumConstants * TTI::TCC_Basic)
541 ? static_cast<int>(TTI::TCC_Free)
542 : Cost;
543 }
544 break;
545 case Intrinsic::experimental_stackmap:
546 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
547 return TTI::TCC_Free;
548 break;
549 case Intrinsic::experimental_patchpoint_void:
550 case Intrinsic::experimental_patchpoint:
551 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
552 return TTI::TCC_Free;
553 break;
554 case Intrinsic::experimental_gc_statepoint:
555 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
556 return TTI::TCC_Free;
557 break;
558 }
560}
561
563AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
564 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
565 if (TyWidth == 32 || TyWidth == 64)
567 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
568 return TTI::PSK_Software;
569}
570
572 // MispredictPenalty is defined per-CPU in AArch64Sched*.td (e.g.,
573 // AArch64SchedNeoverseV2.td).
574 return ST->getSchedModel().MispredictPenalty;
575}
576
577static bool isUnpackedVectorVT(EVT VecVT) {
578 return VecVT.isScalableVector() &&
580}
581
583 const IntrinsicCostAttributes &ICA) {
584 // We need to know at least the number of elements in the vector of buckets
585 // and the size of each element to update.
586 if (ICA.getArgTypes().size() < 2)
588
589 // Only interested in costing for the hardware instruction from SVE2.
590 if (!ST->hasSVE2())
592
593 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
594 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
595 unsigned TotalHistCnts = 1;
596
597 unsigned EltSize = EltTy->getScalarSizeInBits();
598 // Only allow (up to 64b) integers or pointers
599 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
601
602 // FIXME: We should be able to generate histcnt for fixed-length vectors
603 // using ptrue with a specific VL.
604 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
605 unsigned EC = VTy->getElementCount().getKnownMinValue();
606 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
608
609 // HistCnt only supports 32b and 64b element types
610 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
611
612 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
614
615 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
616 TotalHistCnts = EC / NaturalVectorWidth;
617
618 return InstructionCost(BaseHistCntCost * TotalHistCnts);
619 }
620
622}
623
627 // The code-generator is currently not able to handle scalable vectors
628 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
629 // it. This change will be removed when code-generation for these types is
630 // sufficiently reliable.
631 auto *RetTy = ICA.getReturnType();
632 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
633 if (VTy->getElementCount() == ElementCount::getScalable(1))
635
636 switch (ICA.getID()) {
637 case Intrinsic::experimental_vector_histogram_add: {
638 InstructionCost HistCost = getHistogramCost(ST, ICA);
639 // If the cost isn't valid, we may still be able to scalarize
640 if (HistCost.isValid())
641 return HistCost;
642 break;
643 }
644 case Intrinsic::clmul: {
645 auto LT = getTypeLegalizationCost(RetTy);
646
647 // PMUL v8i8/v16i8 is always available on AArch64
648 if (ST->hasNEON()) {
649 if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
650 return LT.first;
651
652 // Scalar i8 lowers through scalar/vector moves around PMUL.
653 if (TLI->getValueType(DL, RetTy, true) == MVT::i8) {
654 auto *VecTy =
655 FixedVectorType::get(Type::getInt8Ty(RetTy->getContext()), 8);
656 return 1 +
657 getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
658 -1, nullptr, nullptr) *
659 2 +
660 getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
661 -1, nullptr, nullptr);
662 }
663 }
664
665 if (LT.second.SimpleTy == MVT::nxv2i64)
666 if (ST->hasSVEAES() && (ST->isSVEAvailable() || ST->hasSSVE_AES()))
667 return LT.first * 3;
668
669 if (ST->hasSVE2() || ST->hasSME()) {
670 switch (LT.second.SimpleTy) {
671 case MVT::nxv16i8:
672 return LT.first;
673 case MVT::nxv8i16:
674 return LT.first * 6;
675 case MVT::nxv4i32:
676 return LT.first * 3;
677 case MVT::nxv2i64:
678 return LT.first * 8;
679 default:
680 break;
681 }
682 }
683
684 // Avoid +sve giving this cost 2 due to custom lowering: It's very slow
685 if (LT.second.SimpleTy == MVT::nxv2i64)
686 return 192;
687
688 if (ST->hasAES()) {
689 switch (LT.second.SimpleTy) {
690 case MVT::i16:
691 case MVT::i32:
692 case MVT::i64:
693 case MVT::i128: {
694 auto *VecTy =
695 FixedVectorType::get(Type::getInt64Ty(RetTy->getContext()), 1);
696 return LT.first *
697 (1 +
698 getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
699 -1, nullptr, nullptr) *
700 2 +
701 getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
702 -1, nullptr, nullptr));
703 }
704 case MVT::v1i64:
705 return LT.first;
706 case MVT::v2i64:
707 return LT.first * 3;
708 case MVT::v2i32:
709 return LT.first * 6;
710 default:
711 break;
712 }
713 }
714 break;
715 }
716 case Intrinsic::umin:
717 case Intrinsic::umax:
718 case Intrinsic::smin:
719 case Intrinsic::smax: {
720 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
721 MVT::v8i16, MVT::v2i32, MVT::v4i32,
722 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
723 MVT::nxv2i64};
724 auto LT = getTypeLegalizationCost(RetTy);
725 // v2i64 types get converted to cmp+bif hence the cost of 2
726 if (LT.second == MVT::v2i64)
727 return LT.first * 2;
728 if (any_of(ValidMinMaxTys, equal_to(LT.second)))
729 return LT.first;
730 break;
731 }
732 case Intrinsic::scmp:
733 case Intrinsic::ucmp: {
734 static const CostTblEntry BitreverseTbl[] = {
735 {Intrinsic::scmp, MVT::i32, 3}, // cmp+cset+csinv
736 {Intrinsic::scmp, MVT::i64, 3}, // cmp+cset+csinv
737 {Intrinsic::scmp, MVT::v8i8, 3}, // cmgt+cmgt+sub
738 {Intrinsic::scmp, MVT::v16i8, 3}, // cmgt+cmgt+sub
739 {Intrinsic::scmp, MVT::v4i16, 3}, // cmgt+cmgt+sub
740 {Intrinsic::scmp, MVT::v8i16, 3}, // cmgt+cmgt+sub
741 {Intrinsic::scmp, MVT::v2i32, 3}, // cmgt+cmgt+sub
742 {Intrinsic::scmp, MVT::v4i32, 3}, // cmgt+cmgt+sub
743 {Intrinsic::scmp, MVT::v1i64, 3}, // cmgt+cmgt+sub
744 {Intrinsic::scmp, MVT::v2i64, 3}, // cmgt+cmgt+sub
745 };
746 const auto LT = getTypeLegalizationCost(RetTy);
747 const auto *Entry =
748 CostTableLookup(BitreverseTbl, Intrinsic::scmp, LT.second);
749 if (Entry)
750 return Entry->Cost * LT.first;
751 break;
752 }
753 case Intrinsic::sadd_sat:
754 case Intrinsic::ssub_sat:
755 case Intrinsic::uadd_sat:
756 case Intrinsic::usub_sat: {
757 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
758 MVT::v8i16, MVT::v2i32, MVT::v4i32,
759 MVT::v2i64};
760 auto LT = getTypeLegalizationCost(RetTy);
761 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
762 // need to extend the type, as it uses shr(qadd(shl, shl)).
763 unsigned Instrs =
764 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
765 if (any_of(ValidSatTys, equal_to(LT.second)))
766 return LT.first * Instrs;
767
769 uint64_t VectorSize = TS.getKnownMinValue();
770
771 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
772 return LT.first * Instrs;
773
774 break;
775 }
776 case Intrinsic::abs: {
777 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
778 MVT::v8i16, MVT::v2i32, MVT::v4i32,
779 MVT::v2i64, MVT::nxv16i8, MVT::nxv8i16,
780 MVT::nxv4i32, MVT::nxv2i64};
781 auto LT = getTypeLegalizationCost(RetTy);
782 if (any_of(ValidAbsTys, equal_to(LT.second)))
783 return LT.first;
784 break;
785 }
786 case Intrinsic::bswap: {
787 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
788 MVT::v4i32, MVT::v2i64};
789 auto LT = getTypeLegalizationCost(RetTy);
790 if (any_of(ValidAbsTys, equal_to(LT.second)) &&
791 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
792 return LT.first;
793 break;
794 }
795 case Intrinsic::fma:
796 case Intrinsic::fmuladd: {
797 // Given a fma or fmuladd, cost it the same as a fmul instruction which are
798 // usually the same for costs. TODO: Add fp16 and bf16 expansion costs.
799 Type *EltTy = RetTy->getScalarType();
800 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
801 (EltTy->isHalfTy() && ST->hasFullFP16()))
802 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
803 break;
804 }
805 case Intrinsic::stepvector: {
806 InstructionCost Cost = 1; // Cost of the `index' instruction
807 auto LT = getTypeLegalizationCost(RetTy);
808 // Legalisation of illegal vectors involves an `index' instruction plus
809 // (LT.first - 1) vector adds.
810 if (LT.first > 1) {
811 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
812 InstructionCost AddCost =
813 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
814 Cost += AddCost * (LT.first - 1);
815 }
816 return Cost;
817 }
818 case Intrinsic::vector_extract:
819 case Intrinsic::vector_insert: {
820 // If both the vector and subvector types are legal types and the index
821 // is 0, then this should be a no-op or simple operation; return a
822 // relatively low cost.
823
824 // If arguments aren't actually supplied, then we cannot determine the
825 // value of the index. We also want to skip predicate types.
826 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
828 break;
829
830 LLVMContext &C = RetTy->getContext();
831 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
832 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
833 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
834 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
835 // Skip this if either the vector or subvector types are unpacked
836 // SVE types; they may get lowered to stack stores and loads.
837 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
838 break;
839
841 getTLI()->getTypeConversion(C, SubVecVT);
843 getTLI()->getTypeConversion(C, VecVT);
844 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
845 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
846 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
847 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
848 return TTI::TCC_Free;
849 break;
850 }
851 case Intrinsic::bitreverse: {
852 static const CostTblEntry BitreverseTbl[] = {
853 {Intrinsic::bitreverse, MVT::i32, 1},
854 {Intrinsic::bitreverse, MVT::i64, 1},
855 {Intrinsic::bitreverse, MVT::v8i8, 1},
856 {Intrinsic::bitreverse, MVT::v16i8, 1},
857 {Intrinsic::bitreverse, MVT::v4i16, 2},
858 {Intrinsic::bitreverse, MVT::v8i16, 2},
859 {Intrinsic::bitreverse, MVT::v2i32, 2},
860 {Intrinsic::bitreverse, MVT::v4i32, 2},
861 {Intrinsic::bitreverse, MVT::v1i64, 2},
862 {Intrinsic::bitreverse, MVT::v2i64, 2},
863 };
864 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
865 const auto *Entry =
866 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
867 if (Entry) {
868 // The cost model uses the legal type (i32) that i8 and i16 are converted
869 // to, so add 1 to match the actual lowering cost.
870 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
871 TLI->getValueType(DL, RetTy, true) == MVT::i16)
872 return LegalisationCost.first * Entry->Cost + 1;
873
874 return LegalisationCost.first * Entry->Cost;
875 }
876 break;
877 }
878 case Intrinsic::ctpop: {
879 if (!ST->hasNEON()) {
880 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
881 return getTypeLegalizationCost(RetTy).first * 12;
882 }
883 static const CostTblEntry CtpopCostTbl[] = {
884 {ISD::CTPOP, MVT::v2i64, 4},
885 {ISD::CTPOP, MVT::v4i32, 3},
886 {ISD::CTPOP, MVT::v8i16, 2},
887 {ISD::CTPOP, MVT::v16i8, 1},
888 {ISD::CTPOP, MVT::i64, 4},
889 {ISD::CTPOP, MVT::v2i32, 3},
890 {ISD::CTPOP, MVT::v4i16, 2},
891 {ISD::CTPOP, MVT::v8i8, 1},
892 {ISD::CTPOP, MVT::i32, 5},
893 // SVE types (For targets that override NEON for fixed length vectors)
894 {ISD::CTPOP, MVT::nxv2i64, 1},
895 {ISD::CTPOP, MVT::nxv4i32, 1},
896 {ISD::CTPOP, MVT::nxv8i16, 1},
897 {ISD::CTPOP, MVT::nxv16i8, 1},
898 };
899 auto LT = getTypeLegalizationCost(RetTy);
900 MVT MTy = LT.second;
901
902 // When SVE is available CNT will be used for fixed and scalable vectors.
903 if (ST->isSVEorStreamingSVEAvailable() && MTy.isFixedLengthVector())
905 128 / MTy.getScalarSizeInBits());
906
907 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
908 // Extra cost of +1 when illegal vector types are legalized by promoting
909 // the integer type.
910 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
911 RetTy->getScalarSizeInBits()
912 ? 1
913 : 0;
914 return LT.first * Entry->Cost + ExtraCost;
915 }
916 break;
917 }
918 case Intrinsic::sadd_with_overflow:
919 case Intrinsic::uadd_with_overflow:
920 case Intrinsic::ssub_with_overflow:
921 case Intrinsic::usub_with_overflow:
922 case Intrinsic::smul_with_overflow:
923 case Intrinsic::umul_with_overflow: {
924 static const CostTblEntry WithOverflowCostTbl[] = {
925 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
926 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
927 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
928 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
929 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
930 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
931 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
932 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
933 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
934 {Intrinsic::usub_with_overflow, MVT::i8, 3},
935 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
936 {Intrinsic::usub_with_overflow, MVT::i16, 3},
937 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
938 {Intrinsic::usub_with_overflow, MVT::i32, 1},
939 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
940 {Intrinsic::usub_with_overflow, MVT::i64, 1},
941 {Intrinsic::smul_with_overflow, MVT::i8, 5},
942 {Intrinsic::umul_with_overflow, MVT::i8, 4},
943 {Intrinsic::smul_with_overflow, MVT::i16, 5},
944 {Intrinsic::umul_with_overflow, MVT::i16, 4},
945 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
946 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
947 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
948 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
949 };
950 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
951 if (MTy.isSimple())
952 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
953 MTy.getSimpleVT()))
954 return Entry->Cost;
955 break;
956 }
957 case Intrinsic::fptosi_sat:
958 case Intrinsic::fptoui_sat: {
959 if (ICA.getArgTypes().empty())
960 break;
961 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
962 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
963 EVT MTy = TLI->getValueType(DL, RetTy);
964 // Check for the legal types, which are where the size of the input and the
965 // output are the same, or we are using cvt f64->i32 or f32->i64.
966 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
967 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
968 LT.second == MVT::v2f64)) {
969 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
970 (LT.second == MVT::f64 && MTy == MVT::i32) ||
971 (LT.second == MVT::f32 && MTy == MVT::i64)))
972 return LT.first;
973 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
974 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
975 MTy.getScalarSizeInBits() == 64)
976 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
977 }
978 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
979 // f32.
980 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
981 return LT.first + getIntrinsicInstrCost(
982 {ICA.getID(),
983 RetTy,
984 {ICA.getArgTypes()[0]->getWithNewType(
985 Type::getFloatTy(RetTy->getContext()))}},
986 CostKind);
987 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
988 (LT.second == MVT::f16 && MTy == MVT::i64) ||
989 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
990 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
991 return LT.first;
992 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
993 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
994 MTy.getScalarSizeInBits() == 32)
995 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
996 // Extending vector types v8f16->v8i64. These currently scalarize but the
997 // codegen could be better.
998 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
999 MTy.getScalarSizeInBits() == 64)
1000 return MTy.getVectorNumElements() * 3;
1001
1002 // If we can, we use a legal convert followed by a min+max.
1003 if ((LT.second.getScalarType() == MVT::f32 ||
1004 LT.second.getScalarType() == MVT::f64 ||
1005 LT.second.getScalarType() == MVT::f16) &&
1006 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1007 Type *LegalTy =
1008 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
1009 if (LT.second.isVector())
1010 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
1012 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1013 : Intrinsic::umin,
1014 LegalTy, {LegalTy, LegalTy});
1016 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1017 : Intrinsic::umax,
1018 LegalTy, {LegalTy, LegalTy});
1020 return LT.first * Cost +
1021 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
1022 : 1);
1023 }
1024 // Otherwise we need to follow the default expansion that clamps the value
1025 // using a float min/max with a fcmp+sel for nan handling when signed.
1026 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
1027 RetTy = RetTy->getScalarType();
1028 if (LT.second.isVector()) {
1029 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
1030 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
1031 }
1032 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
1034 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
1036 Cost +=
1037 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1038 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
1039 if (IsSigned) {
1040 Type *CondTy = RetTy->getWithNewBitWidth(1);
1041 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
1043 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1045 }
1046 return LT.first * Cost;
1047 }
1048 case Intrinsic::fshl:
1049 case Intrinsic::fshr: {
1050 if (ICA.getArgs().empty())
1051 break;
1052
1053 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
1054
1055 // ROTR / ROTL is a funnel shift with equal first and second operand. For
1056 // ROTR on integer registers (i32/i64) this can be done in a single ror
1057 // instruction. A fshl with a non-constant shift uses a neg + ror.
1058 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
1059 (RetTy->getPrimitiveSizeInBits() == 32 ||
1060 RetTy->getPrimitiveSizeInBits() == 64)) {
1061 InstructionCost NegCost =
1062 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
1063 return 1 + NegCost;
1064 }
1065
1066 // TODO: Add handling for fshl where third argument is not a constant.
1067 if (!OpInfoZ.isConstant())
1068 break;
1069
1070 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
1071 if (OpInfoZ.isUniform()) {
1072 static const CostTblEntry FshlTbl[] = {
1073 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
1074 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
1075 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
1076 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
1077 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
1078 // to avoid having to duplicate the costs.
1079 const auto *Entry =
1080 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
1081 if (Entry)
1082 return LegalisationCost.first * Entry->Cost;
1083 }
1084
1085 auto TyL = getTypeLegalizationCost(RetTy);
1086 if (!RetTy->isIntegerTy())
1087 break;
1088
1089 // Estimate cost manually, as types like i8 and i16 will get promoted to
1090 // i32 and CostTableLookup will ignore the extra conversion cost.
1091 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
1092 RetTy->getScalarSizeInBits() < 64) ||
1093 (RetTy->getScalarSizeInBits() % 64 != 0);
1094 unsigned ExtraCost = HigherCost ? 1 : 0;
1095 if (RetTy->getScalarSizeInBits() == 32 ||
1096 RetTy->getScalarSizeInBits() == 64)
1097 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
1098 // extr instruction.
1099 else if (HigherCost)
1100 ExtraCost = 1;
1101 else
1102 break;
1103 return TyL.first + ExtraCost;
1104 }
1105 case Intrinsic::get_active_lane_mask: {
1106 auto RetTy = cast<VectorType>(ICA.getReturnType());
1107 EVT RetVT = getTLI()->getValueType(DL, RetTy);
1108 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1109 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
1110 break;
1111
1112 if (RetTy->isScalableTy()) {
1113 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
1115 break;
1116
1117 auto LT = getTypeLegalizationCost(RetTy);
1118 InstructionCost Cost = LT.first;
1119 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1120 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1121 // nxv32i1 = get_active_lane_mask(base, idx) ->
1122 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1123 if (ST->hasSVE2p1() || ST->hasSME2()) {
1124 Cost /= 2;
1125 if (Cost == 1)
1126 return Cost;
1127 }
1128
1129 // If more than one whilelo intrinsic is required, include the extra cost
1130 // required by the saturating add & select required to increment the
1131 // start value after the first intrinsic call.
1132 Type *OpTy = ICA.getArgTypes()[0];
1133 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1134 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1135 Type *CondTy = OpTy->getWithNewBitWidth(1);
1136 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1138 return Cost + (SplitCost * (Cost - 1));
1139 } else if (!getTLI()->isTypeLegal(RetVT)) {
1140 // We don't have enough context at this point to determine if the mask
1141 // is going to be kept live after the block, which will force the vXi1
1142 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1143 // For now, we just assume the vectorizer created this intrinsic and
1144 // the result will be the input for a PHI. In this case the cost will
1145 // be extremely high for fixed-width vectors.
1146 // NOTE: getScalarizationOverhead returns a cost that's far too
1147 // pessimistic for the actual generated codegen. In reality there are
1148 // two instructions generated per lane.
1149 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1150 }
1151 break;
1152 }
1153 case Intrinsic::experimental_vector_match: {
1154 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1155 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1156 unsigned SearchSize = NeedleTy->getNumElements();
1157 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1158 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1159 // Neoverse V3, these are cheap operations with the same latency as a
1160 // vector ADD. In most cases, however, we also need to do an extra DUP.
1161 // For fixed-length vectors we currently need an extra five--six
1162 // instructions besides the MATCH.
1164 if (isa<FixedVectorType>(RetTy))
1165 Cost += 10;
1166 return Cost;
1167 }
1168 break;
1169 }
1170 case Intrinsic::cttz: {
1171 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1172 if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
1173 return LT.first * 2;
1174 if (LT.second == MVT::v4i16 || LT.second == MVT::v8i16 ||
1175 LT.second == MVT::v2i32 || LT.second == MVT::v4i32)
1176 return LT.first * 3;
1177 break;
1178 }
1179 case Intrinsic::experimental_cttz_elts: {
1180 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1181 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1182 // This will consist of a SVE brkb and a cntp instruction. These
1183 // typically have the same latency and half the throughput as a vector
1184 // add instruction.
1185 return 4;
1186 }
1187 break;
1188 }
1189 case Intrinsic::loop_dependence_raw_mask:
1190 case Intrinsic::loop_dependence_war_mask: {
1191 // The whilewr/rw instructions require SVE2 or SME.
1192 if (ST->hasSVE2() || ST->hasSME()) {
1193 EVT VecVT = getTLI()->getValueType(DL, RetTy);
1194 unsigned EltSizeInBytes =
1195 cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
1196 if (!is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) ||
1197 VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes))
1198 break;
1199 // For fixed-vector types we need to AND the mask with a ptrue vl<N>.
1200 return isa<FixedVectorType>(RetTy) ? 2 : 1;
1201 }
1202 break;
1203 }
1204 case Intrinsic::experimental_vector_extract_last_active:
1205 if (ST->isSVEorStreamingSVEAvailable()) {
1206 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1207 // This should turn into chained clastb instructions.
1208 return LegalCost;
1209 }
1210 break;
1211 case Intrinsic::pow: {
1212 // For scalar calls we know the target has the libcall, and for fixed-width
1213 // vectors we know for the worst case it can be scalarised.
1214 EVT VT = getTLI()->getValueType(DL, RetTy);
1215 RTLIB::Libcall LC = RTLIB::getPOW(VT);
1216 bool HasLibcall = getTLI()->getLibcallImpl(LC) != RTLIB::Unsupported;
1217 bool CanLowerWithLibcalls = !isa<ScalableVectorType>(RetTy) || HasLibcall;
1218
1219 // If we know that the call can be lowered with libcalls then it's safe to
1220 // reduce the costs in some cases. This is important for scalable vectors,
1221 // since we cannot scalarize the call in the absence of a vector math
1222 // library.
1223 if (CanLowerWithLibcalls && ICA.getInst() && !ICA.getArgs().empty()) {
1224 // If we know the fast math flags and the exponent is a constant then the
1225 // cost may be less for some exponents like 0.25 and 0.75.
1226 const Constant *ExpC = dyn_cast<Constant>(ICA.getArgs()[1]);
1227 if (ExpC && isa<VectorType>(ExpC->getType()))
1228 ExpC = ExpC->getSplatValue();
1229 if (auto *ExpF = dyn_cast_or_null<ConstantFP>(ExpC)) {
1230 // The argument must be a FP constant.
1231 bool Is025 = ExpF->getValueAPF().isExactlyValue(0.25);
1232 bool Is075 = ExpF->getValueAPF().isExactlyValue(0.75);
1233 FastMathFlags FMF = ICA.getInst()->getFastMathFlags();
1234 if ((Is025 || Is075) && FMF.noInfs() && FMF.approxFunc() &&
1235 (!Is025 || FMF.noSignedZeros())) {
1236 IntrinsicCostAttributes Attrs(Intrinsic::sqrt, RetTy, {RetTy}, FMF);
1238 if (Is025)
1239 return 2 * Sqrt;
1241 getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
1242 return (Sqrt * 2) + FMul;
1243 }
1244 // TODO: For 1/3 exponents we expect the cbrt call to be slightly
1245 // cheaper than pow.
1246 }
1247 }
1248
1249 if (HasLibcall)
1250 return getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
1251 break;
1252 }
1253 case Intrinsic::sqrt:
1254 case Intrinsic::fabs:
1255 case Intrinsic::ceil:
1256 case Intrinsic::floor:
1257 case Intrinsic::nearbyint:
1258 case Intrinsic::round:
1259 case Intrinsic::rint:
1260 case Intrinsic::roundeven:
1261 case Intrinsic::trunc:
1262 case Intrinsic::minnum:
1263 case Intrinsic::maxnum:
1264 case Intrinsic::minimum:
1265 case Intrinsic::maximum: {
1266 if (isa<ScalableVectorType>(RetTy) && ST->isSVEorStreamingSVEAvailable()) {
1267 auto LT = getTypeLegalizationCost(RetTy);
1268 return LT.first;
1269 }
1270 break;
1271 }
1272 default:
1273 break;
1274 }
1276}
1277
1278/// The function will remove redundant reinterprets casting in the presence
1279/// of the control flow
1280static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1281 IntrinsicInst &II) {
1283 auto RequiredType = II.getType();
1284
1285 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1286 assert(PN && "Expected Phi Node!");
1287
1288 // Don't create a new Phi unless we can remove the old one.
1289 if (!PN->hasOneUse())
1290 return std::nullopt;
1291
1292 for (Value *IncValPhi : PN->incoming_values()) {
1293 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1294 if (!Reinterpret ||
1295 Reinterpret->getIntrinsicID() !=
1296 Intrinsic::aarch64_sve_convert_to_svbool ||
1297 RequiredType != Reinterpret->getArgOperand(0)->getType())
1298 return std::nullopt;
1299 }
1300
1301 // Create the new Phi
1302 IC.Builder.SetInsertPoint(PN);
1303 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1304 Worklist.push_back(PN);
1305
1306 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1307 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1308 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1309 Worklist.push_back(Reinterpret);
1310 }
1311
1312 // Cleanup Phi Node and reinterprets
1313 return IC.replaceInstUsesWith(II, NPN);
1314}
1315
1316// A collection of properties common to SVE intrinsics that allow for combines
1317// to be written without needing to know the specific intrinsic.
1319 //
1320 // Helper routines for common intrinsic definitions.
1321 //
1322
1323 // e.g. llvm.aarch64.sve.add pg, op1, op2
1324 // with IID ==> llvm.aarch64.sve.add_u
1325 static SVEIntrinsicInfo
1332
1333 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1340
1341 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1347
1348 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1354
1355 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1356 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1357 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1358 return SVEIntrinsicInfo()
1361 }
1362
1363 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1364 // llvm.aarch64.sve.ld1 pg, ptr
1371
1372 // All properties relate to predication and thus having a governing predicate
1373 // is the minimum requirement to say there is intrinsic info to act on.
1374 explicit operator bool() const { return hasGoverningPredicate(); }
1375
1376 //
1377 // Properties relating to the governing predicate.
1378 //
1379
  // The index counts as set once it differs from the max() sentinel.
1381 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1382 }
1383
1385 assert(hasGoverningPredicate() && "Propery not set!");
1386 return GoverningPredicateIdx;
1387 }
1388
  // The index may only be assigned once; returns *this so calls can be chained.
1390 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1391 GoverningPredicateIdx = Index;
1392 return *this;
1393 }
1394
1395 //
1396 // Properties relating to operations the intrinsic could be transformed into.
1397 // NOTE: This does not mean such a transformation is always possible, but the
1398 // knowledge makes it possible to reuse existing optimisations without needing
1399 // to embed specific handling for each intrinsic. For example, instruction
1400 // simplification can be used to optimise an intrinsic's active lanes.
1401 //
1402
  // Intrinsic::not_intrinsic is the "not set" value.
1404 return UndefIntrinsic != Intrinsic::not_intrinsic;
1405 }
1406
1408 assert(hasMatchingUndefIntrinsic() && "Propery not set!");
1409 return UndefIntrinsic;
1410 }
1411
1413 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1414 UndefIntrinsic = IID;
1415 return *this;
1416 }
1417
  // An IROpcode of 0 means no matching IR opcode has been recorded.
1418 bool hasMatchingIROpode() const { return IROpcode != 0; }
1419
1420 unsigned getMatchingIROpode() const {
1421 assert(hasMatchingIROpode() && "Propery not set!");
1422 return IROpcode;
1423 }
1424
1426 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1427 IROpcode = Opcode;
1428 return *this;
1429 }
1430
1431 //
1432 // Properties relating to the result of inactive lanes.
1433 //
  // ResultLanes holds exactly one style; each assignment below is guarded by
  // an assert that it is still Uninitialized.
1434
1436 return ResultLanes == InactiveLanesTakenFromOperand;
1437 }
1438
1440 assert(inactiveLanesTakenFromOperand() && "Propery not set!");
1441 return OperandIdxForInactiveLanes;
1442 }
1443
  // Records both the style and the operand index that supplies inactive lanes.
1445 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1446 ResultLanes = InactiveLanesTakenFromOperand;
1447 OperandIdxForInactiveLanes = Index;
1448 return *this;
1449 }
1450
1452 return ResultLanes == InactiveLanesAreNotDefined;
1453 }
1454
1456 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1457 ResultLanes = InactiveLanesAreNotDefined;
1458 return *this;
1459 }
1460
1462 return ResultLanes == InactiveLanesAreUnused;
1463 }
1464
1466 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1467 ResultLanes = InactiveLanesAreUnused;
1468 return *this;
1469 }
1470
1471 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1472 // inactiveLanesAreZeroed =
1473 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1474 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1475
  // Unlike the other properties, this flag is set without a "set twice" check.
1477 ResultIsZeroInitialized = true;
1478 return *this;
1479 }
1480
1481 //
1482 // The first operand of unary merging operations is typically only used to
1483 // set the result for inactive lanes. Knowing this allows us to deadcode the
1484 // operand when we can prove there are no inactive lanes.
1485 //
1486
  // The index counts as set once it differs from the max() sentinel.
1488 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1489 }
1490
1492 assert(hasOperandWithNoActiveLanes() && "Propery not set!");
1493 return OperandIdxWithNoActiveLanes;
1494 }
1495
1497 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1498 OperandIdxWithNoActiveLanes = Index;
1499 return *this;
1500 }
1501
1502private:
  // Operand index of the governing predicate; max() is the "not set" sentinel.
1503 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1504
  // Intrinsic::not_intrinsic and 0 respectively mean "not set".
1505 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1506 unsigned IROpcode = 0;
1507
  // How the inactive lanes of the result are defined. Starts as Uninitialized
  // and is assigned at most once.
1508 enum PredicationStyle {
1510 InactiveLanesTakenFromOperand,
1511 InactiveLanesAreNotDefined,
1512 InactiveLanesAreUnused
1513 } ResultLanes = Uninitialized;
1514
1515 bool ResultIsZeroInitialized = false;
  // Assigned together with ResultLanes = InactiveLanesTakenFromOperand; only
  // read after asserting that style is in effect.
1516 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
  // max() is the "not set" sentinel.
1517 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1518};
1519
1521 // Some SVE intrinsics do not use scalable vector types, but since they are
1522 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1523 if (!isa<ScalableVectorType>(II.getType()) &&
1524 all_of(II.args(), [&](const Value *V) {
1525 return !isa<ScalableVectorType>(V->getType());
1526 }))
1527 return SVEIntrinsicInfo();
1528
1529 Intrinsic::ID IID = II.getIntrinsicID();
1530 switch (IID) {
1531 default:
1532 break;
1533 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1534 case Intrinsic::aarch64_sve_fcvt_f16f32:
1535 case Intrinsic::aarch64_sve_fcvt_f16f64:
1536 case Intrinsic::aarch64_sve_fcvt_f32f16:
1537 case Intrinsic::aarch64_sve_fcvt_f32f64:
1538 case Intrinsic::aarch64_sve_fcvt_f64f16:
1539 case Intrinsic::aarch64_sve_fcvt_f64f32:
1540 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1541 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1542 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1543 case Intrinsic::aarch64_sve_fcvtzs:
1544 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1545 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1546 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1547 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1548 case Intrinsic::aarch64_sve_fcvtzu:
1549 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1550 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1551 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1552 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1553 case Intrinsic::aarch64_sve_revb:
1554 case Intrinsic::aarch64_sve_revh:
1555 case Intrinsic::aarch64_sve_revw:
1556 case Intrinsic::aarch64_sve_revd:
1557 case Intrinsic::aarch64_sve_scvtf:
1558 case Intrinsic::aarch64_sve_scvtf_f16i32:
1559 case Intrinsic::aarch64_sve_scvtf_f16i64:
1560 case Intrinsic::aarch64_sve_scvtf_f32i64:
1561 case Intrinsic::aarch64_sve_scvtf_f64i32:
1562 case Intrinsic::aarch64_sve_ucvtf:
1563 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1564 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1565 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1566 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1568
1569 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1570 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1571 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1572 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1574
1575 case Intrinsic::aarch64_sve_fabd:
1576 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1577 case Intrinsic::aarch64_sve_fadd:
1578 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1579 .setMatchingIROpcode(Instruction::FAdd);
1580 case Intrinsic::aarch64_sve_fdiv:
1581 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1582 .setMatchingIROpcode(Instruction::FDiv);
1583 case Intrinsic::aarch64_sve_fmax:
1584 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1585 case Intrinsic::aarch64_sve_fmaxnm:
1586 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1587 case Intrinsic::aarch64_sve_fmin:
1588 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1589 case Intrinsic::aarch64_sve_fminnm:
1590 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1591 case Intrinsic::aarch64_sve_fmla:
1592 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1593 case Intrinsic::aarch64_sve_fmls:
1594 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1595 case Intrinsic::aarch64_sve_fmul:
1596 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1597 .setMatchingIROpcode(Instruction::FMul);
1598 case Intrinsic::aarch64_sve_fmulx:
1599 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1600 case Intrinsic::aarch64_sve_fnmla:
1601 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1602 case Intrinsic::aarch64_sve_fnmls:
1603 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1604 case Intrinsic::aarch64_sve_fsub:
1605 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1606 .setMatchingIROpcode(Instruction::FSub);
1607 case Intrinsic::aarch64_sve_add:
1608 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1609 .setMatchingIROpcode(Instruction::Add);
1610 case Intrinsic::aarch64_sve_mla:
1611 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1612 case Intrinsic::aarch64_sve_mls:
1613 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1614 case Intrinsic::aarch64_sve_mul:
1615 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1616 .setMatchingIROpcode(Instruction::Mul);
1617 case Intrinsic::aarch64_sve_sabd:
1618 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1619 case Intrinsic::aarch64_sve_sdiv:
1620 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1621 .setMatchingIROpcode(Instruction::SDiv);
1622 case Intrinsic::aarch64_sve_smax:
1623 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1624 case Intrinsic::aarch64_sve_smin:
1625 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1626 case Intrinsic::aarch64_sve_smulh:
1627 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1628 case Intrinsic::aarch64_sve_sub:
1629 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1630 .setMatchingIROpcode(Instruction::Sub);
1631 case Intrinsic::aarch64_sve_uabd:
1632 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1633 case Intrinsic::aarch64_sve_udiv:
1634 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1635 .setMatchingIROpcode(Instruction::UDiv);
1636 case Intrinsic::aarch64_sve_umax:
1637 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1638 case Intrinsic::aarch64_sve_umin:
1639 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1640 case Intrinsic::aarch64_sve_umulh:
1641 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1642 case Intrinsic::aarch64_sve_asr:
1643 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1644 .setMatchingIROpcode(Instruction::AShr);
1645 case Intrinsic::aarch64_sve_lsl:
1646 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1647 .setMatchingIROpcode(Instruction::Shl);
1648 case Intrinsic::aarch64_sve_lsr:
1649 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1650 .setMatchingIROpcode(Instruction::LShr);
1651 case Intrinsic::aarch64_sve_and:
1652 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1653 .setMatchingIROpcode(Instruction::And);
1654 case Intrinsic::aarch64_sve_bic:
1655 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1656 case Intrinsic::aarch64_sve_eor:
1657 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1658 .setMatchingIROpcode(Instruction::Xor);
1659 case Intrinsic::aarch64_sve_orr:
1660 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1661 .setMatchingIROpcode(Instruction::Or);
1662 case Intrinsic::aarch64_sve_shsub:
1663 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_shsub_u);
1664 case Intrinsic::aarch64_sve_shsubr:
1666 case Intrinsic::aarch64_sve_sqrshl:
1667 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1668 case Intrinsic::aarch64_sve_sqshl:
1669 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1670 case Intrinsic::aarch64_sve_sqsub:
1671 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1672 case Intrinsic::aarch64_sve_srshl:
1673 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1674 case Intrinsic::aarch64_sve_uhsub:
1675 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsub_u);
1676 case Intrinsic::aarch64_sve_uhsubr:
1678 case Intrinsic::aarch64_sve_uqrshl:
1679 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1680 case Intrinsic::aarch64_sve_uqshl:
1681 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1682 case Intrinsic::aarch64_sve_uqsub:
1683 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1684 case Intrinsic::aarch64_sve_urshl:
1685 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1686
1687 case Intrinsic::aarch64_sve_add_u:
1689 Instruction::Add);
1690 case Intrinsic::aarch64_sve_and_u:
1692 Instruction::And);
1693 case Intrinsic::aarch64_sve_asr_u:
1695 Instruction::AShr);
1696 case Intrinsic::aarch64_sve_eor_u:
1698 Instruction::Xor);
1699 case Intrinsic::aarch64_sve_fadd_u:
1701 Instruction::FAdd);
1702 case Intrinsic::aarch64_sve_fdiv_u:
1704 Instruction::FDiv);
1705 case Intrinsic::aarch64_sve_fmul_u:
1707 Instruction::FMul);
1708 case Intrinsic::aarch64_sve_fsub_u:
1710 Instruction::FSub);
1711 case Intrinsic::aarch64_sve_lsl_u:
1713 Instruction::Shl);
1714 case Intrinsic::aarch64_sve_lsr_u:
1716 Instruction::LShr);
1717 case Intrinsic::aarch64_sve_mul_u:
1719 Instruction::Mul);
1720 case Intrinsic::aarch64_sve_orr_u:
1722 Instruction::Or);
1723 case Intrinsic::aarch64_sve_sdiv_u:
1725 Instruction::SDiv);
1726 case Intrinsic::aarch64_sve_sub_u:
1728 Instruction::Sub);
1729 case Intrinsic::aarch64_sve_udiv_u:
1731 Instruction::UDiv);
1732
1733 case Intrinsic::aarch64_sve_addqv:
1734 case Intrinsic::aarch64_sve_and_z:
1735 case Intrinsic::aarch64_sve_bic_z:
1736 case Intrinsic::aarch64_sve_brka_z:
1737 case Intrinsic::aarch64_sve_brkb_z:
1738 case Intrinsic::aarch64_sve_brkn_z:
1739 case Intrinsic::aarch64_sve_brkpa_z:
1740 case Intrinsic::aarch64_sve_brkpb_z:
1741 case Intrinsic::aarch64_sve_cntp:
1742 case Intrinsic::aarch64_sve_compact:
1743 case Intrinsic::aarch64_sve_eor_z:
1744 case Intrinsic::aarch64_sve_eorv:
1745 case Intrinsic::aarch64_sve_eorqv:
1746 case Intrinsic::aarch64_sve_nand_z:
1747 case Intrinsic::aarch64_sve_nor_z:
1748 case Intrinsic::aarch64_sve_orn_z:
1749 case Intrinsic::aarch64_sve_orr_z:
1750 case Intrinsic::aarch64_sve_orv:
1751 case Intrinsic::aarch64_sve_orqv:
1752 case Intrinsic::aarch64_sve_pnext:
1753 case Intrinsic::aarch64_sve_rdffr_z:
1754 case Intrinsic::aarch64_sve_saddv:
1755 case Intrinsic::aarch64_sve_uaddv:
1756 case Intrinsic::aarch64_sve_umaxv:
1757 case Intrinsic::aarch64_sve_umaxqv:
1758 case Intrinsic::aarch64_sve_cmpeq:
1759 case Intrinsic::aarch64_sve_cmpeq_wide:
1760 case Intrinsic::aarch64_sve_cmpge:
1761 case Intrinsic::aarch64_sve_cmpge_wide:
1762 case Intrinsic::aarch64_sve_cmpgt:
1763 case Intrinsic::aarch64_sve_cmpgt_wide:
1764 case Intrinsic::aarch64_sve_cmphi:
1765 case Intrinsic::aarch64_sve_cmphi_wide:
1766 case Intrinsic::aarch64_sve_cmphs:
1767 case Intrinsic::aarch64_sve_cmphs_wide:
1768 case Intrinsic::aarch64_sve_cmple_wide:
1769 case Intrinsic::aarch64_sve_cmplo_wide:
1770 case Intrinsic::aarch64_sve_cmpls_wide:
1771 case Intrinsic::aarch64_sve_cmplt_wide:
1772 case Intrinsic::aarch64_sve_cmpne:
1773 case Intrinsic::aarch64_sve_cmpne_wide:
1774 case Intrinsic::aarch64_sve_facge:
1775 case Intrinsic::aarch64_sve_facgt:
1776 case Intrinsic::aarch64_sve_fcmpeq:
1777 case Intrinsic::aarch64_sve_fcmpge:
1778 case Intrinsic::aarch64_sve_fcmpgt:
1779 case Intrinsic::aarch64_sve_fcmpne:
1780 case Intrinsic::aarch64_sve_fcmpuo:
1781 case Intrinsic::aarch64_sve_ld1:
1782 case Intrinsic::aarch64_sve_ld1_gather:
1783 case Intrinsic::aarch64_sve_ld1_gather_index:
1784 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1785 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1786 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1787 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1788 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1789 case Intrinsic::aarch64_sve_ld1q_gather_index:
1790 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1791 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1792 case Intrinsic::aarch64_sve_ld1ro:
1793 case Intrinsic::aarch64_sve_ld1rq:
1794 case Intrinsic::aarch64_sve_ld1udq:
1795 case Intrinsic::aarch64_sve_ld1uwq:
1796 case Intrinsic::aarch64_sve_ld2_sret:
1797 case Intrinsic::aarch64_sve_ld2q_sret:
1798 case Intrinsic::aarch64_sve_ld3_sret:
1799 case Intrinsic::aarch64_sve_ld3q_sret:
1800 case Intrinsic::aarch64_sve_ld4_sret:
1801 case Intrinsic::aarch64_sve_ld4q_sret:
1802 case Intrinsic::aarch64_sve_ldff1:
1803 case Intrinsic::aarch64_sve_ldff1_gather:
1804 case Intrinsic::aarch64_sve_ldff1_gather_index:
1805 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1806 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1807 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1808 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1809 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1810 case Intrinsic::aarch64_sve_ldnf1:
1811 case Intrinsic::aarch64_sve_ldnt1:
1812 case Intrinsic::aarch64_sve_ldnt1_gather:
1813 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1814 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1815 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1817
1818 case Intrinsic::aarch64_sve_prf:
1819 case Intrinsic::aarch64_sve_prfb_gather_index:
1820 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1821 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1822 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1823 case Intrinsic::aarch64_sve_prfd_gather_index:
1824 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1825 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1826 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1827 case Intrinsic::aarch64_sve_prfh_gather_index:
1828 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1829 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1830 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1831 case Intrinsic::aarch64_sve_prfw_gather_index:
1832 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1833 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1834 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1836
1837 case Intrinsic::aarch64_sve_st1_scatter:
1838 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1839 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1840 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1841 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1842 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1843 case Intrinsic::aarch64_sve_st1dq:
1844 case Intrinsic::aarch64_sve_st1q_scatter_index:
1845 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1846 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1847 case Intrinsic::aarch64_sve_st1wq:
1848 case Intrinsic::aarch64_sve_stnt1:
1849 case Intrinsic::aarch64_sve_stnt1_scatter:
1850 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1851 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1852 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1854 case Intrinsic::aarch64_sve_st2:
1855 case Intrinsic::aarch64_sve_st2q:
1857 case Intrinsic::aarch64_sve_st3:
1858 case Intrinsic::aarch64_sve_st3q:
1860 case Intrinsic::aarch64_sve_st4:
1861 case Intrinsic::aarch64_sve_st4q:
1863 }
1864
1865 return SVEIntrinsicInfo();
1866}
1867
// Returns true when `Pred`, after the cast look-through below, is a Constant
// for which isAllOnesValue() holds, i.e. a predicate with every lane active.
// Any non-constant predicate yields false.
// NOTE(review): listing lines 1872 and 1877, which open the two match(...)
// conditions, are missing from this extract; the exact casts being matched
// cannot be confirmed from the visible text.
1868static bool isAllActivePredicate(Value *Pred) {
1869 Value *UncastedPred;
1870
1871 // Look through predicate casts that only remove lanes.
1873 m_Value(UncastedPred)))) {
1874 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1875 Pred = UncastedPred;
1876
1878 m_Value(UncastedPred))))
1879 // If the predicate has the same or less lanes than the uncasted predicate
1880 // then we know the casting has no effect.
1881 if (OrigPredTy->getMinNumElements() <=
1882 cast<ScalableVectorType>(UncastedPred->getType())
1883 ->getMinNumElements())
1884 Pred = UncastedPred;
1885 }
1886
// Only a constant predicate can be proven all-active here.
1887 auto *C = dyn_cast<Constant>(Pred);
1888 return C && C->isAllOnesValue();
1889}
1890
1891// Simplify `V` by only considering the operations that affect active lanes.
1892// This function should only return existing Values or newly created Constants.
//
// The single case handled is `V` being an aarch64_sve_dup call whose operand 1
// is exactly `Pg` and whose operand 2 is a Constant. Every other `V` is
// returned unchanged.
// NOTE(review): listing line 1897 is missing from this extract, so the callee
// that receives the element count and the constant below is not visible.
1893static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1894 auto *Dup = dyn_cast<IntrinsicInst>(V);
1895 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1896 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1898 cast<VectorType>(V->getType())->getElementCount(),
1899 cast<Constant>(Dup->getOperand(2)));
1900
1901 return V;
1902}
1903
// Tries to simplify a predicated SVE intrinsic through the generic IR
// simplifier. Operand 0 is read as the predicate; operands 1 and 2 (after
// stripInactiveLanes) are passed to simplifyBinOp with the opcode reported by
// `IInfo`. Returns std::nullopt when nothing simplifies or when the simplified
// value is an UndefValue. Otherwise the intrinsic's uses are replaced, going
// through a select on the predicate when inactive lanes must be preserved.
// NOTE(review): listing lines 1905 (function name and leading parameters) and
// 1916 (first part of the canonicalisation condition) are missing from this
// extract.
1904static std::optional<Instruction *>
1906 const SVEIntrinsicInfo &IInfo) {
1907 const unsigned Opc = IInfo.getMatchingIROpode();
1908 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1909
1910 Value *Pg = II.getOperand(0);
1911 Value *Op1 = II.getOperand(1);
1912 Value *Op2 = II.getOperand(2);
1913 const DataLayout &DL = II.getDataLayout();
1914
1915 // Canonicalise constants to the RHS.
1917 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1918 IC.replaceOperand(II, 1, Op2);
1919 IC.replaceOperand(II, 2, Op1);
1920 return &II;
1921 }
1922
1923 // Only active lanes matter when simplifying the operation.
1924 Op1 = stripInactiveLanes(Op1, Pg);
1925 Op2 = stripInactiveLanes(Op2, Pg);
1926
// Floating-point operations forward the intrinsic's fast-math flags to the
// simplifier.
1927 Value *SimpleII;
1928 if (auto FII = dyn_cast<FPMathOperator>(&II))
1929 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1930 else
1931 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1932
1933 // An SVE intrinsic's result is always defined. However, this is not the case
1934 // for its equivalent IR instruction (e.g. when shifting by an amount more
1935 // than the data's bitwidth). Simplifications to an undefined result must be
1936 // ignored to preserve the intrinsic's expected behaviour.
1937 if (!SimpleII || isa<UndefValue>(SimpleII))
1938 return std::nullopt;
1939
// When inactive lanes are not defined, the simplified value can be used for
// all lanes.
1940 if (IInfo.inactiveLanesAreNotDefined())
1941 return IC.replaceInstUsesWith(II, SimpleII);
1942
1943 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1944
1945 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1946 if (SimpleII == Inactive)
1947 return IC.replaceInstUsesWith(II, SimpleII);
1948
1949 // Inactive lanes must be preserved.
1950 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1951 return IC.replaceInstUsesWith(II, SimpleII);
1952}
1953
1954// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1955// to operations with less strict inactive lane requirements.
//
// Returns std::nullopt when `IInfo` reports no governing predicate or when none
// of the cases below applies.
// NOTE(review): listing lines 1957 (function name and leading parameters),
// 1966, 1972 and 1996 are missing from this extract. The conditions guarding
// the first return, the statement under resultIsZeroInitialized(), and the
// second half of the final condition are therefore not visible.
1956static std::optional<Instruction *>
1958 const SVEIntrinsicInfo &IInfo) {
1959 if (!IInfo.hasGoverningPredicate())
1960 return std::nullopt;
1961
1962 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1963
1964 // If there are no active lanes.
1965 if (match(OpPredicate, m_ZeroInt())) {
1967 return IC.replaceInstUsesWith(
1968 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1969
1970 if (IInfo.inactiveLanesAreUnused()) {
1971 if (IInfo.resultIsZeroInitialized())
1973
1974 return IC.eraseInstFromFunction(II);
1975 }
1976 }
1977
1978 // If there are no inactive lanes.
1979 if (isAllActivePredicate(OpPredicate)) {
// With every lane active, the operand flagged as having no active lanes is
// replaced by undef unless it already is one.
1980 if (IInfo.hasOperandWithNoActiveLanes()) {
1981 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1982 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1983 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1984 }
1985
// Retarget the call in place to the intrinsic IInfo names as the matching
// "undef" variant.
1986 if (IInfo.hasMatchingUndefIntrinsic()) {
1987 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1988 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1989 II.setCalledFunction(NewDecl);
1990 return &II;
1991 }
1992 }
1993
1994 // Operation specific simplifications.
1995 if (IInfo.hasMatchingIROpode() &&
1997 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1998
1999 return std::nullopt;
2000}
2001
2002// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
2003// => (binop (pred) (from_svbool _) (from_svbool _))
2004//
2005// The above transformation eliminates a `to_svbool` in the predicate
2006// operand of bitwise operation `binop` by narrowing the vector width of
2007// the operation. For example, it would convert a `<vscale x 16 x i1>
2008// and` into a `<vscale x 4 x i1> and`. This is profitable because
2009// to_svbool must zero the new lanes during widening, whereas
2010// from_svbool is free.
//
// Returns std::nullopt unless operand 0 of `II` is one of the listed
// zeroing predicate logical intrinsics, its predicate is a convert_to_svbool,
// and the type being converted from equals the type of `II`.
// NOTE(review): listing line 2012 (function name and parameter list) is
// missing from this extract.
2011static std::optional<Instruction *>
2013 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
2014 if (!BinOp)
2015 return std::nullopt;
2016
2017 auto IntrinsicID = BinOp->getIntrinsicID();
2018 switch (IntrinsicID) {
2019 case Intrinsic::aarch64_sve_and_z:
2020 case Intrinsic::aarch64_sve_bic_z:
2021 case Intrinsic::aarch64_sve_eor_z:
2022 case Intrinsic::aarch64_sve_nand_z:
2023 case Intrinsic::aarch64_sve_nor_z:
2024 case Intrinsic::aarch64_sve_orn_z:
2025 case Intrinsic::aarch64_sve_orr_z:
2026 break;
2027 default:
2028 return std::nullopt;
2029 }
2030
2031 auto BinOpPred = BinOp->getOperand(0);
2032 auto BinOpOp1 = BinOp->getOperand(1);
2033 auto BinOpOp2 = BinOp->getOperand(2);
2034
2035 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
2036 if (!PredIntr ||
2037 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
2038 return std::nullopt;
2039
// The narrowed operation must produce exactly the type `II` produces.
2040 auto PredOp = PredIntr->getOperand(0);
2041 auto PredOpTy = cast<VectorType>(PredOp->getType());
2042 if (PredOpTy != II.getType())
2043 return std::nullopt;
2044
2045 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
2046 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
2047 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
2048 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
// Reuse the first conversion when both data operands are the same value.
2049 if (BinOpOp1 == BinOpOp2)
2050 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
2051 else
2052 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
2053 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
2054
2055 auto NarrowedBinOp =
2056 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
2057 return IC.replaceInstUsesWith(II, NarrowedBinOp);
2058}
2059
// Simplifies a predicate conversion by, in order: delegating to processPhiNode
// when operand 0 is a PHI, trying tryCombineFromSVBoolBinOp, and finally
// walking back through a chain of convert_to_svbool/convert_from_svbool calls
// to find the earliest value that already has the result type.
// NOTE(review): listing line 2061 (function name and parameter list) is
// missing from this extract.
2060static std::optional<Instruction *>
2062 // If the reinterpret instruction operand is a PHI Node
2063 if (isa<PHINode>(II.getArgOperand(0)))
2064 return processPhiNode(IC, II);
2065
2066 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
2067 return BinOpCombine;
2068
2069 // Ignore converts to/from svcount_t.
2070 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
2071 isa<TargetExtType>(II.getType()))
2072 return std::nullopt;
2073
// NOTE(review): CandidatesForRemoval is filled in the loop below but is not
// read anywhere in the visible lines of this function.
2074 SmallVector<Instruction *, 32> CandidatesForRemoval;
2075 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
2076
2077 const auto *IVTy = cast<VectorType>(II.getType());
2078
2079 // Walk the chain of conversions.
2080 while (Cursor) {
2081 // If the type of the cursor has fewer lanes than the final result, zeroing
2082 // must take place, which breaks the equivalence chain.
2083 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
2084 if (CursorVTy->getElementCount().getKnownMinValue() <
2085 IVTy->getElementCount().getKnownMinValue())
2086 break;
2087
2088 // If the cursor has the same type as I, it is a viable replacement.
2089 if (Cursor->getType() == IVTy)
2090 EarliestReplacement = Cursor;
2091
2092 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
2093
2094 // If this is not an SVE conversion intrinsic, this is the end of the chain.
2095 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
2096 Intrinsic::aarch64_sve_convert_to_svbool ||
2097 IntrinsicCursor->getIntrinsicID() ==
2098 Intrinsic::aarch64_sve_convert_from_svbool))
2099 break;
2100
2101 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
2102 Cursor = IntrinsicCursor->getOperand(0);
2103 }
2104
2105 // If no viable replacement in the conversion chain was found, there is
2106 // nothing to do.
2107 if (!EarliestReplacement)
2108 return std::nullopt;
2109
2110 return IC.replaceInstUsesWith(II, EarliestReplacement);
2111}
2112
2113static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
2114 IntrinsicInst &II) {
2115 // svsel(ptrue, x, y) => x
2116 auto *OpPredicate = II.getOperand(0);
2117 if (isAllActivePredicate(OpPredicate))
2118 return IC.replaceInstUsesWith(II, II.getOperand(1));
2119
2120 auto Select =
2121 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
2122 return IC.replaceInstUsesWith(II, Select);
2123}
2124
// Rewrites an SVE predicated dup. Operand 0 is the vector used as the insert
// base, operand 1 the predicate and operand 2 the scalar.
// NOTE(review): listing line 2137, which opens the second condition, is
// missing from this extract; only its trailing vl1 pattern check is visible.
2125static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
2126 IntrinsicInst &II) {
2127 Value *Pg = II.getOperand(1);
2128
2129 // sve.dup(V, all_active, X) ==> splat(X)
2130 if (isAllActivePredicate(Pg)) {
2131 auto *RetTy = cast<ScalableVectorType>(II.getType());
2132 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2133 II.getArgOperand(2));
2134 return IC.replaceInstUsesWith(II, Splat);
2135 }
2136
2138 m_SpecificInt(AArch64SVEPredPattern::vl1))))
2139 return std::nullopt;
2140
2141 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
2142 Value *Insert = IC.Builder.CreateInsertElement(
2143 II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
2144 return IC.replaceInstUsesWith(II, Insert);
2145}
2146
2147static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
2148 IntrinsicInst &II) {
2149 // Replace DupX with a regular IR splat.
2150 auto *RetTy = cast<ScalableVectorType>(II.getType());
2151 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2152 II.getArgOperand(0));
2153 Splat->takeName(&II);
2154 return IC.replaceInstUsesWith(II, Splat);
2155}
2156
// Folds an all-active cmpne against zero of a lane-0 dupq of a constant fixed
// vector (inserted into undef at index 0) into a constant predicate. The
// result is either an all-false predicate or an all-true predicate of a
// suitable element size converted to the result type.
// NOTE(review): listing line 2166, the initialiser of `SplatValue`, is missing
// from this extract.
2157static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
2158 IntrinsicInst &II) {
2159 LLVMContext &Ctx = II.getContext();
2160
2161 if (!isAllActivePredicate(II.getArgOperand(0)))
2162 return std::nullopt;
2163
2164 // Check that we have a compare of zero..
2165 auto *SplatValue =
2167 if (!SplatValue || !SplatValue->isZero())
2168 return std::nullopt;
2169
2170 // ..against a dupq
2171 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
2172 if (!DupQLane ||
2173 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
2174 return std::nullopt;
2175
2176 // Where the dupq is a lane 0 replicate of a vector insert
2177 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
2178 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
2179 return std::nullopt;
2180
2181 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
2182 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
2183 return std::nullopt;
2184
2185 // Where the vector insert is a fixed constant vector insert into undef at
2186 // index zero
2187 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
2188 return std::nullopt;
2189
2190 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
2191 return std::nullopt;
2192
2193 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
2194 if (!ConstVec)
2195 return std::nullopt;
2196
// The fixed vector must have exactly as many elements as the minimum element
// count of the scalable result.
2197 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
2198 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
2199 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
2200 return std::nullopt;
2201
2202 unsigned NumElts = VecTy->getNumElements();
2203 unsigned PredicateBits = 0;
2204
2205 // Expand intrinsic operands to a 16-bit byte level predicate
2206 for (unsigned I = 0; I < NumElts; ++I) {
2207 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
2208 if (!Arg)
2209 return std::nullopt;
2210 if (!Arg->isZero())
2211 PredicateBits |= 1 << (I * (16 / NumElts));
2212 }
2213
2214 // If all bits are zero bail early with an empty predicate
2215 if (PredicateBits == 0) {
2216 auto *PFalse = Constant::getNullValue(II.getType());
2217 PFalse->takeName(&II);
2218 return IC.replaceInstUsesWith(II, PFalse);
2219 }
2220
2221 // Calculate largest predicate type used (where byte predicate is largest)
// Mask accumulates (bit position mod 8) for every set bit, seeded with 8 so
// the lowest set bit extracted below is at most 8.
2222 unsigned Mask = 8;
2223 for (unsigned I = 0; I < 16; ++I)
2224 if ((PredicateBits & (1 << I)) != 0)
2225 Mask |= (I % 8);
2226
// Mask & -Mask isolates the lowest set bit of Mask.
2227 unsigned PredSize = Mask & -Mask;
2228 auto *PredType = ScalableVectorType::get(
2229 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
2230
2231 // Ensure all relevant bits are set
2232 for (unsigned I = 0; I < 16; I += PredSize)
2233 if ((PredicateBits & (1 << I)) == 0)
2234 return std::nullopt;
2235
// Build an all-true predicate of PredType and route it through svbool to
// obtain a value of the intrinsic's result type.
2236 auto *ConvertToSVBool =
2237 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
2238 PredType, ConstantInt::getTrue(PredType));
2239 auto *ConvertFromSVBool =
2240 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
2241 II.getType(), ConvertToSVBool);
2242
2243 ConvertFromSVBool->takeName(&II);
2244 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2245}
2246
// Simplifies SVE lasta/lastb. Operand 0 is the predicate and operand 1 the
// vector. `IsAfter` is true for lasta, in which case the selected index is one
// past the last active element of the predicate pattern.
// NOTE(review): listing line 2269, which opens the statement creating
// `NewBinOp`, is missing from this extract.
2247static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2248 IntrinsicInst &II) {
2249 Value *Pg = II.getArgOperand(0);
2250 Value *Vec = II.getArgOperand(1);
2251 auto IntrinsicID = II.getIntrinsicID();
2252 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2253
2254 // lastX(splat(X)) --> X
2255 if (auto *SplatVal = getSplatValue(Vec))
2256 return IC.replaceInstUsesWith(II, SplatVal);
2257
2258 // If x and/or y is a splat value then:
2259 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2260 Value *LHS, *RHS;
2261 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2262 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2263 auto *OldBinOp = cast<BinaryOperator>(Vec);
2264 auto OpC = OldBinOp->getOpcode();
2265 auto *NewLHS =
2266 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2267 auto *NewRHS =
2268 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2270 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2271 return IC.replaceInstUsesWith(II, NewBinOp);
2272 }
2273 }
2274
2275 auto *C = dyn_cast<Constant>(Pg);
2276 if (IsAfter && C && C->isNullValue()) {
2277 // The intrinsic is extracting lane 0 so use an extract instead.
2278 auto *IdxTy = Type::getInt64Ty(II.getContext());
2279 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2280 Extract->insertBefore(II.getIterator());
2281 Extract->takeName(&II);
2282 return IC.replaceInstUsesWith(II, Extract);
2283 }
2284
// From here on only predicates produced directly by ptrue are handled.
2285 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2286 if (!IntrPG)
2287 return std::nullopt;
2288
2289 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2290 return std::nullopt;
2291
2292 const auto PTruePattern =
2293 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2294
2295 // Can the intrinsic's predicate be converted to a known constant index?
2296 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2297 if (!MinNumElts)
2298 return std::nullopt;
2299
2300 unsigned Idx = MinNumElts - 1;
2301 // Increment the index if extracting the element after the last active
2302 // predicate element.
2303 if (IsAfter)
2304 ++Idx;
2305
2306 // Ignore extracts whose index is larger than the known minimum vector
2307 // length. NOTE: This is an artificial constraint where we prefer to
2308 // maintain what the user asked for until an alternative is proven faster.
2309 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2310 if (Idx >= PgVTy->getMinNumElements())
2311 return std::nullopt;
2312
2313 // The intrinsic is extracting a fixed lane so use an extract instead.
2314 auto *IdxTy = Type::getInt64Ty(II.getContext());
2315 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2316 Extract->insertBefore(II.getIterator());
2317 Extract->takeName(&II);
2318 return IC.replaceInstUsesWith(II, Extract);
2319}
2320
2321static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2322 IntrinsicInst &II) {
2323 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2324 // integer variant across a variety of micro-architectures. Replace scalar
2325 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2326 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2327 // depending on the micro-architecture, but has been observed as generally
2328 // being faster, particularly when the CLAST[AB] op is a loop-carried
2329 // dependency.
2330 Value *Pg = II.getArgOperand(0);
2331 Value *Fallback = II.getArgOperand(1);
2332 Value *Vec = II.getArgOperand(2);
2333 Type *Ty = II.getType();
2334
2335 if (!Ty->isIntegerTy())
2336 return std::nullopt;
2337
2338 Type *FPTy;
2339 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2340 default:
2341 return std::nullopt;
2342 case 16:
2343 FPTy = IC.Builder.getHalfTy();
2344 break;
2345 case 32:
2346 FPTy = IC.Builder.getFloatTy();
2347 break;
2348 case 64:
2349 FPTy = IC.Builder.getDoubleTy();
2350 break;
2351 }
2352
2353 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2354 auto *FPVTy = VectorType::get(
2355 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2356 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2357 auto *FPII = IC.Builder.CreateIntrinsic(
2358 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2359 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2360 return IC.replaceInstUsesWith(II, FPIItoInt);
2361}
2362
2363static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2364 IntrinsicInst &II) {
2365 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2366 // can work with RDFFR_PP for ptest elimination.
2367 auto *RDFFR = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z,
2368 ConstantInt::getTrue(II.getType()));
2369 RDFFR->takeName(&II);
2370 return IC.replaceInstUsesWith(II, RDFFR);
2371}
2372
// Folds an element-count intrinsic whose operand 0 is a constant predicate
// pattern. For the `all` pattern the result is built from a scalable element
// count of `NumElts`. For other patterns a constant is produced only when the
// pattern's element count is non-zero and does not exceed `NumElts`.
// NOTE(review): listing lines 2374 (function name and parameter list,
// including the declaration of `NumElts`) and 2378 (the start of the statement
// defining `Cnt`) are missing from this extract.
2373static std::optional<Instruction *>
2375 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2376
2377 if (Pattern == AArch64SVEPredPattern::all) {
2379 II.getType(), ElementCount::getScalable(NumElts));
2380 Cnt->takeName(&II);
2381 return IC.replaceInstUsesWith(II, Cnt);
2382 }
2383
2384 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2385
2386 return MinNumElts && NumElts >= MinNumElts
2387 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2388 II, ConstantInt::get(II.getType(), MinNumElts)))
2389 : std::nullopt;
2390}
2391
// Replaces the intrinsic with the value `Cnt` when the subtarget reports
// streaming mode; otherwise leaves it alone (std::nullopt).
// NOTE(review): listing lines 2393 (function name and leading parameters) and
// 2401 (the initialiser of `Cnt`) are missing from this extract.
2392static std::optional<Instruction *>
2394 const AArch64Subtarget *ST) {
2395 if (!ST->isStreaming())
2396 return std::nullopt;
2397
2398 // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
2399 // with SVEPredPattern::all
2400 Value *Cnt =
2402 Cnt->takeName(&II);
2403 return IC.replaceInstUsesWith(II, Cnt);
2404}
2405
// Canonicalises SVE ptest intrinsics. Operand 0 is the governing predicate
// value and operand 1 the value being tested. Three rewrites are attempted in
// order; std::nullopt is returned when none applies.
// NOTE(review): listing lines 2426-2427, which declare `Pg` and `Op`, are
// missing from this extract. They are used below as IntrinsicInst pointers that
// may be null.
2406static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2407 IntrinsicInst &II) {
2408 Value *PgVal = II.getArgOperand(0);
2409 Value *OpVal = II.getArgOperand(1);
2410
2411 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2412 // Later optimizations prefer this form.
2413 if (PgVal == OpVal &&
2414 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2415 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2416 Value *Ops[] = {PgVal, OpVal};
2417 Type *Tys[] = {PgVal->getType()};
2418
2419 auto *PTest =
2420 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2421 PTest->takeName(&II);
2422
2423 return IC.replaceInstUsesWith(II, PTest);
2424 }
2425
2428
2429 if (!Pg || !Op)
2430 return std::nullopt;
2431
2432 Intrinsic::ID OpIID = Op->getIntrinsicID();
2433
// When both operands are convert_to_svbool of the same source type, test the
// unconverted values directly with the same ptest intrinsic.
2434 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2435 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2436 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2437 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2438 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2439
2440 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2441
2442 PTest->takeName(&II);
2443 return IC.replaceInstUsesWith(II, PTest);
2444 }
2445
2446 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
2447 // Later optimizations may rewrite sequence to use the flag-setting variant
2448 // of instruction X to remove PTEST.
2449 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2450 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2451 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2452 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2453 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2454 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2455 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2456 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2457 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2458 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2459 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2460 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2461 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2462 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2463 Type *Tys[] = {Pg->getType()};
2464
2465 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2466 PTest->takeName(&II);
2467
2468 return IC.replaceInstUsesWith(II, PTest);
2469 }
2470
2471 return std::nullopt;
2472}
2473
// Fuses a multiply feeding a predicated add/sub-style intrinsic into a single
// `FuseOpc` call. Operand 0 of `II` is the predicate. When `MergeIntoAddendOp`
// is true the addend is operand 1 and the multiply operand 2; otherwise they
// are swapped, and the fused call's operand order changes to match.
// Returns std::nullopt when the multiply has more than one use, or for
// floating-point types when fast-math flags differ or contraction is not
// allowed.
// NOTE(review): listing lines 2476 (function name and leading parameters) and
// 2488 (the start of the match on `Mul`) are missing from this extract.
2474template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2475static std::optional<Instruction *>
2477 bool MergeIntoAddendOp) {
2478 Value *P = II.getOperand(0);
2479 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2480 if (MergeIntoAddendOp) {
2481 AddendOp = II.getOperand(1);
2482 Mul = II.getOperand(2);
2483 } else {
2484 AddendOp = II.getOperand(2);
2485 Mul = II.getOperand(1);
2486 }
2487
2489 m_Value(MulOp1))))
2490 return std::nullopt;
2491
2492 if (!Mul->hasOneUse())
2493 return std::nullopt;
2494
2495 Instruction *FMFSource = nullptr;
2496 if (II.getType()->isFPOrFPVectorTy()) {
2497 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2498 // Stop the combine when the flags on the inputs differ in case dropping
2499 // flags would lead to us missing out on more beneficial optimizations.
2500 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2501 return std::nullopt;
2502 if (!FAddFlags.allowContract())
2503 return std::nullopt;
2504 FMFSource = &II;
2505 }
2506
2507 Value *Res;
2508 if (MergeIntoAddendOp)
2509 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2510 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2511 else
2512 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2513 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2514
2515 return IC.replaceInstUsesWith(II, Res);
2516}
2517
// Replaces a predicated load intrinsic (operand 0 = predicate, operand 1 =
// pointer) with generic IR. An all-active predicate becomes a plain load;
// otherwise a masked load with a zero passthrough is created. Metadata is
// copied from the intrinsic in both cases.
// NOTE(review): listing line 2519 (function name and parameter list) is
// missing from this extract; `DL` is presumably one of those parameters.
2518static std::optional<Instruction *>
2520 Value *Pred = II.getOperand(0);
2521 Value *PtrOp = II.getOperand(1);
2522 Type *VecTy = II.getType();
2523
2524 if (isAllActivePredicate(Pred)) {
2525 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2526 Load->copyMetadata(II);
2527 return IC.replaceInstUsesWith(II, Load);
2528 }
2529
2530 CallInst *MaskedLoad =
2531 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2532 Pred, ConstantAggregateZero::get(VecTy));
2533 MaskedLoad->copyMetadata(II);
2534 return IC.replaceInstUsesWith(II, MaskedLoad);
2535}
2536
// Replaces a predicated store intrinsic (operand 0 = stored vector, operand 1
// = predicate, operand 2 = pointer) with generic IR. An all-active predicate
// becomes a plain store; otherwise a masked store is created. Metadata is
// copied to the new instruction and the intrinsic is erased.
// NOTE(review): listing line 2538 (function name and parameter list) is
// missing from this extract; `DL` is presumably one of those parameters.
2537static std::optional<Instruction *>
2539 Value *VecOp = II.getOperand(0);
2540 Value *Pred = II.getOperand(1);
2541 Value *PtrOp = II.getOperand(2);
2542
2543 if (isAllActivePredicate(Pred)) {
2544 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2545 Store->copyMetadata(II);
2546 return IC.eraseInstFromFunction(II);
2547 }
2548
2549 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2550 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2551 MaskedStore->copyMetadata(II);
2552 return IC.eraseInstFromFunction(II);
2553}
2554
// Maps the fmul_u / fadd_u / fsub_u SVE intrinsics to the FMul / FAdd / FSub
// binary opcodes. Any other value maps to Instruction::BinaryOpsEnd, which
// serves as the "no mapping" result.
// NOTE(review): listing line 2555 (the function signature, declaring
// `Intrinsic`) is missing from this extract.
2556 switch (Intrinsic) {
2557 case Intrinsic::aarch64_sve_fmul_u:
2558 return Instruction::BinaryOps::FMul;
2559 case Intrinsic::aarch64_sve_fadd_u:
2560 return Instruction::BinaryOps::FAdd;
2561 case Intrinsic::aarch64_sve_fsub_u:
2562 return Instruction::BinaryOps::FSub;
2563 default:
2564 return Instruction::BinaryOpsEnd;
2565 }
2566}
2567
// Replaces an all-active SVE binary intrinsic with the equivalent IR binary
// operator on operands 1 and 2, carrying over the intrinsic's fast-math flags.
// Returns std::nullopt when the call is strict-FP, when intrinsicIDToBinOpCode
// has no mapping for the intrinsic, or when the predicate is not all-active.
// NOTE(review): listing line 2569 (function name and parameter list) is
// missing from this extract.
2568static std::optional<Instruction *>
2570 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2571 if (II.isStrictFP())
2572 return std::nullopt;
2573
2574 auto *OpPredicate = II.getOperand(0);
2575 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2576 if (BinOpCode == Instruction::BinaryOpsEnd ||
2577 !isAllActivePredicate(OpPredicate))
2578 return std::nullopt;
2579 auto BinOp = IC.Builder.CreateBinOpFMF(
2580 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2581 return IC.replaceInstUsesWith(II, BinOp);
2582}
2583
// Simplifies mla_u when one multiplicand is the constant 1 or all-ones (-1):
// the result becomes an unpredicated add or sub of the accumulator (argument 1)
// and the other multiplicand. Arguments 2 and 3 are the multiplicands.
// NOTE(review): listing line 2585 (function name and parameter list) is
// missing from this extract.
2584static std::optional<Instruction *>
2586 assert(II.getIntrinsicID() == Intrinsic::aarch64_sve_mla_u &&
2587 "Expected MLA_U intrinsic");
2588 Value *Acc = II.getArgOperand(1);
2589 Value *MulOp0 = II.getArgOperand(2);
2590 Value *MulOp1 = II.getArgOperand(3);
2591
2592 // For mla_u, inactive lanes are undefined, so it is valid to drop the
2593 // predicate when replacing mla_u(acc, x, 1) with add(acc, x) or
2594 // mla_u(acc, x, -1) with sub(acc, x).
2595 if (match(MulOp0, m_One()))
2596 return IC.replaceInstUsesWith(II, IC.Builder.CreateAdd(Acc, MulOp1));
2597 if (match(MulOp1, m_One()))
2598 return IC.replaceInstUsesWith(II, IC.Builder.CreateAdd(Acc, MulOp0));
2599 if (match(MulOp0, m_AllOnes()))
2600 return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Acc, MulOp1));
2601 if (match(MulOp1, m_AllOnes()))
2602 return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Acc, MulOp0));
2603
2604 return std::nullopt;
2605}
2606
2607static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2608 IntrinsicInst &II) {
2609 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2610 Intrinsic::aarch64_sve_mla>(
2611 IC, II, true))
2612 return MLA;
2613 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2614 Intrinsic::aarch64_sve_mad>(
2615 IC, II, false))
2616 return MAD;
2617 return std::nullopt;
2618}
2619
// Tries, in order, to fuse a multiply into this floating-point add:
// fmul -> fmla (multiply in operand 2), fmul -> fmad (multiply in operand 1),
// then fmul_u -> fmla (multiply in operand 2). Returns the first success, or
// std::nullopt.
// NOTE(review): listing line 2621 (function name and parameter list) is
// missing from this extract.
2620static std::optional<Instruction *>
2622 if (auto FMLA =
2623 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2624 Intrinsic::aarch64_sve_fmla>(IC, II,
2625 true))
2626 return FMLA;
2627 if (auto FMAD =
2628 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2629 Intrinsic::aarch64_sve_fmad>(IC, II,
2630 false))
2631 return FMAD;
2632 if (auto FMLA =
2633 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2634 Intrinsic::aarch64_sve_fmla>(IC, II,
2635 true))
2636 return FMLA;
2637 return std::nullopt;
2638}
2639
// Tries, in order, to fuse a multiply into this floating-point add:
// fmul -> fmla, fmul -> fmad, then fmul_u -> fmla_u. If none applies, falls
// back to instCombineSVEVectorBinOp.
// NOTE(review): listing line 2641 (function name and parameter list) is
// missing from this extract.
2640static std::optional<Instruction *>
2642 if (auto FMLA =
2643 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2644 Intrinsic::aarch64_sve_fmla>(IC, II,
2645 true))
2646 return FMLA;
2647 if (auto FMAD =
2648 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2649 Intrinsic::aarch64_sve_fmad>(IC, II,
2650 false))
2651 return FMAD;
2652 if (auto FMLA_U =
2653 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2654 Intrinsic::aarch64_sve_fmla_u>(
2655 IC, II, true))
2656 return FMLA_U;
2657 return instCombineSVEVectorBinOp(IC, II);
2658}
2659
// Tries, in order, to fuse a multiply into this floating-point subtract:
// fmul -> fmls (multiply in operand 2), fmul -> fnmsb (multiply in operand 1),
// then fmul_u -> fmls (multiply in operand 2). Returns the first success, or
// std::nullopt.
// NOTE(review): listing line 2661 (function name and parameter list) is
// missing from this extract.
2660static std::optional<Instruction *>
2662 if (auto FMLS =
2663 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2664 Intrinsic::aarch64_sve_fmls>(IC, II,
2665 true))
2666 return FMLS;
2667 if (auto FMSB =
2668 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2669 Intrinsic::aarch64_sve_fnmsb>(
2670 IC, II, false))
2671 return FMSB;
2672 if (auto FMLS =
2673 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2674 Intrinsic::aarch64_sve_fmls>(IC, II,
2675 true))
2676 return FMLS;
2677 return std::nullopt;
2678}
2679
// Tries, in order, to fuse a multiply into this floating-point subtract:
// fmul -> fmls, fmul -> fnmsb, then fmul_u -> fmls_u. If none applies, falls
// back to instCombineSVEVectorBinOp.
// NOTE(review): listing line 2681 (function name and parameter list) is
// missing from this extract.
2680static std::optional<Instruction *>
2682 if (auto FMLS =
2683 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2684 Intrinsic::aarch64_sve_fmls>(IC, II,
2685 true))
2686 return FMLS;
2687 if (auto FMSB =
2688 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2689 Intrinsic::aarch64_sve_fnmsb>(
2690 IC, II, false))
2691 return FMSB;
2692 if (auto FMLS_U =
2693 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2694 Intrinsic::aarch64_sve_fmls_u>(
2695 IC, II, true))
2696 return FMLS_U;
2697 return instCombineSVEVectorBinOp(IC, II);
2698}
2699
2700static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2701 IntrinsicInst &II) {
2702 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2703 Intrinsic::aarch64_sve_mls>(
2704 IC, II, true))
2705 return MLS;
2706 return std::nullopt;
2707}
2708
2709static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2710 IntrinsicInst &II) {
2711 Value *UnpackArg = II.getArgOperand(0);
2712 auto *RetTy = cast<ScalableVectorType>(II.getType());
2713 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2714 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2715
2716 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2717 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2718 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2719 ScalarArg =
2720 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2721 Value *NewVal =
2722 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2723 NewVal->takeName(&II);
2724 return IC.replaceInstUsesWith(II, NewVal);
2725 }
2726
2727 return std::nullopt;
2728}
2729static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2730 IntrinsicInst &II) {
2731 auto *OpVal = II.getOperand(0);
2732 auto *OpIndices = II.getOperand(1);
2733 VectorType *VTy = cast<VectorType>(II.getType());
2734
2735 // Check whether OpIndices is a constant splat value < minimal element count
2736 // of result.
2737 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2738 if (!SplatValue ||
2739 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2740 return std::nullopt;
2741
2742 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2743 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2744 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2745 auto *VectorSplat =
2746 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2747
2748 VectorSplat->takeName(&II);
2749 return IC.replaceInstUsesWith(II, VectorSplat);
2750}
2751
// Combine for aarch64_sve_uzp1 on predicates that were produced by
// convert_to_svbool (optionally round-tripped through convert_from_svbool).
// When both inputs A and B have the same scalable vector type (plus one more
// condition whose line is absent from this listing), the uzp1 is replaced by
// a concatenation built from two vector inserts into a poison vector of the
// result type. Returns std::nullopt when the pattern does not match.
//
// NOTE: parts of the match expression and of the inner condition are missing
// from this listing (the lines that were numbered 2762, 2764 and 2769).
2752static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2753 IntrinsicInst &II) {
2754 Value *A, *B;
2755 Type *RetTy = II.getType();
2756 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2757 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2758
2759 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2760 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2761 if ((match(II.getArgOperand(0),
2763 match(II.getArgOperand(1),
2765 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2766 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2767 auto *TyA = cast<ScalableVectorType>(A->getType());
2768 if (TyA == B->getType() &&
// A goes in at element 0, B directly after it at A's minimum element count.
2770 auto *SubVec = IC.Builder.CreateInsertVector(
2771 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2772 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2773 TyA->getMinNumElements());
2774 ConcatVec->takeName(&II);
2775 return IC.replaceInstUsesWith(II, ConcatVec);
2776 }
2777 }
2778
2779 return std::nullopt;
2780}
2781
// Combine for aarch64_sve_zip1 / aarch64_sve_zip2: a zip of the matching
// uzp1/uzp2 pair undoes the de-interleave, so the zip is replaced by the
// original operand (A for zip1, B for any other intrinsic ID routed here).
// Returns std::nullopt when the operands do not form that pair.
//
// NOTE: the middle of the match expression (the lines that were numbered
// 2788-2789) is absent from this listing.
2782static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2783 IntrinsicInst &II) {
2784 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2785 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2786 Value *A, *B;
2787 if (match(II.getArgOperand(0),
2790 m_Specific(A), m_Specific(B))))
2791 return IC.replaceInstUsesWith(
2792 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2793
2794 return std::nullopt;
2795}
2796
// Combine for a gather load with a vector index. The line with the function
// name and parameters is absent from this listing; presumably this is
// instCombineLD1GatherIndex, the handler the dispatcher calls for
// aarch64_sve_ld1_gather_index -- TODO confirm.
//
// Operands read: 0 = mask, 1 = base pointer, 2 = index vector.
// When the index vector is a unit-stride sequence starting at IndexBase, the
// gather reads contiguous memory and is rewritten as a masked load from
// gep(BasePtr, IndexBase) with a zero passthru. Returns std::nullopt
// otherwise.
2797static std::optional<Instruction *>
2799 Value *Mask = II.getOperand(0);
2800 Value *BasePtr = II.getOperand(1);
2801 Value *Index = II.getOperand(2);
2802 Type *Ty = II.getType();
2803 Value *PassThru = ConstantAggregateZero::get(Ty);
2804
2805 // Contiguous gather => masked load.
2806 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2807 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2808 Value *IndexBase;
// NOTE: the first line of the match on Index (numbered 2809) is missing here.
2810 m_One()))) {
// The alignment is taken from what is known about BasePtr itself.
2811 Align Alignment =
2812 BasePtr->getPointerAlignment(II.getDataLayout());
2813
2814 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2815 BasePtr, IndexBase);
2816 CallInst *MaskedLoad =
2817 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2818 MaskedLoad->takeName(&II);
2819 return IC.replaceInstUsesWith(II, MaskedLoad);
2820 }
2821
2822 return std::nullopt;
2823}
2824
// Combine for a scatter store with a vector index. The line with the function
// name and parameters is absent from this listing; presumably this is
// instCombineST1ScatterIndex, the handler the dispatcher calls for
// aarch64_sve_st1_scatter_index -- TODO confirm.
//
// Operands read: 0 = value to store, 1 = mask, 2 = base pointer, 3 = index
// vector. When the index vector is a unit-stride sequence starting at
// IndexBase, the scatter writes contiguous memory and is rewritten as a
// masked store to gep(BasePtr, IndexBase); the original call is then erased.
// Returns std::nullopt otherwise.
2825static std::optional<Instruction *>
2827 Value *Val = II.getOperand(0);
2828 Value *Mask = II.getOperand(1);
2829 Value *BasePtr = II.getOperand(2);
2830 Value *Index = II.getOperand(3);
2831 Type *Ty = Val->getType();
2832
2833 // Contiguous scatter => masked store.
2834 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2835 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2836 Value *IndexBase;
// NOTE: the first line of the match on Index (numbered 2837) is missing here.
2838 m_One()))) {
// The alignment is taken from what is known about BasePtr itself.
2839 Align Alignment =
2840 BasePtr->getPointerAlignment(II.getDataLayout());
2841
2842 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2843 BasePtr, IndexBase);
2844 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2845
2846 return IC.eraseInstFromFunction(II);
2847 }
2848
2849 return std::nullopt;
2850}
2851
2852static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2853 IntrinsicInst &II) {
2855 Value *Pred = II.getOperand(0);
2856 Value *Vec = II.getOperand(1);
2857 Value *DivVec = II.getOperand(2);
2858
2859 Value *SplatValue = getSplatValue(DivVec);
2860 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2861 if (!SplatConstantInt)
2862 return std::nullopt;
2863
2864 APInt Divisor = SplatConstantInt->getValue();
2865 const int64_t DivisorValue = Divisor.getSExtValue();
2866 if (DivisorValue == -1)
2867 return std::nullopt;
2868 if (DivisorValue == 1)
2869 IC.replaceInstUsesWith(II, Vec);
2870
2871 if (Divisor.isPowerOf2()) {
2872 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2873 auto ASRD = IC.Builder.CreateIntrinsic(
2874 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2875 return IC.replaceInstUsesWith(II, ASRD);
2876 }
2877 if (Divisor.isNegatedPowerOf2()) {
2878 Divisor.negate();
2879 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2880 auto ASRD = IC.Builder.CreateIntrinsic(
2881 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2882 auto NEG = IC.Builder.CreateIntrinsic(
2883 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2884 return IC.replaceInstUsesWith(II, NEG);
2885 }
2886
2887 return std::nullopt;
2888}
2889
2890bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2891 size_t VecSize = Vec.size();
2892 if (VecSize == 1)
2893 return true;
2894 if (!isPowerOf2_64(VecSize))
2895 return false;
2896 size_t HalfVecSize = VecSize / 2;
2897
2898 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2899 RHS != Vec.end(); LHS++, RHS++) {
2900 if (*LHS != nullptr && *RHS != nullptr) {
2901 if (*LHS == *RHS)
2902 continue;
2903 else
2904 return false;
2905 }
2906 if (!AllowPoison)
2907 return false;
2908 if (*LHS == nullptr && *RHS != nullptr)
2909 *LHS = *RHS;
2910 }
2911
2912 Vec.resize(HalfVecSize);
2913 SimplifyValuePattern(Vec, AllowPoison);
2914 return true;
2915}
2916
2917// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2918// to dupqlane(f64(C)) where C is A concatenated with B
//
// Steps: walk the insertelement chain feeding operand 0 and record each
// inserted scalar by index; let SimplifyValuePattern shrink that sequence;
// rebuild a shorter insertelement chain; then splat it by bitcasting to a
// vector of wider integers, shuffling with an all-zero mask, and bitcasting
// back to the original type. Returns std::nullopt when operand 0 does not
// match or the sequence cannot be shrunk.
//
// NOTE: one line of the match expression (numbered 2923) is absent from this
// listing.
2919static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2920 IntrinsicInst &II) {
2921 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2922 if (!match(II.getOperand(0),
2924 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2925 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2926 return std::nullopt;
2927 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2928
2929 // Insert the scalars into a container ordered by InsertElement index
2930 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
// NOTE(review): cast<ConstantInt> asserts on a non-constant insert index, and
// Elts is indexed by that constant without a bounds check -- confirm both are
// guaranteed by the pattern matched above.
2931 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2932 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2933 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2934 CurrentInsertElt = InsertElt->getOperand(0);
2935 }
2936
// Unset slots may only be treated as "don't care" when both the base of the
// insertelement chain and the matched Default value are poison.
2937 bool AllowPoison =
2938 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2939 if (!SimplifyValuePattern(Elts, AllowPoison))
2940 return std::nullopt;
2941
2942 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2943 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2944 for (size_t I = 0; I < Elts.size(); I++) {
2945 if (Elts[I] == nullptr)
2946 continue;
2947 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2948 IC.Builder.getInt64(I));
2949 }
// NOTE(review): InsertEltChain starts as a PoisonValue and is only reassigned
// from CreateInsertElement, so this check looks unreachable -- confirm.
2950 if (InsertEltChain == nullptr)
2951 return std::nullopt;
2952
2953 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2954 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2955 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2956 // be narrowed back to the original type.
// PatternWidth: bits covered by the reduced sequence.
// PatternElementCount: how many such patterns fit in the minimum vector size.
2957 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2958 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2959 IIScalableTy->getMinNumElements() /
2960 PatternWidth;
2961
2962 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2963 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2964 auto *WideShuffleMaskTy =
2965 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2966
2967 auto InsertSubvector = IC.Builder.CreateInsertVector(
2968 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2969 uint64_t(0));
2970 auto WideBitcast =
2971 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
// An all-zero shuffle mask selects wide element 0 for every lane.
2972 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2973 auto WideShuffle = IC.Builder.CreateShuffleVector(
2974 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2975 auto NarrowBitcast =
2976 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2977
2978 return IC.replaceInstUsesWith(II, NarrowBitcast);
2979}
2980
2981static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2982 IntrinsicInst &II) {
2983 Value *A = II.getArgOperand(0);
2984 Value *B = II.getArgOperand(1);
2985 if (A == B)
2986 return IC.replaceInstUsesWith(II, A);
2987
2988 return std::nullopt;
2989}
2990
// Combine for aarch64_sve_srshl: when the shifted vector comes from an ABS
// intrinsic and the shift amount is known non-negative, replace the SRSHL
// with aarch64_sve_lsl on the same operands. Returns std::nullopt when any of
// the conditions below fails.
//
// Operands read: 0 = governing predicate, 1 = vector to shift, 2 = shift
// amounts.
//
// NOTE: the lines that open the two match expressions (numbered 2999 and
// 3001) are absent from this listing.
2991static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2992 IntrinsicInst &II) {
2993 Value *Pred = II.getOperand(0);
2994 Value *Vec = II.getOperand(1);
2995 Value *Shift = II.getOperand(2);
2996
2997 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2998 Value *AbsPred, *MergedValue;
3000 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
3002 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
3003
3004 return std::nullopt;
3005
3006 // Transform is valid if any of the following are true:
3007 // * The ABS merge value is an undef or non-negative
3008 // * The ABS predicate is all active
3009 // * The ABS predicate and the SRSHL predicates are the same
3010 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
3011 AbsPred != Pred && !isAllActivePredicate(AbsPred))
3012 return std::nullopt;
3013
3014 // Only valid when the shift amount is non-negative, otherwise the rounding
3015 // behaviour of SRSHL cannot be ignored.
3016 if (!match(Shift, m_NonNegative()))
3017 return std::nullopt;
3018
3019 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
3020 {II.getType()}, {Pred, Vec, Shift});
3021
3022 return IC.replaceInstUsesWith(II, LSL);
3023}
3024
3025static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
3026 IntrinsicInst &II) {
3027 Value *Vec = II.getOperand(0);
3028
3029 if (getSplatValue(Vec) == II.getOperand(1))
3030 return IC.replaceInstUsesWith(II, Vec);
3031
3032 return std::nullopt;
3033}
3034
3035static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
3036 IntrinsicInst &II) {
3037 // If this barrier is post-dominated by identical one we can remove it
3038 auto *NI = II.getNextNode();
3039 unsigned LookaheadThreshold = DMBLookaheadThreshold;
3040 auto CanSkipOver = [](Instruction *I) {
3041 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
3042 };
3043 while (LookaheadThreshold-- && CanSkipOver(NI)) {
3044 auto *NIBB = NI->getParent();
3045 NI = NI->getNextNode();
3046 if (!NI) {
3047 if (auto *SuccBB = NIBB->getUniqueSuccessor())
3048 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
3049 else
3050 break;
3051 }
3052 }
3053 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
3054 if (NextII && II.isIdenticalTo(NextII))
3055 return IC.eraseInstFromFunction(II);
3056
3057 return std::nullopt;
3058}
3059
3060static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
3061 IntrinsicInst &II) {
3062 return IC.replaceInstUsesWith(
3063 II,
3064 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
3065 {II.getType(), II.getOperand(0)->getType()},
3066 {II.getOperand(0), II.getOperand(1)}));
3067}
3068
3069static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
3070 IntrinsicInst &II) {
3071 unsigned PredPattern = cast<ConstantInt>(II.getOperand(0))->getZExtValue();
3072 // SVE vector length is a power-of-two, thus pow2 is synonymous with all.
3073 if (PredPattern == AArch64SVEPredPattern::all ||
3074 PredPattern == AArch64SVEPredPattern::pow2)
3075 return IC.replaceInstUsesWith(II, ConstantInt::getTrue(II.getType()));
3076 return std::nullopt;
3077}
3078
// Combine for the UXT[BHW] intrinsics: rewrite the zero-extend as an
// aarch64_sve_and_u with a mask of the low NumBits bits of each element.
// NumBits is supplied by the dispatcher (8, 16 or 32 for uxtb/uxth/uxtw).
// The rewrite is done only when the passthru operand is undef or the
// governing predicate is all active; otherwise std::nullopt is returned.
//
// Operands read: 0 = passthru, 1 = governing predicate, 2 = value to extend.
//
// NOTE: one parameter line (numbered 3080, presumably declaring `II`) is
// absent from this listing.
3079static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
3081 unsigned NumBits) {
3082 Value *Passthru = II.getOperand(0);
3083 Value *Pg = II.getOperand(1);
3084 Value *Op = II.getOperand(2);
3085
3086 // Convert UXT[BHW] to AND.
3087 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
3088 auto *Ty = cast<VectorType>(II.getType());
3089 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
3090 auto *Mask = ConstantInt::get(Ty, MaskValue);
3091 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
3092 {Pg, Op, Mask});
3093 return IC.replaceInstUsesWith(II, And);
3094 }
3095
3096 return std::nullopt;
3097}
3098
// Combine for a streaming-mode query. The line with the function name and
// parameters is absent from this listing; presumably this is
// instCombineInStreamingMode, the handler the dispatcher calls for
// aarch64_sme_in_streaming_mode -- TODO confirm.
//
// Folds the query to a constant from the SME attributes of the enclosing
// function: true when the function has a streaming interface or body, false
// when it has neither that nor a streaming-compatible interface. For a
// streaming-compatible function that is not otherwise streaming, the call is
// left alone (std::nullopt).
3099static std::optional<Instruction *>
3101 SMEAttrs FnSMEAttrs(*II.getFunction());
3102 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
3103 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
3104 return IC.replaceInstUsesWith(
3105 II, ConstantInt::getBool(II.getType(), IsStreaming));
3106 return std::nullopt;
3107}
3108
// Target-specific InstCombine entry point for intrinsic calls. The line with
// the qualified function name and the line that defines `IInfo` (numbered
// 3110 and 3112) are absent from this listing.
//
// Order of work:
//   1. simplifySVEIntrinsic(IC, II, IInfo) gets the first chance; if it
//      returns a value, that value is returned as is.
//   2. Otherwise the intrinsic ID selects one dedicated combine below.
//   3. Intrinsics without a case return std::nullopt.
3109std::optional<Instruction *>
3111 IntrinsicInst &II) const {
3113 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
3114 return I;
3115
3116 Intrinsic::ID IID = II.getIntrinsicID();
3117 switch (IID) {
3118 default:
3119 break;
3120 case Intrinsic::aarch64_dmb:
3121 return instCombineDMB(IC, II);
3122 case Intrinsic::aarch64_neon_fmaxnm:
3123 case Intrinsic::aarch64_neon_fminnm:
3124 return instCombineMaxMinNM(IC, II);
3125 case Intrinsic::aarch64_sve_convert_from_svbool:
3126 return instCombineConvertFromSVBool(IC, II);
3127 case Intrinsic::aarch64_sve_dup:
3128 return instCombineSVEDup(IC, II);
3129 case Intrinsic::aarch64_sve_dup_x:
3130 return instCombineSVEDupX(IC, II);
3131 case Intrinsic::aarch64_sve_cmpne:
3132 case Intrinsic::aarch64_sve_cmpne_wide:
3133 return instCombineSVECmpNE(IC, II);
3134 case Intrinsic::aarch64_sve_rdffr:
3135 return instCombineRDFFR(IC, II);
3136 case Intrinsic::aarch64_sve_lasta:
3137 case Intrinsic::aarch64_sve_lastb:
3138 return instCombineSVELast(IC, II);
3139 case Intrinsic::aarch64_sve_clasta_n:
3140 case Intrinsic::aarch64_sve_clastb_n:
3141 return instCombineSVECondLast(IC, II);
// The element-count intrinsics share one handler; the third argument is
// 2/4/8/16 for cntd/cntw/cnth/cntb -- presumably the number of elements of
// that size in a 128-bit granule; confirm against instCombineSVECntElts.
3142 case Intrinsic::aarch64_sve_cntd:
3143 return instCombineSVECntElts(IC, II, 2);
3144 case Intrinsic::aarch64_sve_cntw:
3145 return instCombineSVECntElts(IC, II, 4);
3146 case Intrinsic::aarch64_sve_cnth:
3147 return instCombineSVECntElts(IC, II, 8);
3148 case Intrinsic::aarch64_sve_cntb:
3149 return instCombineSVECntElts(IC, II, 16);
3150 case Intrinsic::aarch64_sme_cntsd:
3151 return instCombineSMECntsd(IC, II, ST);
3152 case Intrinsic::aarch64_sve_ptest_any:
3153 case Intrinsic::aarch64_sve_ptest_first:
3154 case Intrinsic::aarch64_sve_ptest_last:
3155 return instCombineSVEPTest(IC, II);
3156 case Intrinsic::aarch64_sve_fadd:
3157 return instCombineSVEVectorFAdd(IC, II);
3158 case Intrinsic::aarch64_sve_fadd_u:
3159 return instCombineSVEVectorFAddU(IC, II);
3160 case Intrinsic::aarch64_sve_fmul_u:
3161 return instCombineSVEVectorBinOp(IC, II);
3162 case Intrinsic::aarch64_sve_fsub:
3163 return instCombineSVEVectorFSub(IC, II);
3164 case Intrinsic::aarch64_sve_fsub_u:
3165 return instCombineSVEVectorFSubU(IC, II);
3166 case Intrinsic::aarch64_sve_add:
3167 return instCombineSVEVectorAdd(IC, II);
// add_u and sub_u have no wrapper of their own: the mul_u fusion helper is
// instantiated directly for them.
3168 case Intrinsic::aarch64_sve_add_u:
3169 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3170 Intrinsic::aarch64_sve_mla_u>(
3171 IC, II, true);
3172 case Intrinsic::aarch64_sve_mla_u:
3173 return instCombineSVEVectorMlaU(IC, II);
3174 case Intrinsic::aarch64_sve_sub:
3175 return instCombineSVEVectorSub(IC, II);
3176 case Intrinsic::aarch64_sve_sub_u:
3177 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3178 Intrinsic::aarch64_sve_mls_u>(
3179 IC, II, true);
3180 case Intrinsic::aarch64_sve_tbl:
3181 return instCombineSVETBL(IC, II);
3182 case Intrinsic::aarch64_sve_uunpkhi:
3183 case Intrinsic::aarch64_sve_uunpklo:
3184 case Intrinsic::aarch64_sve_sunpkhi:
3185 case Intrinsic::aarch64_sve_sunpklo:
3186 return instCombineSVEUnpack(IC, II);
3187 case Intrinsic::aarch64_sve_uzp1:
3188 return instCombineSVEUzp1(IC, II);
3189 case Intrinsic::aarch64_sve_zip1:
3190 case Intrinsic::aarch64_sve_zip2:
3191 return instCombineSVEZip(IC, II);
3192 case Intrinsic::aarch64_sve_ld1_gather_index:
3193 return instCombineLD1GatherIndex(IC, II);
3194 case Intrinsic::aarch64_sve_st1_scatter_index:
3195 return instCombineST1ScatterIndex(IC, II);
3196 case Intrinsic::aarch64_sve_ld1:
3197 return instCombineSVELD1(IC, II, DL);
3198 case Intrinsic::aarch64_sve_st1:
3199 return instCombineSVEST1(IC, II, DL);
3200 case Intrinsic::aarch64_sve_sdiv:
3201 return instCombineSVESDIV(IC, II);
3202 case Intrinsic::aarch64_sve_sel:
3203 return instCombineSVESel(IC, II);
3204 case Intrinsic::aarch64_sve_srshl:
3205 return instCombineSVESrshl(IC, II);
3206 case Intrinsic::aarch64_sve_dupq_lane:
3207 return instCombineSVEDupqLane(IC, II);
3208 case Intrinsic::aarch64_sve_insr:
3209 return instCombineSVEInsr(IC, II);
3210 case Intrinsic::aarch64_sve_whilelo:
3211 return instCombineWhilelo(IC, II);
3212 case Intrinsic::aarch64_sve_ptrue:
3213 return instCombinePTrue(IC, II);
// The third argument is the number of low bits kept by the zero-extend.
3214 case Intrinsic::aarch64_sve_uxtb:
3215 return instCombineSVEUxt(IC, II, 8);
3216 case Intrinsic::aarch64_sve_uxth:
3217 return instCombineSVEUxt(IC, II, 16);
3218 case Intrinsic::aarch64_sve_uxtw:
3219 return instCombineSVEUxt(IC, II, 32);
3220 case Intrinsic::aarch64_sme_in_streaming_mode:
3221 return instCombineInStreamingMode(IC, II);
3222 }
3223
3224 return std::nullopt;
3225}
3226
// Demanded-vector-elements hook for target intrinsics. The line with the
// return type and qualified function name (numbered 3227) is absent from this
// listing.
//
// For the NEON intrinsics listed below, the caller-supplied SimplifyAndSetOp
// callback is invoked on operand 0 with the original demanded-element mask and
// UndefElts. The function itself always returns std::nullopt; UndefElts2 and
// UndefElts3 are not touched here.
3228 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3229 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3230 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3231 SimplifyAndSetOp) const {
3232 switch (II.getIntrinsicID()) {
3233 default:
3234 break;
3235 case Intrinsic::aarch64_neon_fcvtxn:
3236 case Intrinsic::aarch64_neon_rshrn:
3237 case Intrinsic::aarch64_neon_sqrshrn:
3238 case Intrinsic::aarch64_neon_sqrshrun:
3239 case Intrinsic::aarch64_neon_sqshrn:
3240 case Intrinsic::aarch64_neon_sqshrun:
3241 case Intrinsic::aarch64_neon_sqxtn:
3242 case Intrinsic::aarch64_neon_sqxtun:
3243 case Intrinsic::aarch64_neon_uqrshrn:
3244 case Intrinsic::aarch64_neon_uqshrn:
3245 case Intrinsic::aarch64_neon_uqxtn:
3246 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3247 break;
3248 }
3249
3250 return std::nullopt;
3251}
3252
// Body of a boolean query on the subtarget. Its signature line and the line
// that completes the expression (numbered 3253 and 3255) are absent from this
// listing, so the function name cannot be confirmed from here.
// Visible part: true when ST->isSVEAvailable(), or when
// ST->isSVEorStreamingSVEAvailable() holds together with a further condition
// that is not visible.
3254 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3256}
3257
// Body of a query that maps a register kind K to a TypeSize. The signature
// lines and every `case` label (numbered 3258-3259, 3261, 3263, 3272, 3274)
// are absent from this listing, so which kind each group below belongs to
// must be confirmed against the full file.
3260 switch (K) {
// First group: a fixed 64 bits.
3262 return TypeSize::getFixed(64);
// Second group: fixed-width vectors. With SVE used for fixed-length vectors
// (and SVE available, or EnableFixedwidthAutovecInStreamingMode set), the
// width is the minimum SVE vector size but never below 128; otherwise 128
// when NEON is available, else 0.
3264 if (ST->useSVEForFixedLengthVectors() &&
3265 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3266 return TypeSize::getFixed(
3267 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3268 else if (ST->isNeonAvailable())
3269 return TypeSize::getFixed(128);
3270 else
3271 return TypeSize::getFixed(0);
// Third group: scalable vectors. 128 (scalable) when SVE is usable, else 0.
// Part of the condition (numbered 3274) is not visible here.
3273 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3275 return TypeSize::getScalable(128);
3276 else
3277 return TypeSize::getScalable(0);
3278 }
3279 llvm_unreachable("Unsupported register kind");
3280}
3281
3282bool AArch64TTIImpl::isSingleExtWideningInstruction(
3283 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3284 Type *SrcOverrideTy) const {
3285 // A helper that returns a vector type from the given type. The number of
3286 // elements in type Ty determines the vector width.
3287 auto toVectorTy = [&](Type *ArgTy) {
3288 return VectorType::get(ArgTy->getScalarType(),
3289 cast<VectorType>(DstTy)->getElementCount());
3290 };
3291
3292 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3293 // i32, i64]. SVE doesn't generally have the same set of instructions to
3294 // perform an extend with the add/sub/mul. There are SMULLB style
3295 // instructions, but they operate on top/bottom, requiring some sort of lane
3296 // interleaving to be used with zext/sext.
3297 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3298 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3299 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3300 return false;
3301
3302 Type *SrcTy = SrcOverrideTy;
3303 switch (Opcode) {
3304 case Instruction::Add: // UADDW(2), SADDW(2).
3305 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3306 // The second operand needs to be an extend
3307 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3308 if (!SrcTy)
3309 SrcTy =
3310 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3311 break;
3312 }
3313
3314 if (Opcode == Instruction::Sub)
3315 return false;
3316
3317 // UADDW(2), SADDW(2) can be commutted.
3318 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3319 if (!SrcTy)
3320 SrcTy =
3321 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3322 break;
3323 }
3324 return false;
3325 }
3326 default:
3327 return false;
3328 }
3329
3330 // Legalize the destination type and ensure it can be used in a widening
3331 // operation.
3332 auto DstTyL = getTypeLegalizationCost(DstTy);
3333 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3334 return false;
3335
3336 // Legalize the source type and ensure it can be used in a widening
3337 // operation.
3338 assert(SrcTy && "Expected some SrcTy");
3339 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3340 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3341 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3342 return false;
3343
3344 // Get the total number of vector elements in the legalized types.
3345 InstructionCost NumDstEls =
3346 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3347 InstructionCost NumSrcEls =
3348 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3349
3350 // Return true if the legalized types have the same number of vector elements
3351 // and the destination element type size is twice that of the source type.
3352 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3353}
3354
// Decides whether an Add/Sub/Mul producing DstTy, whose operands are
// extends, can be treated as a widening instruction, and if so returns the
// vector type the operands need to be extended to (DstTy with elements of
// twice the widest source element size). Returns nullptr when the opcode,
// destination type or operands do not qualify, or when the resulting type
// would be 64 bits or narrower in total.
//
// SrcOverrideTy, when non-null, supplies the pre-extend element size for both
// operands instead of the types of the extends' own operands.
//
// NOTE: the parameter line declaring `Args` (numbered 3356) is absent from
// this listing.
3355Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3357 Type *SrcOverrideTy) const {
3358 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3359 Opcode != Instruction::Mul)
3360 return nullptr;
3361
3362 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3363 // i32, i64]. SVE doesn't generally have the same set of instructions to
3364 // perform an extend with the add/sub/mul. There are SMULLB style
3365 // instructions, but they operate on top/bottom, requiring some sort of lane
3366 // interleaving to be used with zext/sext.
3367 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3368 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3369 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3370 return nullptr;
3371
// Element size, in bits, of the value feeding extend V (or of SrcOverrideTy).
3372 auto getScalarSizeWithOverride = [&](const Value *V) {
3373 if (SrcOverrideTy)
3374 return SrcOverrideTy->getScalarSizeInBits();
3375 return cast<Instruction>(V)
3376 ->getOperand(0)
3377 ->getType()
3378 ->getScalarSizeInBits();
3379 };
3380
3381 unsigned MaxEltSize = 0;
// Case 1: both operands are the same kind of extend (sext/sext or zext/zext).
3382 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3383 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3384 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3385 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3386 MaxEltSize = std::max(EltSize0, EltSize1);
// Case 2: both operands are extends, but of different kinds.
3387 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3388 isa<SExtInst, ZExtInst>(Args[1])) {
3389 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3390 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3391 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3392 // enough.
3393 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3394 return nullptr;
3395 MaxEltSize = DstEltSize / 2;
// Case 3: a multiply with one zext operand and one arbitrary operand.
3396 } else if (Opcode == Instruction::Mul &&
3397 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3398 // If one of the operands is a Zext and the other has enough zero bits
3399 // to be treated as unsigned, we can still generate a umull, meaning the
3400 // zext is free.
3401 KnownBits Known =
3402 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
// Reject when the non-zext operand may need more than half of the
// destination element's bits.
3403 if (Args[0]->getType()->getScalarSizeInBits() -
3404 Known.Zero.countLeadingOnes() >
3405 DstTy->getScalarSizeInBits() / 2)
3406 return nullptr;
3407
3408 MaxEltSize =
3409 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3410 } else
3411 return nullptr;
3412
// The widest source element must fit in half a destination element.
3413 if (MaxEltSize * 2 > DstEltSize)
3414 return nullptr;
3415
3416 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3417 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3418 return nullptr;
3419 return ExtTy;
3420}
3421
3422// s/urhadd instructions implement the following pattern, making the
3423// extends free:
3424// %x = add ((zext i8 -> i16), 1)
3425// %y = (zext i8 -> i16)
3426// trunc i16 (lshr (add %x, %y), 1) -> i8
3427//
// Returns true when ExtUser (an add that uses the extend) is part of that
// pattern. Src is the pre-extend type. The line with the function name and
// its first parameters (numbered 3428) is absent from this listing;
// presumably this is isExtPartOfAvgExpr, which the cast-cost code calls with
// (SingleUser, Dst, Src) -- TODO confirm.
3429 Type *Src) const {
3430 // The source should be a legal vector type.
3431 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3432 (Src->isScalableTy() && !ST->hasSVE2()))
3433 return false;
3434
3435 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3436 return false;
3437
3438 // Look for trunc/lshr/add before trying to match the pattern.
3439 const Instruction *Add = ExtUser;
// If the add's only user is itself an add, that outer add is the one matched.
3440 auto *AddUser =
3441 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3442 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3443 Add = AddUser;
3444
3445 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3446 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3447 return false;
3448
// The truncate must narrow back to the element size of the source type.
3449 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3450 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3451 Src->getScalarSizeInBits() !=
3452 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3453 return false;
3454
3455 // Try to match the whole pattern. Ext could be either the first or second
3456 // m_ZExtOrSExt matched.
3457 Instruction *Ex1, *Ex2;
3458 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3459 m_c_Add(m_Instruction(Ex2), m_One())))))
3460 return false;
3461
3462 // Ensure both extends are of the same type
3463 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3464 Ex1->getOpcode() == Ex2->getOpcode())
3465 return true;
3466
3467 return false;
3468}
3469
3471 Type *Src,
3474 const Instruction *I) const {
3475 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3476 assert(ISD && "Invalid opcode");
3477 // If the cast is observable, and it is used by a widening instruction (e.g.,
3478 // uaddl, saddw, etc.), it may be free.
3479 if (I && I->hasOneUser()) {
3480 auto *SingleUser = cast<Instruction>(*I->user_begin());
3481 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3482 if (Type *ExtTy = isBinExtWideningInstruction(
3483 SingleUser->getOpcode(), Dst, Operands,
3484 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3485 // The cost from Src->Src*2 needs to be added if required, the cost from
3486 // Src*2->ExtTy is free.
3487 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3488 Type *DoubleSrcTy =
3489 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3490 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3492 }
3493
3494 return 0;
3495 }
3496
3497 if (isSingleExtWideningInstruction(
3498 SingleUser->getOpcode(), Dst, Operands,
3499 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3500 // For adds only count the second operand as free if both operands are
3501 // extends but not the same operation. (i.e both operands are not free in
3502 // add(sext, zext)).
3503 if (SingleUser->getOpcode() == Instruction::Add) {
3504 if (I == SingleUser->getOperand(1) ||
3505 (isa<CastInst>(SingleUser->getOperand(1)) &&
3506 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3507 return 0;
3508 } else {
3509 // Others are free so long as isSingleExtWideningInstruction
3510 // returned true.
3511 return 0;
3512 }
3513 }
3514
3515 // The cast will be free for the s/urhadd instructions
3516 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3517 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3518 return 0;
3519 }
3520
3521 EVT SrcTy = TLI->getValueType(DL, Src);
3522 EVT DstTy = TLI->getValueType(DL, Dst);
3523
3524 if (!SrcTy.isSimple() || !DstTy.isSimple())
3525 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3526
3527 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3528 // we use fcvtx under SVE2. Give them invalid costs.
3529 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3530 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3531 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3533
3534 static const TypeConversionCostTblEntry BF16Tbl[] = {
3535 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3536 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3537 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3538 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3539 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3540 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3541 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3542 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3543 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3544 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3545 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3546 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3547 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3548 };
3549
3550 if (ST->hasBF16())
3551 if (const auto *Entry = ConvertCostTableLookup(
3552 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3553 return Entry->Cost;
3554
3555 // We have to estimate a cost of fixed length operation upon
3556 // SVE registers(operations) with the number of registers required
3557 // for a fixed type to be represented upon SVE registers.
3558 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3559 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3560 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3561 ST->useSVEForFixedLengthVectors(WiderTy)) {
3562 std::pair<InstructionCost, MVT> LT =
3563 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3564 unsigned NumElements =
3565 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3566 return LT.first *
3568 Opcode,
3569 ScalableVectorType::get(Dst->getScalarType(), NumElements),
3570 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3571 CostKind, I);
3572 }
3573
3574 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3575 // The cost of unpacking twice is artificially increased for now in order
3576 // to avoid regressions against NEON, which will use tbl instructions directly
3577 // instead of multiple layers of [s|u]unpk[lo|hi].
3578 // We use the unpacks in cases where the destination type is illegal and
3579 // requires splitting of the input, even if the input type itself is legal.
3580 const unsigned int SVE_EXT_COST = 1;
3581 const unsigned int SVE_FCVT_COST = 1;
3582 const unsigned int SVE_UNPACK_ONCE = 4;
3583 const unsigned int SVE_UNPACK_TWICE = 16;
3584
3585 static const TypeConversionCostTblEntry ConversionTbl[] = {
3586 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3587 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3588 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3589 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3590 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3591 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3592 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3593 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3594 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3595 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3596 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3597 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3598 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3599 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3600 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3601 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3602 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3603 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3604 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3605 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3606
3607 // Truncations on nxvmiN
3608 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3609 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3610 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3611 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3612 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3613 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3614 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3615 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3616 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3617 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3618 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3619 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3620 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3621 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3622 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3623 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3624 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3625 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3626 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3627 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3628 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3629 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3630 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3631 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3632 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3633 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3634 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3635 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3636 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3637 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3638 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3639 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3640 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3641
3642 // The number of shll instructions for the extension.
3643 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3644 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3645 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3646 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3647 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3648 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3649 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3650 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3651 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3652 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3653 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3654 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3655 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3656 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3657 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3658 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3659
3660 // FP Ext and trunc
3661 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3662 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3663 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3664 // FP16
3665 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3666 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3667 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3668 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3669 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3670 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3671 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3672 // BF16 (uses shift)
3673 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3674 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3675 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3676 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3677 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3678 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3679 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3680 // FP Ext and trunc
3681 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3682 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3683 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3684 // FP16
3685 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3686 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3687 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3688 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3689 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3690 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3691 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3692 // BF16 (more complex, with +bf16 is handled above)
3693 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3694 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3695 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3696 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3697 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3698 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3699 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3700 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3701
3702 // LowerVectorINT_TO_FP:
3703 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3704 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3705 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3706 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3707 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3708 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3709
3710 // SVE: to nxv2f16
3711 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3712 SVE_EXT_COST + SVE_FCVT_COST},
3713 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3714 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3715 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3716 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3717 SVE_EXT_COST + SVE_FCVT_COST},
3718 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3719 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3720 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3721
3722 // SVE: to nxv4f16
3723 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3724 SVE_EXT_COST + SVE_FCVT_COST},
3725 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3726 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3727 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3728 SVE_EXT_COST + SVE_FCVT_COST},
3729 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3730 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3731
3732 // SVE: to nxv8f16
3733 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3734 SVE_EXT_COST + SVE_FCVT_COST},
3735 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3736 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3737 SVE_EXT_COST + SVE_FCVT_COST},
3738 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3739
3740 // SVE: to nxv16f16
3741 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3742 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3743 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3744 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3745
3746 // Complex: to v2f32
3747 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3748 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3749 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3750 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3751
3752 // SVE: to nxv2f32
3753 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3754 SVE_EXT_COST + SVE_FCVT_COST},
3755 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3756 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3757 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3758 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3759 SVE_EXT_COST + SVE_FCVT_COST},
3760 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3761 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3762 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3763
3764 // Complex: to v4f32
3765 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3766 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3767 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3768 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3769
3770 // SVE: to nxv4f32
// Signed and unsigned sources are costed identically, mirroring the other
// "SVE: to ..." groups in this table: an i8 source is charged an extend plus
// the convert, i16 and i32 sources are charged the convert alone.
3771 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3772 SVE_EXT_COST + SVE_FCVT_COST},
3773 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3774 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3775 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3776 SVE_EXT_COST + SVE_FCVT_COST},
3777 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
// Unsigned counterpart of the SINT_TO_FP nxv4i32 entry above.
3778 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3779
3780 // Complex: to v8f32
3781 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3782 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3783 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3784 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3785
3786 // SVE: to nxv8f32
3787 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3788 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3789 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3790 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3791 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3792 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3793 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3794 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3795
3796 // SVE: to nxv16f32
3797 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3798 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3799 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3800 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3801
3802 // Complex: to v16f32
3803 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3804 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3805
3806 // Complex: to v2f64
3807 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3808 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3809 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3810 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3811 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3812 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3813
3814 // SVE: to nxv2f64
3815 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3816 SVE_EXT_COST + SVE_FCVT_COST},
3817 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3818 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3819 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3820 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3821 SVE_EXT_COST + SVE_FCVT_COST},
3822 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3823 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3824 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3825
3826 // Complex: to v4f64
3827 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3828 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3829
3830 // SVE: to nxv4f64
3831 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3832 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3833 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3834 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3835 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3836 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3837 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3838 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3839 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3840 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3841 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3842 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3843
3844 // SVE: to nxv8f64
3845 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3846 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3847 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3848 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3849 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3850 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3851 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3852 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3853
3854 // LowerVectorFP_TO_INT
3855 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3856 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3857 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3858 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3859 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3860 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3861
3862 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3863 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3864 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3865 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3866 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3867 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3868 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3869
3870 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3871 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3872 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3873 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3874 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3875
3876 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3877 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3878 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3879 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3880 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3881 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3882 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3883
3884 // Complex, from nxv2f32.
3885 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3886 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3887 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3888 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3889 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3890 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3891 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3892 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3893
3894 // Complex, from nxv2f64.
3895 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3896 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3897 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3898 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3899 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3900 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3901 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3902 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3903 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3904 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3905
3906 // Complex, from nxv4f32.
3907 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3908 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3909 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3910 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3911 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3912 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3913 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3914 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3915 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3916 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3917
3918 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3919 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3920 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3921 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3922 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3923
3924 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3925 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3926 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3927 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3928 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3929 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3930 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3931
3932 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3933 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3934 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3935 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3936 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3937
3938 // Complex, from nxv8f16.
3939 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3940 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3941 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3942 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3943 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3944 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3945 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3946 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3947 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3948 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3949
3950 // Complex, from nxv4f16.
3951 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3952 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3953 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3954 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3955 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3956 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3957 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3958 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3959
3960 // Complex, from nxv2f16.
3961 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3962 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3963 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3964 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3965 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3966 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3967 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3968 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3969
3970 // Truncate from nxvmf32 to nxvmf16.
3971 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3972 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3973 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3974
3975 // Truncate from nxvmf32 to nxvmbf16.
3976 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3977 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3978 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3979
3980 // Truncate from nxvmf64 to nxvmf16.
3981 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3982 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3983 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3984
3985 // Truncate from nxvmf64 to nxvmbf16.
3986 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3987 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3988 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3989
3990 // Truncate from nxvmf64 to nxvmf32.
3991 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3992 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3993 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3994
3995 // Extend from nxvmf16 to nxvmf32.
3996 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3997 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3998 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3999
4000 // Extend from nxvmbf16 to nxvmf32.
4001 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
4002 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
4003 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
4004
4005 // Extend from nxvmf16 to nxvmf64.
4006 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
4007 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
4008 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
4009
4010 // Extend from nxvmbf16 to nxvmf64.
4011 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
4012 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
4013 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
4014
4015 // Extend from nxvmf32 to nxvmf64.
4016 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
4017 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
4018 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
4019
4020 // Bitcasts from float to integer
4021 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
4022 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
4023 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
4024
4025 // Bitcasts from integer to float
4026 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
4027 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
4028 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
4029
4030 // Add cost for extending to illegal -too wide- scalable vectors.
4031 // zero/sign extend are implemented by multiple unpack operations,
4032 // where each operation has a cost of 1.
4033 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
4034 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
4035 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
4036 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
4037 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
4038 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
4039
4040 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
4041 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
4042 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
4043 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
4044 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
4045 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
4046 };
4047
4048 if (const auto *Entry = ConvertCostTableLookup(
4049 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
4050 return Entry->Cost;
4051
4052 static const TypeConversionCostTblEntry FP16Tbl[] = {
4053 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
4054 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
4055 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
4056 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
4057 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
4058 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
4059 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
4060 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
4061 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
4062 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
4063 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
4064 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
4065 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
4066 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
4067 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
4068 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
4069 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
4070 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
4071 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
4072 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
4073 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
4074 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
4075 };
4076
4077 if (ST->hasFullFP16())
4078 if (const auto *Entry = ConvertCostTableLookup(
4079 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
4080 return Entry->Cost;
4081
4082 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
4083 // double-rounding issues.
4084 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
4085 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
4087 return cast<FixedVectorType>(Dst)->getNumElements() *
4088 getCastInstrCost(Opcode, Dst->getScalarType(),
4089 Src->getScalarType(), CCH, CostKind) +
4091 true, CostKind) +
4093 false, CostKind);
4094
4095 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
4097 ST->isSVEorStreamingSVEAvailable() &&
4098 TLI->getTypeAction(Src->getContext(), SrcTy) ==
4100 TLI->getTypeAction(Dst->getContext(), DstTy) ==
4102 // The standard behaviour in the backend for these cases is to split the
4103 // extend up into two parts:
4104 // 1. Perform an extending load or masked load up to the legal type.
4105 // 2. Extend the loaded data to the final type.
4106 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
4107 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
4109 Opcode, LegalTy, Src, CCH, CostKind, I);
4111 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
4112 return Part1 + Part2;
4113 }
4114
4115 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
4116 // but we also want to include the TTI::CastContextHint::Masked case too.
4117 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
4119 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
4121
4122 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
4123}
4124
4127 VectorType *VecTy, unsigned Index,
4129
// Cost of extracting element `Index` from a vector of type `VecTy` and then
// sign- or zero-extending that element to `Dst` (`Opcode` must be SExt or
// ZExt). The result always includes the extract cost; the cost of a separate
// extend is added only on the paths below where the extend is not treated as
// free.
// NOTE(review): the opening lines of this signature and the line that closes
// it are missing from this listing; `Opcode`, `Dst` and `CostKind` are assumed
// to be parameters declared there -- confirm against the full file.
4130 // Make sure we were given a valid extend opcode.
4131 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
4132 "Invalid opcode");
4133
4134 // We are extending an element we extract from a vector, so the source type
4135 // of the extend is the element type of the vector.
4136 auto *Src = VecTy->getElementType();
4137
4138 // Sign- and zero-extends are for integer types only.
4139 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
4140
4141 // Get the cost for the extract. We compute the cost (if any) for the extend
4142 // below.
4143 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
4144 CostKind, Index, nullptr, nullptr);
4145
4146 // Legalize the types.
4147 auto VecLT = getTypeLegalizationCost(VecTy);
4148 auto DstVT = TLI->getValueType(DL, Dst);
4149 auto SrcVT = TLI->getValueType(DL, Src);
4150
4151 // If the resulting type is still a vector and the destination type is legal,
4152 // we may get the extension for free. If not, get the default cost for the
4153 // extend.
4154 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
4155 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4156 CostKind);
4157
4158 // The destination type should be larger than the element type. If not, get
4159 // the default cost for the extend.
4160 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
4161 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4162 CostKind);
4163
// At this point the legalized vector type is still a vector, DstVT is legal
// and DstVT is at least as wide as SrcVT. Decide per opcode whether the
// extend is free.
4164 switch (Opcode) {
4165 default:
4166 llvm_unreachable("Opcode should be either SExt or ZExt");
4167
4168 // For sign-extends, we only need a smov, which performs the extension
4169 // automatically.
4170 case Instruction::SExt:
4171 return Cost;
4172
4173 // For zero-extends, the extend is performed automatically by a umov unless
4174 // the destination type is i64 and the element type is i8 or i16.
// In code terms: free unless DstVT is 64 bits wide and SrcVT is not 32 bits
// wide; that remaining case leaves the switch and takes the default cost.
4175 case Instruction::ZExt:
4176 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
4177 return Cost;
4178 }
4179
4180 // If we are unable to perform the extend for free, get the default cost.
4181 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4182 CostKind);
4183}
4184
4187 const Instruction *I) const {
// Cost of a control-flow instruction identified by `Opcode`.
// NOTE(review): the start of this signature and the condition guarding the
// first return below are missing from this listing. Given the assert that
// follows, the guard presumably tests CostKind != TTI::TCK_RecipThroughput
// -- confirm against the full file.
// On the guarded path a PHI costs 0 and every other opcode costs 1.
4189 return Opcode == Instruction::PHI ? 0 : 1;
// Past the guard only reciprocal-throughput costing is expected.
4190 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
4191 // Branches are assumed to be predicted.
4192 return 0;
4193}
4194
4195InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
4196 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4197 const Instruction *I, Value *Scalar,
4198 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4199 TTI::VectorInstrContext VIC) const {
4200 assert(Val->isVectorTy() && "This must be a vector type");
4201
4202 if (Index != -1U) {
4203 // Legalize the type.
4204 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4205
4206 // This type is legalized to a scalar type.
4207 if (!LT.second.isVector())
4208 return 0;
4209
4210 // The type may be split. For fixed-width vectors we can normalize the
4211 // index to the new type.
4212 if (LT.second.isFixedLengthVector()) {
4213 unsigned Width = LT.second.getVectorNumElements();
4214 Index = Index % Width;
4215 }
4216
4217 // The element at index zero is already inside the vector.
4218 // - For a insert-element or extract-element
4219 // instruction that extracts integers, an explicit FPR -> GPR move is
4220 // needed. So it has non-zero cost.
4221 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
4222 return 0;
4223
4224 // This is recognising a LD1 single-element structure to one lane of one
4225 // register instruction. I.e., if this is an `insertelement` instruction,
4226 // and its second operand is a load, then we will generate a LD1, which
4227 // are expensive instructions on some uArchs.
4228 if (VIC == TTI::VectorInstrContext::Load) {
4229 if (ST->hasFastLD1Single())
4230 return 0;
4231 return CostKind == TTI::TCK_CodeSize
4232 ? 0
4234 }
4235
4236 // i1 inserts and extract will include an extra cset or cmp of the vector
4237 // value. Increase the cost by 1 to account.
4238 if (Val->getScalarSizeInBits() == 1)
4239 return CostKind == TTI::TCK_CodeSize
4240 ? 2
4241 : ST->getVectorInsertExtractBaseCost() + 1;
4242
4243 // FIXME:
4244 // If the extract-element and insert-element instructions could be
4245 // simplified away (e.g., could be combined into users by looking at use-def
4246 // context), they have no cost. This is not done in the first place for
4247 // compile-time considerations.
4248 }
4249
4250 // In case of Neon, if there exists extractelement from lane != 0 such that
4251 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4252 // 2. extractelement result feeds into fmul.
4253 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4254 // equivalent to 0.
4255 // then the extractelement can be merged with fmul in the backend and it
4256 // incurs no cost.
4257 // e.g.
4258 // define double @foo(<2 x double> %a) {
4259 // %1 = extractelement <2 x double> %a, i32 0
4260 // %2 = extractelement <2 x double> %a, i32 1
4261 // %res = fmul double %1, %2
4262 // ret double %res
4263 // }
4264 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4265 auto ExtractCanFuseWithFmul = [&]() {
4266 // We bail out if the extract is from lane 0.
4267 if (Index == 0)
4268 return false;
4269
4270 // Check if the scalar element type of the vector operand of ExtractElement
4271 // instruction is one of the allowed types.
4272 auto IsAllowedScalarTy = [&](const Type *T) {
4273 return T->isFloatTy() || T->isDoubleTy() ||
4274 (T->isHalfTy() && ST->hasFullFP16());
4275 };
4276
4277 // Check if the extractelement user is scalar fmul.
4278 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4279 // Check if the user is scalar fmul.
4280 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4281 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4282 !BO->getType()->isVectorTy();
4283 };
4284
4285 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4286 // certain scalar type and a certain vector register width.
4287 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4288 auto RegWidth =
4290 .getFixedValue();
4291 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4292 };
4293
4294 // Check if the type constraints on input vector type and result scalar type
4295 // of extractelement instruction are satisfied.
4296 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4297 return false;
4298
4299 if (Scalar) {
4300 DenseMap<User *, unsigned> UserToExtractIdx;
4301 for (auto *U : Scalar->users()) {
4302 if (!IsUserFMulScalarTy(U))
4303 return false;
4304 // Recording entry for the user is important. Index value is not
4305 // important.
4306 UserToExtractIdx[U];
4307 }
4308 if (UserToExtractIdx.empty())
4309 return false;
4310 for (auto &[S, U, L] : ScalarUserAndIdx) {
4311 for (auto *U : S->users()) {
4312 if (UserToExtractIdx.contains(U)) {
4313 auto *FMul = cast<BinaryOperator>(U);
4314 auto *Op0 = FMul->getOperand(0);
4315 auto *Op1 = FMul->getOperand(1);
4316 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4317 UserToExtractIdx[U] = L;
4318 break;
4319 }
4320 }
4321 }
4322 }
4323 for (auto &[U, L] : UserToExtractIdx) {
4324 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4325 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4326 return false;
4327 }
4328 } else {
4329 const auto *EE = cast<ExtractElementInst>(I);
4330
4331 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4332 if (!IdxOp)
4333 return false;
4334
4335 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4336 if (!IsUserFMulScalarTy(U))
4337 return false;
4338
4339 // Check if the other operand of extractelement is also extractelement
4340 // from lane equivalent to 0.
4341 const auto *BO = cast<BinaryOperator>(U);
4342 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4343 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4344 if (OtherEE) {
4345 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4346 if (!IdxOp)
4347 return false;
4348 return IsExtractLaneEquivalentToZero(
4349 cast<ConstantInt>(OtherEE->getIndexOperand())
4350 ->getValue()
4351 .getZExtValue(),
4352 OtherEE->getType()->getScalarSizeInBits());
4353 }
4354 return true;
4355 });
4356 }
4357 return true;
4358 };
4359
4360 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4361 ExtractCanFuseWithFmul())
4362 return 0;
4363
4364 // All other insert/extracts cost this much.
4365 return CostKind == TTI::TCK_CodeSize ? 1
4366 : ST->getVectorInsertExtractBaseCost();
4367}
4368
// Insert/extract cost query for callers that only know the operand values
// (Op0, Op1) rather than a concrete instruction.
4370 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4371 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
4372 // Treat insert at lane 0 into a poison vector as having zero cost. This
4373 // ensures vector broadcasts via an insert + shuffle (which will be lowered to
4374 // a single dup) are treated as cheap.
4375 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4376 isa<PoisonValue>(Op0))
4377 return 0;
 // No instruction or scalar context is available on this path, so the helper
 // receives null for both and an empty user list.
4378 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr,
4379 nullptr, {}, VIC);
4380}
4381
// Insert/extract cost query for callers that know the scalar value and a list
// of (scalar, user, lane) tuples but have no instruction yet.
4383 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4384 Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4385 TTI::VectorInstrContext VIC) const {
 // Forward the scalar and its user list unchanged; the instruction argument is
 // null.
4386 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4387 ScalarUserAndIdx, VIC);
4388}
4389
// Insert/extract cost query driven by an existing instruction I.
4392 TTI::TargetCostKind CostKind, unsigned Index,
4393 TTI::VectorInstrContext VIC) const {
 // The opcode is taken from I, and I itself is passed as context; no scalar
 // or user list is supplied.
4394 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I,
4395 nullptr, {}, VIC);
4396}
4397
// NOTE(review): presumably costs an insert/extract whose index is counted
// from the end of the vector (inferred from the "extract the last element"
// comment below) -- confirm against the declaration.
4401 unsigned Index) const {
 // Fixed-width vectors are handled separately from the scalable case below.
4402 if (isa<FixedVectorType>(Val))
4404 Index);
4405
4406 // This typically requires both while and lastb instructions in order
4407 // to extract the last element. If this is in a loop the while
4408 // instruction can at least be hoisted out, although it will consume a
4409 // predicate register. The cost should be more expensive than the base
4410 // extract cost, which is 2 for most CPUs.
4411 return CostKind == TTI::TCK_CodeSize
4412 ? 2
4413 : ST->getVectorInsertExtractBaseCost() + 1;
4414}
4415
// Estimates the cost of inserting and/or extracting the demanded elements of
// Ty one at a time.
4417 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4418 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4419 TTI::VectorInstrContext VIC) const {
 // Floating-point element types defer to the generic implementation.
4422 if (Ty->getElementType()->isFloatingPointTy())
4423 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4424 CostKind);
 // Otherwise charge one base insert/extract cost (1 for code size) per demanded
 // element for each of the Insert and Extract flags that is set.
4425 unsigned VecInstCost =
4426 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4427 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4428}
4429
// Computes the cost of performing an operation on half/bfloat values by
// promoting to float. Returns std::nullopt when no promotion is modelled:
// the scalar type is neither half nor bfloat, it is half and full FP16 is
// available, or CanUseSVE is set and SVE B16B16 can be used. InstCost is
// invoked with the promoted type to obtain the cost of the operation itself.
4430std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4432 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4433 std::function<InstructionCost(Type *)> InstCost) const {
4434 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4435 return std::nullopt;
4436 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4437 return std::nullopt;
4438 // If we have +sve-b16b16 the operation can be promoted to SVE.
4439 if (CanUseSVE && ST->hasSVEB16B16() && ST->isNonStreamingSVEorSME2Available())
4440 return std::nullopt;
4441
 // Same shape as Ty, but with float elements.
4442 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4443 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
 // The extend cost is doubled only when neither operand is a constant.
4445 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4446 Cost *= 2;
4447 Cost += InstCost(PromotedTy);
 // Optionally account for truncating the result back to the original type.
4448 if (IncludeTrunc)
4449 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4451 return Cost;
4452}
4453
// Cost of an arithmetic instruction with opcode Opcode on type Ty. The type is
// legalized first and the result is then chosen per ISD opcode in the switch
// below; anything not handled falls back to the generic implementation.
4455 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4457 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4458
4459 // The code-generator is currently not able to handle scalable vectors
4460 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4461 // it. This change will be removed when code-generation for these types is
4462 // sufficiently reliable.
4463 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4464 if (VTy->getElementCount() == ElementCount::getScalable(1))
4466
4467 // TODO: Handle more cost kinds.
4469 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4470 Op2Info, Args, CxtI);
4471
4472 // Legalize the type.
4473 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4474 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4475
4476 // Increase the cost for half and bfloat types if not architecturally
4477 // supported.
4478 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4479 ISD == ISD::FDIV || ISD == ISD::FREM) {
4480 if (auto PromotedCost = getFP16BF16PromoteCost(
4481 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4482 // There is no native support for fdiv/frem even with +sve-b16b16.
4483 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4484 [&](Type *PromotedTy) {
4485 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4486 Op1Info, Op2Info);
4487 }))
4488 return *PromotedCost;
4489
4490 // fp128 operations all go via libcalls; charge a flat cost per legalized
4491 // part (1 for code size, 10 otherwise).
4491 if (Ty->getScalarType()->isFP128Ty())
4492 return (CostKind == TTI::TCK_CodeSize ? 1 : 10) * LT.first;
4493 }
4494
4495 // If the operation is a widening instruction (smull or umull) and both
4496 // operands are extends the cost can be cheaper by considering that the
4497 // operation will operate on the narrowest type size possible (double the
4498 // largest input size) and a further extend.
4499 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4500 if (ExtTy != Ty)
4501 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4502 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4504 return LT.first;
4505 }
4506
4507 switch (ISD) {
4508 default:
4509 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4510 Op2Info);
4511 case ISD::ADD:
4512 case ISD::SUB:
4513 return LT.first; // Also works for i128
4514 case ISD::MUL:
4515 if (LT.second == MVT::v2i64) {
4516 // When SVE is available, then we can lower the v2i64 operation using
4517 // the SVE mul instruction, which has a lower cost.
4518 if (ST->hasSVE())
4519 return LT.first;
4520
4521 // When SVE is not available, there is no MUL.2d instruction,
4522 // which means mul <2 x i64> is expensive as elements are extracted
4523 // from the vectors and the muls scalarized.
4524 // As getScalarizationOverhead is a bit too pessimistic, we
4525 // estimate the cost for a i64 vector directly here, which is:
4526 // - four 2-cost i64 extracts,
4527 // - two 2-cost i64 inserts, and
4528 // - two 1-cost muls.
4529 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4530 // LT.first = 2 the cost is 28.
4531 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4532 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4533 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4534 nullptr, nullptr) *
4535 2 +
4536 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4537 nullptr, nullptr));
4538 }
4539 return LT.first;
4540 case ISD::SREM:
4541 case ISD::SDIV:
4542 /*
4543 Notes for sdiv/srem specific costs:
4544 1. This only considers the cases where the divisor is constant, uniform and
4545 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4546 result in some form of (ldr + adrp), corresponding to constant vectors, or
4547 scalarization of the division operation.
4548 2. Constant divisors, either negative in whole or partially, don't result in
4549 significantly different codegen as compared to positive constant divisors.
4550 So, we don't consider negative divisors separately.
4551 3. If the codegen is significantly different with SVE, it has been indicated
4552 using comments at appropriate places.
4553
4554 sdiv specific cases:
4555 -----------------------------------------------------------------------
4556 codegen | pow-of-2 | Type
4557 -----------------------------------------------------------------------
4558 add + cmp + csel + asr | Y | i64
4559 add + cmp + csel + asr | Y | i32
4560 -----------------------------------------------------------------------
4561
4562 srem specific cases:
4563 -----------------------------------------------------------------------
4564 codegen | pow-of-2 | Type
4565 -----------------------------------------------------------------------
4566 negs + and + and + csneg | Y | i64
4567 negs + and + and + csneg | Y | i32
4568 -----------------------------------------------------------------------
4569
4570 other sdiv/srem cases:
4571 -------------------------------------------------------------------------
4572 common codegen | + srem | + sdiv | pow-of-2 | Type
4573 -------------------------------------------------------------------------
4574 smulh + asr + add + add | - | - | N | i64
4575 smull + lsr + add + add | - | - | N | i32
4576 usra | and + sub | sshr | Y | <2 x i64>
4577 2 * (scalar code) | - | - | N | <2 x i64>
4578 usra | bic + sub | sshr + neg | Y | <4 x i32>
4579 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4580 + sshr + usra | | | |
4581 -------------------------------------------------------------------------
4582 */
4583 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4584 InstructionCost AddCost =
4585 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4586 Op1Info.getNoProps(), Op2Info.getNoProps());
4587 InstructionCost AsrCost =
4588 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4589 Op1Info.getNoProps(), Op2Info.getNoProps());
4590 InstructionCost MulCost =
4591 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4592 Op1Info.getNoProps(), Op2Info.getNoProps());
4593 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4594 // have similar cost.
4595 auto VT = TLI->getValueType(DL, Ty);
4596 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4597 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4598 // Neg can be folded into the asr instruction.
4599 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4600 : (3 * AsrCost + AddCost);
4601 } else {
4602 return MulCost + AsrCost + 2 * AddCost;
4603 }
4604 } else if (VT.isVector()) {
4605 InstructionCost UsraCost = 2 * AsrCost;
4606 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4607 // Division with scalable types corresponds to native 'asrd'
4608 // instruction when SVE is available.
4609 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4610
4611 // One more for the negation in SDIV
4613 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4614 if (Ty->isScalableTy() && ST->hasSVE())
4615 Cost += 2 * AsrCost;
4616 else {
4617 Cost +=
4618 UsraCost +
4619 (ISD == ISD::SDIV
4620 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4621 : 2 * AddCost);
4622 }
4623 return Cost;
4624 } else if (LT.second == MVT::v2i64) {
4625 return VT.getVectorNumElements() *
4626 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4627 Op1Info.getNoProps(),
4628 Op2Info.getNoProps());
4629 } else {
4630 // When SVE is available, we get:
4631 // smulh + lsr + add/sub + asr + add/sub.
4632 if (Ty->isScalableTy() && ST->hasSVE())
4633 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4634 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4635 }
4636 }
4637 }
4638 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4639 LT.second.isFixedLengthVector()) {
4640 // FIXME: When the constant vector is non-uniform, this may result in
4641 // loading the vector from constant pool or in some cases, may also result
4642 // in scalarization. For now, we are approximating this with the
4643 // scalarization cost.
4644 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4645 CostKind, -1, nullptr, nullptr);
4646 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4647 CostKind, -1, nullptr, nullptr);
4648 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4649 return ExtractCost + InsertCost +
4650 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4651 CostKind, Op1Info.getNoProps(),
4652 Op2Info.getNoProps());
4653 }
 // Signed cases not costed above share the generic division costing below.
4654 [[fallthrough]];
4655 case ISD::UDIV:
4656 case ISD::UREM: {
4657 auto VT = TLI->getValueType(DL, Ty);
4658 if (Op2Info.isConstant()) {
4659 // If the operand is a power of 2 we can use the shift or and cost.
4660 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4661 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4662 Op1Info.getNoProps(),
4663 Op2Info.getNoProps());
4664 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4665 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4666 Op1Info.getNoProps(),
4667 Op2Info.getNoProps());
4668
4669 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4670 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4671 // The MULHU will be expanded to UMULL for the types not listed below,
4672 // and will become a pair of UMULL+MULL2 for 128bit vectors.
4673 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4674 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4675 LT.second == MVT::nxv16i8;
4676 bool Is128bit = LT.second.is128BitVector();
4677
4678 InstructionCost MulCost =
4679 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4680 Op1Info.getNoProps(), Op2Info.getNoProps());
4681 InstructionCost AddCost =
4682 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4683 Op1Info.getNoProps(), Op2Info.getNoProps());
4684 InstructionCost ShrCost =
4685 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4686 Op1Info.getNoProps(), Op2Info.getNoProps());
4687 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4688 (HasMULH ? 0 : ShrCost) + // UMULL shift
4689 AddCost * 2 + ShrCost;
 // A remainder additionally needs a multiply and an add/sub on top of the
 // division cost.
4690 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4691 }
4692 }
4693
4694 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4695 // emitted by the backend even when those functions are not declared in the
4696 // module.
4697 if (!VT.isVector() && VT.getSizeInBits() > 64)
4698 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4699
4701 Opcode, Ty, CostKind, Op1Info, Op2Info);
4702 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4703 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4704 // When SDIV/UDIV operations are lowered using SVE, the cost is
4705 // lower.
4706 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4707 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4708 static const CostTblEntry DivTbl[]{
4709 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4710 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4711 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4712 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4713 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4714 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4715
4716 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4717 if (nullptr != Entry)
4718 return Entry->Cost;
4719 }
4720 // For 8/16-bit elements, the cost is higher because the type
4721 // requires promotion and possibly splitting:
4722 if (LT.second.getScalarType() == MVT::i8)
4723 Cost *= 8;
4724 else if (LT.second.getScalarType() == MVT::i16)
4725 Cost *= 4;
4726 return Cost;
4727 } else {
4728 // If one of the operands is a uniform constant then the cost for each
4729 // element is the cost of insertion, extraction and division.
4730 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4731 // operation with scalar type
4732 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4733 (Op2Info.isConstant() && Op2Info.isUniform())) {
4734 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4736 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4737 return (4 + DivCost) * VTy->getNumElements();
4738 }
4739 }
4740 // On AArch64, without SVE, vector divisions are expanded
4741 // into scalar divisions of each pair of elements.
4742 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4743 -1, nullptr, nullptr);
4744 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4745 nullptr, nullptr);
4746 }
4747
4748 // TODO: if one of the arguments is scalar, then it's not necessary to
4749 // double the cost of handling the vector elements.
4750 Cost += Cost;
4751 }
4752 return Cost;
4753 }
4754 case ISD::XOR:
4755 case ISD::OR:
4756 case ISD::AND:
4757 case ISD::SRL:
4758 case ISD::SRA:
4759 case ISD::SHL:
4760 // These nodes are marked as 'custom' for combining purposes only.
4761 // We know that they are legal. See LowerAdd in ISelLowering.
4762 return LT.first;
4763
4764 case ISD::FNEG:
4765 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4766 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4767 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4768 CxtI &&
4769 ((CxtI->hasOneUse() &&
4770 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4771 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4772 return 0;
4773 [[fallthrough]];
4774 case ISD::FADD:
4775 case ISD::FSUB:
4776 if (!Ty->getScalarType()->isFP128Ty())
4777 return LT.first;
4778 [[fallthrough]];
4779 case ISD::FMUL:
4780 case ISD::FDIV:
4781 // These nodes are marked as 'custom' just to lower them to SVE.
4782 // We know said lowering will incur no additional cost.
4783 if (!Ty->getScalarType()->isFP128Ty())
4784 return 2 * LT.first;
4785
4786 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4787 Op2Info);
4788 case ISD::FREM:
4789 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4790 // those functions are not declared in the module.
4791 if (!Ty->isVectorTy())
4792 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4793 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4794 Op2Info);
4795 }
4796}
4797
// Cost of computing the address for a memory access through a pointer of type
// PtrTy; Ptr is the SCEV of that pointer and is only consulted when SE is set.
4800 const SCEV *Ptr,
4802 // Address computations in vectorized code with non-consecutive addresses will
4803 // likely result in more instructions compared to scalar code where the
4804 // computation can more often be merged into the index mode. The resulting
4805 // extra micro-ops can significantly decrease throughput.
4806 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4807 int MaxMergeDistance = 64;
4808
 // Vector pointers whose stride is not a known constant below
 // MaxMergeDistance + 1 are charged the configured overhead.
4809 if (PtrTy->isVectorTy() && SE &&
4810 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4811 return NumVectorInstToHideOverhead;
4812
4813 // In many cases the address computation is not merged into the instruction
4814 // addressing mode.
4815 return 1;
4816}
4817
4818/// Check whether Opcode1 has less throughput according to the scheduling
4819/// model than Opcode2. Returns false when the subtarget has no instruction
4819/// scheduling model or when either scheduling class is invalid.
4821 unsigned Opcode1, unsigned Opcode2) const {
4822 const MCSchedModel &Sched = ST->getSchedModel();
4823 const TargetInstrInfo *TII = ST->getInstrInfo();
 // Nothing can be concluded without a per-instruction scheduling model.
4824 if (!Sched.hasInstrSchedModel())
4825 return false;
4826
4827 const MCSchedClassDesc *SCD1 =
4828 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4829 const MCSchedClassDesc *SCD2 =
4830 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4831 // We cannot handle variant scheduling classes without an MI. If we need to
4832 // support them for any of the instructions we query the information of we
4833 // might need to add a way to resolve them without a MI or not use the
4834 // scheduling info.
4835 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4836 "Cannot handle variant scheduling classes without an MI");
4837 if (!SCD1->isValid() || !SCD2->isValid())
4838 return false;
4839
 // A larger reciprocal throughput means lower throughput.
4840 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4842}
4843
// Cost of a compare or select with opcode Opcode on value type ValTy. Handles
// fixed-vector selects, floating-point compares and a few free integer
// compares specially; everything else uses the generic implementation.
4845 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4847 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4848 // We don't lower some vector selects well that are wider than the register
4849 // width. TODO: Improve this with different cost kinds.
4850 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4851 // We would need this many instructions to hide the scalarization happening.
4852 const int AmortizationCost = 20;
4853
4854 // If VecPred is not set, check if we can get a predicate from the context
4855 // instruction, if its type matches the requested ValTy.
4856 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4857 CmpPredicate CurrentPred;
4858 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4859 m_Value())))
4860 VecPred = CurrentPred;
4861 }
4862 // Check if we have a compare/select chain that can be lowered using
4863 // a (F)CMxx & BFI pair.
4864 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4865 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4866 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4867 VecPred == CmpInst::FCMP_UNE) {
4868 static const auto ValidMinMaxTys = {
4869 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4870 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4871 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4872
4873 auto LT = getTypeLegalizationCost(ValTy);
4874 if (any_of(ValidMinMaxTys, equal_to(LT.second)) ||
4875 (ST->hasFullFP16() &&
4876 any_of(ValidFP16MinMaxTys, equal_to(LT.second))))
4877 return LT.first;
4878 }
4879
 // Fixed costs keyed on (condition type, value type) for the remaining
 // fixed-vector selects.
4880 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4881 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4882 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4883 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4884 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4885 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4886 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4887 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4888 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4889 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4890 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4891 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4892
4893 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4894 EVT SelValTy = TLI->getValueType(DL, ValTy);
4895 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4896 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4897 SelCondTy.getSimpleVT(),
4898 SelValTy.getSimpleVT()))
4899 return Entry->Cost;
4900 }
4901 }
4902
4903 if (Opcode == Instruction::FCmp) {
 // Half/bfloat compares without native support are costed as a compare on
 // the promoted type; no truncation of the result is included.
4904 if (auto PromotedCost = getFP16BF16PromoteCost(
4905 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4906 // TODO: Consider costing SVE FCMPs.
4907 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4909 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4910 CostKind, Op1Info, Op2Info);
4911 if (isa<VectorType>(PromotedTy))
4913 Instruction::Trunc,
4917 return Cost;
4918 }))
4919 return *PromotedCost;
4920
4921 auto LT = getTypeLegalizationCost(ValTy);
4922 // Model unknown fp compares as a libcall.
4923 if (LT.second.getScalarType() != MVT::f64 &&
4924 LT.second.getScalarType() != MVT::f32 &&
4925 LT.second.getScalarType() != MVT::f16)
4926 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4927 {ValTy, ValTy}, CostKind);
4928
4929 // Some comparison operators require expanding to multiple compares + or.
4930 unsigned Factor = 1;
4931 if (!CondTy->isVectorTy() &&
4932 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4933 Factor = 2; // fcmp with 2 selects
4934 else if (isa<FixedVectorType>(ValTy) &&
4935 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4936 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4937 Factor = 3; // fcmxx+fcmyy+or
4938 else if (isa<ScalableVectorType>(ValTy) &&
4939 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4940 Factor = 3; // fcmxx+fcmyy+or
4941
 // Double the factor for scalable vectors when the scheduling model reports
 // the SVE compare as having lower throughput than the NEON one.
4942 if (isa<ScalableVectorType>(ValTy) &&
4944 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4945 AArch64::FCMEQv4f32))
4946 Factor *= 2;
4947
4948 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4949 }
4950
4951 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4952 // icmp(and, 0) as free, as we can make use of ands, but only if the
4953 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4954 // providing it will not cause performance regressions.
4955 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4956 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4957 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4958 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4959 if (match(I->getOperand(1), m_Zero()))
4960 return 0;
4961
4962 // x >= 1 / x < 1 -> x > 0 / x <= 0
4963 if (match(I->getOperand(1), m_One()) &&
4964 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4965 return 0;
4966
4967 // x <= -1 / x > -1 -> x > 0 / x <= 0
4968 if (match(I->getOperand(1), m_AllOnes()) &&
4969 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4970 return 0;
4971 }
4972
4973 // The base case handles scalable vectors fine for now, since it treats the
4974 // cost as 1 * legalization cost.
4975 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4976 Op1Info, Op2Info, I);
4977}
4978
// Fills in the options that control inline expansion of memcmp. With strict
// alignment the options are returned without any of the settings below.
4980AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4982 if (ST->requiresStrictAlign()) {
4983 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4984 // a bunch of instructions when strict align is enabled.
4985 return Options;
4986 }
4987 Options.AllowOverlappingLoads = true;
4988 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
 // A single block may use as many loads as the overall limit allows.
4989 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4990 // TODO: Though vector loads usually perform well on AArch64, in some targets
4991 // they may wake up the FP unit, which raises the power consumption. Perhaps
4992 // they could be used with no holds barred (-O3).
4993 Options.LoadSizes = {8, 4, 2, 1};
4994 Options.AllowedTailExpansions = {3, 5, 6};
4995 return Options;
4996}
4997
 // The answer depends solely on whether the subtarget has SVE.
4999 return ST->hasSVE();
5000}
5001
 // Route each masked memory intrinsic to its costing routine by intrinsic ID:
 // gathers/scatters to getGatherScatterOpCost, masked loads, expanding loads
 // and stores to getMaskedMemoryOpCost.
5005 switch (MICA.getID()) {
5006 case Intrinsic::masked_scatter:
5007 case Intrinsic::masked_gather:
5008 return getGatherScatterOpCost(MICA, CostKind);
5009 case Intrinsic::masked_load:
5010 case Intrinsic::masked_expandload:
5011 case Intrinsic::masked_store:
5012 return getMaskedMemoryOpCost(MICA, CostKind);
5013 }
5015}
5016
 // Costs a masked memory operation on the data type reported by MICA. The
 // cost starts at the legalization factor, is doubled for masked_expandload,
 // and is doubled again when the operation is split and the mask has to be
 // split with it.
5020 Type *Src = MICA.getDataType();
5021
5022 if (useNeonVector(Src))
5024 auto LT = getTypeLegalizationCost(Src);
5025 if (!LT.first.isValid())
5027
5028 // Return an invalid cost for element types that we are unable to lower.
5029 auto *VT = cast<VectorType>(Src);
5030 if (VT->getElementType()->isIntegerTy(1))
5032
5033 // The code-generator is currently not able to handle scalable vectors
5034 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5035 // it. This change will be removed when code-generation for these types is
5036 // sufficiently reliable.
5037 if (VT->getElementCount() == ElementCount::getScalable(1))
5039
5040 InstructionCost MemOpCost = LT.first;
5041 if (MICA.getID() == Intrinsic::masked_expandload) {
5042 if (!isLegalMaskedExpandLoad(Src, MICA.getAlignment()))
5044
5045 // The operation will be split into an expand of a masked.load.
5046 MemOpCost *= 2;
5047 }
5048
5049 // If we need to split the memory operation, we will also need to split the
5050 // mask. This will likely lead to overestimating the cost in some cases if
5051 // multiple memory operations use the same mask, but we often don't have
5052 // enough context to figure that out here.
5053 //
5054 // If the elements being loaded are bytes then the mask will already be split,
5055 // since the number of bits in a P register matches the number of bytes in a
5056 // Z register.
5057 if (LT.first > 1 && LT.second.getScalarSizeInBits() > 8)
5058 return MemOpCost * 2;
5059
5060 return MemOpCost;
5061}
5062
5063// This function returns gather/scatter overhead either from
5064// user-provided value or specialized values per-target from \p ST.
5065static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
5066 const AArch64Subtarget *ST) {
5067 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5068 "Should be called on only load or stores.");
5069 switch (Opcode) {
5070 case Instruction::Load:
5071 if (SVEGatherOverhead.getNumOccurrences() > 0)
5072 return SVEGatherOverhead;
5073 return ST->getGatherOverhead();
5074 break;
5075 case Instruction::Store:
5076 if (SVEScatterOverhead.getNumOccurrences() > 0)
5077 return SVEScatterOverhead;
5078 return ST->getScatterOverhead();
5079 break;
5080 default:
5081 llvm_unreachable("Shouldn't have reached here");
5082 }
5083}
5084
5088
 // Gather intrinsics are costed as loads; everything else reaching this
 // point is costed as a store.
5089 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
5090 MICA.getID() == Intrinsic::vp_gather)
5091 ? Instruction::Load
5092 : Instruction::Store;
5093
5094 Type *DataTy = MICA.getDataType();
5095 Align Alignment = MICA.getAlignment();
5096 const Instruction *I = MICA.getInst();
5097
5098 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
5100 auto *VT = cast<VectorType>(DataTy);
5101 auto LT = getTypeLegalizationCost(DataTy);
5102 if (!LT.first.isValid())
5104
5105 // Return an invalid cost for element types that we are unable to lower.
5106 if (!LT.second.isVector() ||
5107 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
5108 VT->getElementType()->isIntegerTy(1))
5110
5111 // The code-generator is currently not able to handle scalable vectors
5112 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5113 // it. This change will be removed when code-generation for these types is
5114 // sufficiently reliable.
5115 if (VT->getElementCount() == ElementCount::getScalable(1))
5117
 // Start from the cost of one scalar element access.
5118 ElementCount LegalVF = LT.second.getVectorElementCount();
5119 InstructionCost MemOpCost =
5120 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
5121 {TTI::OK_AnyValue, TTI::OP_None}, I);
5122 // Add on an overhead cost for using gathers/scatters.
5123 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
 // Scale by the maximum element count of the legal type and by the number of
 // legalized parts.
5124 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
5125}
5126
// True when Ty is a fixed-length vector type and the subtarget is not using
// SVE for fixed-length vectors.
// NOTE(review): the signature (listing line 5127) is missing from this copy.
5128 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
5129}
5130
5132 Align Alignment,
5133 unsigned AddressSpace,
5135 TTI::OperandValueInfo OpInfo,
5136 const Instruction *I) const {
// Cost of a single load or store (Opcode) of type Ty with the given alignment.
// NOTE(review): listing lines 5131 and 5134 of the signature are missing from
// this copy, as are several statements below (each marked where it occurs).
5137 EVT VT = TLI->getValueType(DL, Ty, true);
5138 // Type legalization can't handle structs, and load latency isn't handled here
5139 if (VT == MVT::Other ||
5140 (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency))
5141 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
5142 CostKind);
5143
5144 auto LT = getTypeLegalizationCost(Ty);
// NOTE(review): guarded statement (listing line 5146) is missing here.
5145 if (!LT.first.isValid())
5147
5148 // The code-generator is currently not able to handle scalable vectors
5149 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5150 // it. This change will be removed when code-generation for these types is
5151 // sufficiently reliable.
5152 // We also only support full register predicate loads and stores.
// NOTE(review): the end of this condition and its guarded statement (listing
// lines 5157-5158) are missing here.
5153 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5154 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
5155 (VTy->getElementType()->isIntegerTy(1) &&
5156 !VTy->getElementCount().isKnownMultipleOf(
5159
5160 // TODO: consider latency as well for TCK_SizeAndLatency.
// NOTE(review): the condition guarding this return (listing line 5161) is
// missing here.
5162 return LT.first;
5163
// NOTE(review): the condition guarding this return (listing line 5164) is
// missing here.
5165 return 1;
5166
5167 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
5168 LT.second.is128BitVector() && Alignment < Align(16)) {
5169 // Unaligned stores are extremely inefficient. We don't split all
5170 // unaligned 128-bit stores because the negative impact that has shown in
5171 // practice on inlined block copy code.
5172 // We make such stores expensive so that we will only vectorize if there
5173 // are 6 other instructions getting vectorized.
5174 const int AmortizationCost = 6;
5175
5176 return LT.first * 2 * AmortizationCost;
5177 }
5178
5179 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
5180 if (Ty->isPtrOrPtrVectorTy())
5181 return LT.first;
5182
5183 if (useNeonVector(Ty)) {
5184 // Check truncating stores and extending loads.
5185 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
5186 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
5187 if (VT == MVT::v4i8)
5188 return 2;
5189 // Otherwise we need to scalarize.
5190 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
5191 }
5192 EVT EltVT = VT.getVectorElementType();
5193 unsigned EltSize = EltVT.getScalarSizeInBits();
// The decomposition below only applies to vectors with a power-of-2 element
// size of 8..64 bits, fewer elements than fill 128 bits, and alignment of
// exactly 1; everything else costs LT.first.
5194 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
5195 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
5196 return LT.first;
5197 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
5198 // widening to v4i8, which produces suboptimal results.
5199 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
5200 return LT.first;
5201
5202 // Check non-power-of-2 loads/stores for legal vector element types with
5203 // NEON. Non-power-of-2 memory ops will get broken down to a set of
5204 // operations on smaller power-of-2 ops, including ld1/st1.
5205 LLVMContext &C = Ty->getContext();
// NOTE(review): the declaration of `Cost` (listing line 5206) is missing here.
5207 SmallVector<EVT> TypeWorklist;
5208 TypeWorklist.push_back(VT);
// Repeatedly split the type into its largest power-of-2 prefix and the
// remainder, adding 1 to Cost for every power-of-2 piece.
5209 while (!TypeWorklist.empty()) {
5210 EVT CurrVT = TypeWorklist.pop_back_val();
5211 unsigned CurrNumElements = CurrVT.getVectorNumElements();
5212 if (isPowerOf2_32(CurrNumElements)) {
5213 Cost += 1;
5214 continue;
5215 }
5216
5217 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
5218 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
5219 TypeWorklist.push_back(
5220 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
5221 }
5222 return Cost;
5223 }
5224
5225 return LT.first;
5226}
5227
5229 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
5230 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5231 bool UseMaskForCond, bool UseMaskForGaps) const {
// Cost of an interleaved access group of the given Factor over VecTy. When the
// per-member sub-vector type is legal for interleaved accesses the cost is
// Factor * number of interleaved accesses; otherwise the base implementation
// is used.
// NOTE(review): the first signature line (listing line 5228) is missing from
// this copy.
5232 assert(Factor >= 2 && "Invalid interleave factor");
5233 auto *VecVTy = cast<VectorType>(VecTy);
5234
// NOTE(review): guarded statement (listing line 5236) is missing here.
5235 if (VecTy->isScalableTy() && !ST->hasSVE())
5237
5238 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
5239 // only have lowering for power-of-2 factors.
5240 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
5241 // InterleavedAccessPass for ld3/st3
// NOTE(review): guarded statement (listing line 5243) is missing here.
5242 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
5244
5245 // Vectorization for masked interleaved accesses is only enabled for scalable
5246 // VF.
// NOTE(review): guarded statement (listing line 5248) is missing here.
5247 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
5249
5250 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5251 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
// Type of one member of the group: the element count divided by Factor.
5252 auto *SubVecTy =
5253 VectorType::get(VecVTy->getElementType(),
5254 VecVTy->getElementCount().divideCoefficientBy(Factor));
5255
5256 // ldN/stN only support legal vector types of size 64 or 128 in bits.
5257 // Accesses having vector types that are a multiple of 128 bits can be
5258 // matched to more than one ldN/stN instruction.
// UseScalable is passed by name to both TLI calls below.
5259 bool UseScalable;
5260 if (MinElts % Factor == 0 &&
5261 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
5262 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
5263 }
5264
5265 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5266 Alignment, AddressSpace, CostKind,
5267 UseMaskForCond, UseMaskForGaps);
5268}
5269
// Sums, over every 128-bit vector type in Tys, the cost of one store plus one
// load of that type (both with 128-byte alignment, address space 0).
// NOTE(review): the signature and the declaration of `Cost` (listing lines
// 5270-5273) are missing from this copy.
5274 for (auto *I : Tys) {
5275 if (!I->isVectorTy())
5276 continue;
// NOTE(review): cast<FixedVectorType> assumes every vector type in Tys is
// fixed-length - confirm callers never pass scalable vectors.
5277 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
5278 128)
5279 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
5280 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
5281 }
5282 return Cost;
5283}
5284
5286 Align Alignment) const {
// NOTE(review): the first signature line (listing line 5285) is missing from
// this copy. Alignment is not consulted by the visible body.
5287 // Neon types should be scalarised when we are not choosing to use SVE.
5288 if (useNeonVector(DataTy))
5289 return false;
5290
5291 // Return true only if we are able to lower using the SVE2p2/SME2p2
5292 // expand instruction.
// Either SVE is available with SVE2p2, or SVE/streaming-SVE is available with
// SME2p2.
5293 return (ST->isSVEAvailable() && ST->hasSVE2p2()) ||
5294 (ST->isSVEorStreamingSVEAvailable() && ST->hasSME2p2());
5295}
5296
// Scalar VFs get a fixed factor of 4; vector VFs use the subtarget's value.
// NOTE(review): the signature (listing line 5297) is missing from this copy.
5298 if (VF.isScalar())
5299 return 4;
5300 return ST->getMaxInterleaveFactor();
5301}
5302
5303// For Falkor, we want to avoid having too many strided loads in a loop since
5304// that can exhaust the HW prefetcher resources. We adjust the unroller
5305// MaxCount preference below to attempt to ensure unrolling doesn't create too
5306// many strided loads.
5307static void
// NOTE(review): the function name and parameter list (listing lines
// 5308-5309) are missing from this copy; the body uses L, SE and UP.
5310 enum { MaxStridedLoads = 7 };
// Counts loads in L whose address is loop-variant and whose SCEV is an affine
// add-recurrence. Counting stops early once the count exceeds
// MaxStridedLoads / 2, so the result is at most MaxStridedLoads / 2 + 1.
5311 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5312 int StridedLoads = 0;
5313 // FIXME? We could make this more precise by looking at the CFG and
5314 // e.g. not counting loads in each side of an if-then-else diamond.
5315 for (const auto BB : L->blocks()) {
5316 for (auto &I : *BB) {
5317 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5318 if (!LMemI)
5319 continue;
5320
// Loads from a loop-invariant address are not strided.
5321 Value *PtrValue = LMemI->getPointerOperand();
5322 if (L->isLoopInvariant(PtrValue))
5323 continue;
5324
5325 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5326 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
5327 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5328 continue;
5329
5330 // FIXME? We could take pairing of unrolled load copies into account
5331 // by looking at the AddRec, but we would probably have to limit this
5332 // to loops with no stores or other memory optimization barriers.
5333 ++StridedLoads;
5334 // We've seen enough strided loads that seeing more won't make a
5335 // difference.
5336 if (StridedLoads > MaxStridedLoads / 2)
5337 return StridedLoads;
5338 }
5339 }
5340 return StridedLoads;
5341 };
5342
5343 int StridedLoads = countStridedLoads(L, SE);
5344 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5345 << " strided loads\n");
5346 // Pick the largest power of 2 unroll count that won't result in too many
5347 // strided loads.
// UP.MaxCount is left untouched when no strided loads were found.
5348 if (StridedLoads) {
5349 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5350 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5351 << UP.MaxCount << '\n');
5352 }
5353}
5354
5355// This function returns true if the loop:
5356// 1. Has a valid cost, and
5357// 2. Has a cost within the supplied budget.
5358// Otherwise it returns false.
// On success, and only then, the accumulated cost is written to *FinalSize
// when FinalSize is non-null.
// NOTE(review): the first signature line (listing line 5359) is missing from
// this copy.
5360 InstructionCost Budget,
5361 unsigned *FinalSize) {
5362 // Estimate the size of the loop.
5363 InstructionCost LoopCost = 0;
5364
5365 for (auto *BB : L->getBlocks()) {
5366 for (auto &I : *BB) {
// Each instruction is costed with the code-size cost kind.
5367 SmallVector<const Value *, 4> Operands(I.operand_values());
5368 InstructionCost Cost =
5369 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5370 // This can happen with intrinsics that don't currently have a cost model
5371 // or for some operations that require SVE.
5372 if (!Cost.isValid())
5373 return false;
5374
// Bail out as soon as the running total exceeds the budget.
5375 LoopCost += Cost;
5376 if (LoopCost > Budget)
5377 return false;
5378 }
5379 }
5380
5381 if (FinalSize)
5382 *FinalSize = LoopCost.getValue();
5383 return true;
5384}
5385
5387 const AArch64TTIImpl &TTI) {
// Decides whether a multi-exit loop is a small search-style loop worth
// runtime unrolling.
// NOTE(review): the first signature line (listing line 5386) is missing from
// this copy.
5388 // Only consider loops with unknown trip counts for which we can determine
5389 // a symbolic expression. Multi-exit loops with small known trip counts will
5390 // likely be unrolled anyway.
5391 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
// NOTE(review): the condition guarding this return (listing line 5392) is
// missing here.
5393 return false;
5394
5395 // It might not be worth unrolling loops with low max trip counts. Restrict
5396 // this to max trip counts > 32 for now.
// A MaxTC of 0 is not rejected by this check.
5397 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5398 if (MaxTC > 0 && MaxTC <= 32)
5399 return false;
5400
5401 // Make sure the loop size is <= 5.
5402 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5403 return false;
5404
5405 // Small search loops with multiple exits can be highly beneficial to unroll.
5406 // We only care about loops made up of exactly two blocks (the check below is
5407 // on the loop's block count), although each block could jump to the same
// exit block.
5408 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5409 if (Blocks.size() != 2)
5410 return false;
5411
// NOTE(review): the body of this predicate (listing line 5413) is missing
// here; the loop is rejected if it holds for any block.
5412 if (any_of(Blocks, [](BasicBlock *BB) {
5414 }))
5415 return false;
5416
5417 return true;
5418}
5419
5420/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5421/// OOO engine's wide instruction window and various predictors.
5422static void
// NOTE(review): the function name and leading parameters (listing lines
// 5423-5424) are missing from this copy; the body uses L, SE and UP.
5425 const AArch64TTIImpl &TTI) {
5426 // Limit loops with structure that is highly likely to benefit from runtime
5427 // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
5428 // likely with complex control flow). Note that the heuristics here may be
5429 // overly conservative and we err on the side of avoiding runtime unrolling
5430 // rather than unroll excessively. They are all subject to further refinement.
5431 if (!L->isInnermost() || L->getNumBlocks() > 8)
5432 return;
5433
5434 // Loops with multiple exits are handled by common code.
5435 if (!L->getExitBlock())
5436 return;
5437
5438 // Check if the loop contains any reductions that could be parallelized when
5439 // unrolling. If so, enable partial unrolling, if the trip count is known to be
5440 // a multiple of 2.
// Only single-block loops whose size is within a budget of 12 qualify.
5441 bool HasParellelizableReductions =
5442 L->getNumBlocks() == 1 &&
5443 any_of(L->getHeader()->phis(),
5444 [&SE, L](PHINode &Phi) {
5445 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5446 }) &&
5447 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5448 if (HasParellelizableReductions &&
5449 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5450 UP.Partial = true;
5451 UP.MaxCount = 4;
5452 UP.AddAdditionalAccumulators = true;
5453 }
5454
5455 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
// NOTE(review): the start of this condition (listing line 5456) is missing
// here; the visible part rejects max trip counts in the range 1..32.
5457 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5458 SE.getSmallConstantMaxTripCount(L) <= 32))
5459 return;
5460
5461 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5462 return;
5463
// NOTE(review): the condition guarding this return (listing line 5464) is
// missing here.
5465 return;
5466
5467 // Limit to loops with trip counts that are cheap to expand.
5468 UP.SCEVExpansionBudget = 1;
5469
5470 if (HasParellelizableReductions) {
5471 UP.Runtime = true;
// NOTE(review): one statement (listing line 5472) is missing here.
5473 UP.AddAdditionalAccumulators = true;
5474 }
5475
5476 // Try to unroll small loops, of few-blocks with low budget, if they have
5477 // load/store dependencies, to expose more parallel memory access streams,
5478 // or if they do little work inside a block (i.e. load -> X -> store pattern).
5479 BasicBlock *Header = L->getHeader();
5480 BasicBlock *Latch = L->getLoopLatch();
// Single-block loops (header is also the latch) are handled entirely inside
// this branch; every path through it returns.
5481 if (Header == Latch) {
5482 // Estimate the size of the loop.
5483 unsigned Size;
5484 unsigned Width = 10;
5485 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5486 return;
5487
5488 // Try to find an unroll count that maximizes the use of the instruction
5489 // window, i.e. trying to fetch as many instructions per cycle as possible.
// Candidates are UC = 1..8 with UC * Size <= 48. A candidate replaces the
// current best when its size is a multiple of MaxInstsPerLine or leaves a
// larger remainder modulo MaxInstsPerLine than the current best.
5490 unsigned MaxInstsPerLine = 16;
5491 unsigned UC = 1;
5492 unsigned BestUC = 1;
5493 unsigned SizeWithBestUC = BestUC * Size;
5494 while (UC <= 8) {
5495 unsigned SizeWithUC = UC * Size;
5496 if (SizeWithUC > 48)
5497 break;
5498 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5499 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5500 BestUC = UC;
5501 SizeWithBestUC = BestUC * Size;
5502 }
5503 UC++;
5504 }
5505
5506 if (BestUC == 1)
5507 return;
5508
// Collect loop-variant loads (plus their in-loop direct users) and the
// loop-variant stores.
5509 SmallPtrSet<Value *, 8> LoadedValuesPlus;
// NOTE(review): the declaration of `Stores` (listing line 5510) is missing
// here.
5511 for (auto *BB : L->blocks()) {
5512 for (auto &I : *BB) {
// NOTE(review): the definition of `Ptr` (listing line 5513) is missing here.
5514 if (!Ptr)
5515 continue;
5516 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5517 if (SE.isLoopInvariant(PtrSCEV, L))
5518 continue;
5519 if (isa<LoadInst>(&I)) {
5520 LoadedValuesPlus.insert(&I);
5521 // Include in-loop 1st users of loaded values.
5522 for (auto *U : I.users())
5523 if (L->contains(cast<Instruction>(U)))
5524 LoadedValuesPlus.insert(U);
5525 } else
5526 Stores.push_back(cast<StoreInst>(&I));
5527 }
5528 }
5529
// Give up unless at least one store writes a value from LoadedValuesPlus.
5530 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5531 return LoadedValuesPlus.contains(SI->getOperand(0));
5532 }))
5533 return;
5534
5535 UP.Runtime = true;
5536 UP.DefaultUnrollRuntimeCount = BestUC;
5537 return;
5538 }
5539
5540 // Try to runtime-unroll loops with early-continues depending on loop-varying
5541 // loads; this helps with branch-prediction for the early-continues.
5542 auto *Term = dyn_cast<CondBrInst>(Header->getTerminator());
// NOTE(review): the definition of `Preds` (listing line 5543) is missing here.
5544 if (!Term || Preds.size() == 1 || !llvm::is_contained(Preds, Header) ||
5545 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5546 return;
5547
// Recursively walks the operands of I, to a depth of at most 8, looking for a
// load. PHI nodes and loop-invariant values end the walk without a match.
5548 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5549 [&](Instruction *I, unsigned Depth) -> bool {
5550 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5551 return false;
5552
5553 if (isa<LoadInst>(I))
5554 return true;
5555
5556 return any_of(I->operands(), [&](Value *V) {
5557 auto *I = dyn_cast<Instruction>(V);
5558 return I && DependsOnLoopLoad(I, Depth + 1);
5559 });
5560 };
5561 CmpPredicate Pred;
5562 Instruction *I;
// Enable runtime unrolling when the header's branch is on an icmp whose first
// operand is an instruction that depends on such a load.
5563 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5564 m_Value())) &&
5565 DependsOnLoopLoad(I, 0)) {
5566 UP.Runtime = true;
5567 }
5568}
5569
5572 OptimizationRemarkEmitter *ORE) const {
// Fills in UP with AArch64-specific unrolling preferences for loop L, starting
// from the base-class defaults.
// NOTE(review): the leading signature lines (listing lines 5570-5571) and
// several statements below are missing from this copy (each marked).
5573 // Enable partial unrolling and runtime unrolling.
5574 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5575
5576 UP.UpperBound = true;
5577
5578 // For inner loop, it is more likely to be a hot one, and the runtime check
5579 // can be promoted out from LICM pass, so the overhead is less, let's try
5580 // a larger threshold to unroll more loops.
5581 if (L->getLoopDepth() > 1)
5582 UP.PartialThreshold *= 2;
5583
5584 // Disable partial & runtime unrolling on -Os.
// NOTE(review): the statement this comment describes (listing line 5585) is
// missing here.
5586
5587 // Scan the loop: don't unroll loops with calls as this could prevent
5588 // inlining. Don't unroll auto-vectorized loops either, though do allow
5589 // unrolling of the scalar remainder.
5590 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
// NOTE(review): the declaration of `Cost` (listing line 5591) is missing here.
5592 for (auto *BB : L->getBlocks()) {
5593 for (auto &I : *BB) {
5594 // Both auto-vectorized loops and the scalar remainder have the
5595 // isvectorized attribute, so differentiate between them by the presence
5596 // of vector instructions.
5597 if (IsVectorized && I.getType()->isVectorTy())
5598 return;
5599 if (isa<CallBase>(I)) {
// NOTE(review): the definition of `F` (listing lines 5600-5601) is missing
// here. Calls that are not lowered to a real call are skipped; any other call
// aborts without applying further preferences.
5602 if (!isLoweredToCall(F))
5603 continue;
5604 return;
5605 }
5606
5607 SmallVector<const Value *, 4> Operands(I.operand_values());
// NOTE(review): the end of this call (listing line 5609) is missing here.
5608 Cost += getInstructionCost(&I, Operands,
5610 }
5611 }
5612
5613 // Apply subtarget-specific unrolling preferences.
5614 if (ST->isAppleMLike())
5615 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
// NOTE(review): the rest of this condition and its statement (listing lines
// 5617-5618) are missing here.
5616 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5619
5620 // If this is a small, multi-exit loop similar to something like std::find,
5621 // then there is typically a performance improvement achieved by unrolling.
5622 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5623 UP.RuntimeUnrollMultiExit = true;
5624 UP.Runtime = true;
5625 // Limit unroll count.
// NOTE(review): the statement this comment describes (listing line 5626) is
// missing here.
5627 // Allow slightly more costly trip-count expansion to catch search loops
5628 // with pointer inductions.
5629 UP.SCEVExpansionBudget = 5;
5630 return;
5631 }
5632
5633 // Enable runtime unrolling for in-order models
5634 // If mcpu is omitted, getProcFamily() is expected to return
5635 // AArch64Subtarget::Generic (the value tested below), so by checking for that
5636 // case, we can ensure that the default behaviour is unchanged
5637 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5638 !ST->getSchedModel().isOutOfOrder()) {
5639 UP.Runtime = true;
5640 UP.Partial = true;
5641 UP.UnrollRemainder = true;
// NOTE(review): one statement (listing line 5642) is missing here.
5643
5644 UP.UnrollAndJam = true;
// NOTE(review): one statement (listing line 5645) is missing here.
5646 }
5647
5648 // Force unrolling small loops can be very useful because of the branch
5649 // taken cost of the backedge.
// NOTE(review): the condition guarding this assignment (listing line 5650) is
// missing here.
5651 UP.Force = true;
5652}
5653
5658
5660 Type *ExpectedType,
5661 bool CanCreate) const {
// Returns a value of ExpectedType holding what the NEON structured load/store
// intrinsic Inst loaded or stored, or nullptr when that is not possible.
// NOTE(review): the first signature line (listing line 5659) is missing from
// this copy.
5662 switch (Inst->getIntrinsicID()) {
5663 default:
5664 return nullptr;
5665 case Intrinsic::aarch64_neon_st1x2:
5666 case Intrinsic::aarch64_neon_st1x3:
5667 case Intrinsic::aarch64_neon_st1x4:
5668 case Intrinsic::aarch64_neon_st2:
5669 case Intrinsic::aarch64_neon_st3:
5670 case Intrinsic::aarch64_neon_st4: {
5671 // Create a struct type
// Stores: only handled when creation is allowed and ExpectedType is a struct
// whose element count and element types match the stored operands.
5672 StructType *ST = dyn_cast<StructType>(ExpectedType);
5673 if (!CanCreate || !ST)
5674 return nullptr;
// All arguments except the last one are the stored values.
5675 unsigned NumElts = Inst->arg_size() - 1;
5676 if (ST->getNumElements() != NumElts)
5677 return nullptr;
5678 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5679 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5680 return nullptr;
5681 }
// Build the aggregate with insertvalue instructions emitted before Inst,
// starting from poison.
5682 Value *Res = PoisonValue::get(ExpectedType);
5683 IRBuilder<> Builder(Inst);
5684 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5685 Value *L = Inst->getArgOperand(i);
5686 Res = Builder.CreateInsertValue(Res, L, i);
5687 }
5688 return Res;
5689 }
5690 case Intrinsic::aarch64_neon_ld1x2:
5691 case Intrinsic::aarch64_neon_ld1x3:
5692 case Intrinsic::aarch64_neon_ld1x4:
5693 case Intrinsic::aarch64_neon_ld2:
5694 case Intrinsic::aarch64_neon_ld3:
5695 case Intrinsic::aarch64_neon_ld4:
// Loads: the intrinsic itself is the result when its type already matches.
5696 if (Inst->getType() == ExpectedType)
5697 return Inst;
5698 return nullptr;
5699 }
5700}
5701
5703 MemIntrinsicInfo &Info) const {
// Describes a NEON ldN/ld1xN/stN/st1xN intrinsic in Info (read/write flags,
// pointer operand, matching id). Returns false for any other intrinsic.
// NOTE(review): the first signature line (listing line 5702) is missing from
// this copy.
5704 switch (Inst->getIntrinsicID()) {
5705 default:
5706 break;
5707 case Intrinsic::aarch64_neon_ld1x2:
5708 case Intrinsic::aarch64_neon_ld1x3:
5709 case Intrinsic::aarch64_neon_ld1x4:
5710 case Intrinsic::aarch64_neon_ld2:
5711 case Intrinsic::aarch64_neon_ld3:
5712 case Intrinsic::aarch64_neon_ld4:
// Loads take the pointer as their first argument.
5713 Info.ReadMem = true;
5714 Info.WriteMem = false;
5715 Info.PtrVal = Inst->getArgOperand(0);
5716 break;
5717 case Intrinsic::aarch64_neon_st1x2:
5718 case Intrinsic::aarch64_neon_st1x3:
5719 case Intrinsic::aarch64_neon_st1x4:
5720 case Intrinsic::aarch64_neon_st2:
5721 case Intrinsic::aarch64_neon_st3:
5722 case Intrinsic::aarch64_neon_st4:
// Stores take the pointer as their last argument.
5723 Info.ReadMem = false;
5724 Info.WriteMem = true;
5725 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5726 break;
5727 }
5728
5729 // Use the ID of neon load as the "matching id".
// Each store is paired with the load of the same shape.
5730 switch (Inst->getIntrinsicID()) {
5731 default:
5732 return false;
5733 case Intrinsic::aarch64_neon_ld1x2:
5734 case Intrinsic::aarch64_neon_st1x2:
5735 Info.MatchingId = Intrinsic::aarch64_neon_ld1x2;
5736 break;
5737 case Intrinsic::aarch64_neon_ld1x3:
5738 case Intrinsic::aarch64_neon_st1x3:
5739 Info.MatchingId = Intrinsic::aarch64_neon_ld1x3;
5740 break;
5741 case Intrinsic::aarch64_neon_ld1x4:
5742 case Intrinsic::aarch64_neon_st1x4:
5743 Info.MatchingId = Intrinsic::aarch64_neon_ld1x4;
5744 break;
5745 case Intrinsic::aarch64_neon_ld2:
5746 case Intrinsic::aarch64_neon_st2:
5747 Info.MatchingId = Intrinsic::aarch64_neon_ld2;
5748 break;
5749 case Intrinsic::aarch64_neon_ld3:
5750 case Intrinsic::aarch64_neon_st3:
5751 Info.MatchingId = Intrinsic::aarch64_neon_ld3;
5752 break;
5753 case Intrinsic::aarch64_neon_ld4:
5754 case Intrinsic::aarch64_neon_st4:
5755 Info.MatchingId = Intrinsic::aarch64_neon_ld4;
5756 break;
5757 }
5758 return true;
5759}
5760
5761/// See if \p I should be considered for address type promotion. We check if \p
5762/// I is a sext with the right type and used in memory accesses. If it is used
5763/// in a "complex" getelementptr, we allow it to be promoted without finding
5764/// other sext instructions that sign extended the same initial value. A
5765/// getelementptr is considered as "complex" if it has more than 2 operands.
// NOTE(review): the first signature line (listing line 5766) is missing from
// this copy.
5767 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5768 bool Considerable = false;
// The out-parameter is always reset, even on the early-return paths.
5769 AllowPromotionWithoutCommonHeader = false;
5770 if (!isa<SExtInst>(&I))
5771 return false;
// Only sign extensions producing i64 are considered.
5772 Type *ConsideredSExtType =
5773 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5774 if (I.getType() != ConsideredSExtType)
5775 return false;
5776 // See if the sext is the one with the right type and used in at least one
5777 // GetElementPtrInst.
5778 for (const User *U : I.users()) {
5779 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5780 Considerable = true;
5781 // A getelementptr is considered as "complex" if it has more than 2
5782 // operands. We will promote a SExt used in such complex GEP as we
5783 // expect some computation to be merged if they are done on 64 bits.
5784 if (GEPInst->getNumOperands() > 2) {
5785 AllowPromotionWithoutCommonHeader = true;
5786 break;
5787 }
5788 }
5789 }
5790 return Considerable;
5791}
5792
5794 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
// Fixed-width VFs are always accepted. For scalable VFs the recurrence type
// must be a legal scalable-vector element type other than bfloat, and the
// recurrence kind must be one of those listed below.
// NOTE(review): the first signature line (listing line 5793) is missing from
// this copy.
5795 if (!VF.isScalable())
5796 return true;
5797
5798 Type *Ty = RdxDesc.getRecurrenceType();
5799 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5800 return false;
5801
5802 switch (RdxDesc.getRecurrenceKind()) {
5803 case RecurKind::Sub:
5804 case RecurKind::FSub:
// NOTE(review): listing lines 5805-5806 are missing here.
5807 case RecurKind::Add:
5808 case RecurKind::FAdd:
5809 case RecurKind::And:
5810 case RecurKind::Or:
5811 case RecurKind::Xor:
5812 case RecurKind::SMin:
5813 case RecurKind::SMax:
5814 case RecurKind::UMin:
5815 case RecurKind::UMax:
5816 case RecurKind::FMin:
5817 case RecurKind::FMax:
5818 case RecurKind::FMulAdd:
5819 case RecurKind::AnyOf:
// NOTE(review): listing line 5820 is missing here.
5821 return true;
5822 default:
5823 return false;
5824 }
5825}
5826
// NOTE(review): most of the signature (listing lines 5827-5828 and 5830) is
// missing from this copy; the body uses IID, Ty, FMF and CostKind.
5829 FastMathFlags FMF,
5831 // The code-generator is currently not able to handle scalable vectors
5832 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5833 // it. This change will be removed when code-generation for these types is
5834 // sufficiently reliable.
// NOTE(review): the return itself (listing line 5837) is missing here.
5835 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5836 if (VTy->getElementCount() == ElementCount::getScalable(1))
5838
5839 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5840
// f16 reductions without full FP16 support use the base implementation.
5841 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5842 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5843
// When the type is split into LT.first legal parts, LT.first - 1 min/max
// operations on the legal type are charged to combine them.
5844 InstructionCost LegalizationCost = 0;
5845 if (LT.first > 1) {
5846 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5847 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5848 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5849 }
5850
5851 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5852}
5853
5855 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
// Cost of an arithmetic reduction over ValTy as used for scalable vectors.
// NOTE(review): the first signature line (listing line 5854) is missing from
// this copy.
5856 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
// When the type is split into LT.first legal parts, LT.first - 1 arithmetic
// operations on the legal type are charged to combine them.
5857 InstructionCost LegalizationCost = 0;
5858 if (LT.first > 1) {
5859 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5860 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5861 LegalizationCost *= LT.first - 1;
5862 }
5863
5864 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5865 assert(ISD && "Invalid opcode");
5866 // Add the final reduction cost for the legal horizontal reduction
5867 switch (ISD) {
5868 case ISD::ADD:
5869 case ISD::AND:
5870 case ISD::OR:
5871 case ISD::XOR:
5872 case ISD::FADD:
5873 return LegalizationCost + 2;
5874 default:
// NOTE(review): the statement for the default case (listing line 5875) is
// missing here.
5876 }
5877}
5878
// NOTE(review): most of the signature (listing lines 5879-5880 and 5882) is
// missing from this copy; the body uses Opcode, ValTy, FMF and CostKind.
5881 std::optional<FastMathFlags> FMF,
5883 // The code-generator is currently not able to handle scalable vectors
5884 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5885 // it. This change will be removed when code-generation for these types is
5886 // sufficiently reliable.
// NOTE(review): the return itself (listing line 5889) is missing here.
5887 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5888 if (VTy->getElementCount() == ElementCount::getScalable(1))
5890
// NOTE(review): the condition opening this region (listing line 5891) is
// missing here; its closing brace is at listing line 5908.
5892 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5893 InstructionCost BaseCost =
5894 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5895 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5896 // end up vectorizing for more computationally intensive loops.
5897 return BaseCost + FixedVTy->getNumElements();
5898 }
5899
// NOTE(review): guarded statement (listing line 5901) is missing here.
5900 if (Opcode != Instruction::FAdd || ValTy->getElementType()->isBFloatTy())
5902
// Scalable case in this region: scalar op cost * getMaxNumElements.
5903 auto *VTy = cast<ScalableVectorType>(ValTy);
// NOTE(review): the start of this declaration (listing line 5904) is missing
// here.
5905 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5906 Cost *= getMaxNumElements(VTy->getElementCount());
5907 return Cost;
5908 }
5909
5910 if (isa<ScalableVectorType>(ValTy))
5911 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5912
// From here on ValTy is a fixed-length vector.
5913 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5914 MVT MTy = LT.second;
5915 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5916 assert(ISD && "Invalid opcode");
5917
5918 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5919 // instructions as twice a normal vector add, plus 1 for each legalization
5920 // step (LT.first). This is the only arithmetic vector reduction operation for
5921 // which we have an instruction.
5922 // OR, XOR and AND costs should match the codegen from:
5923 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5924 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5925 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5926 static const CostTblEntry CostTblNoPairwise[]{
5927 {ISD::ADD, MVT::v8i8, 2},
5928 {ISD::ADD, MVT::v16i8, 2},
5929 {ISD::ADD, MVT::v4i16, 2},
5930 {ISD::ADD, MVT::v8i16, 2},
5931 {ISD::ADD, MVT::v2i32, 2},
5932 {ISD::ADD, MVT::v4i32, 2},
5933 {ISD::ADD, MVT::v2i64, 2},
5934 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5935 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5936 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5937 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5938 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5939 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5940 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5941 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5942 {ISD::XOR, MVT::v16i8, 7},
5943 {ISD::XOR, MVT::v4i16, 4},
5944 {ISD::XOR, MVT::v8i16, 6},
5945 {ISD::XOR, MVT::v2i32, 3},
5946 {ISD::XOR, MVT::v4i32, 5},
5947 {ISD::XOR, MVT::v2i64, 3},
5948 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5949 {ISD::AND, MVT::v16i8, 7},
5950 {ISD::AND, MVT::v4i16, 4},
5951 {ISD::AND, MVT::v8i16, 6},
5952 {ISD::AND, MVT::v2i32, 3},
5953 {ISD::AND, MVT::v4i32, 5},
5954 {ISD::AND, MVT::v2i64, 3},
5955 };
// Every case that does not return falls through to the base implementation
// at the end of the function.
5956 switch (ISD) {
5957 default:
5958 break;
5959 case ISD::FADD:
5960 if (Type *EltTy = ValTy->getScalarType();
5961 // FIXME: For half types without fullfp16 support, this could extend and
5962 // use a fp32 faddp reduction but current codegen unrolls.
5963 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5964 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5965 const unsigned NElts = MTy.getVectorNumElements();
5966 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5967 isPowerOf2_32(NElts))
5968 // Reduction corresponding to series of fadd instructions is lowered to
5969 // series of faddp instructions. faddp has latency/throughput that
5970 // matches fadd instruction and hence, every faddp instruction can be
5971 // considered to have a relative cost = 1 with
5972 // CostKind = TCK_RecipThroughput.
5973 // An faddp will pairwise add vector elements, so the size of input
5974 // vector reduces by half every time, requiring
5975 // #(faddp instructions) = log2_32(NElts).
5976 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5977 }
5978 break;
5979 case ISD::ADD:
5980 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5981 return (LT.first - 1) + Entry->Cost;
5982 break;
5983 case ISD::XOR:
5984 case ISD::AND:
5985 case ISD::OR:
5986 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5987 if (!Entry)
5988 break;
// The table cost is only used when the source vector has a power-of-2
// element count that is at least the legal type's element count.
5989 auto *ValVTy = cast<FixedVectorType>(ValTy);
5990 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5991 isPowerOf2_32(ValVTy->getNumElements())) {
5992 InstructionCost ExtraCost = 0;
5993 if (LT.first != 1) {
5994 // Type needs to be split, so there is an extra cost of LT.first - 1
5995 // arithmetic ops.
5996 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5997 MTy.getVectorNumElements());
5998 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5999 ExtraCost *= LT.first - 1;
6000 }
6001 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
6002 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
6003 return Cost + ExtraCost;
6004 }
6005 break;
6006 }
6007 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
6008}
6009
6011 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
6012 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
// Cost of a reduction whose input is extended from VecTy's element type to
// ResTy. Only add reductions over simple types of at least 64 bits get a
// target-specific cost; everything else uses the base implementation.
// NOTE(review): the first signature line (listing line 6010) is missing from
// this copy.
6013 EVT VecVT = TLI->getValueType(DL, VecTy);
6014 EVT ResVT = TLI->getValueType(DL, ResTy);
6015
6016 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
6017 VecVT.getSizeInBits() >= 64) {
6018 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
6019
6020 // The legal cases are:
6021 // UADDLV 8/16/32->32
6022 // UADDLP 32->64
// RevVTSize holds the bit width of the result type.
6023 unsigned RevVTSize = ResVT.getSizeInBits();
6024 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
6025 RevVTSize <= 32) ||
6026 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
6027 RevVTSize <= 32) ||
6028 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
6029 RevVTSize <= 64))
// 2 for the reduction plus 2 for every additional legalized part.
6030 return (LT.first - 1) * 2 + 2;
6031 }
6032
6033 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
6034 CostKind);
6035}
6036
6038AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
6039 Type *ResTy, VectorType *VecTy,
6041 EVT VecVT = TLI->getValueType(DL, VecTy);
6042 EVT ResVT = TLI->getValueType(DL, ResTy);
6043
6044 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
6045 RedOpcode == Instruction::Add) {
6046 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
6047
6048 // The legal cases with dotprod are
6049 // UDOT 8->32
6050 // Which requires an additional uaddv to sum the i32 values.
6051 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
6052 ResVT == MVT::i32)
6053 return LT.first + 2;
6054 }
6055
6056 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
6057 CostKind);
6058}
6059
6063 static const CostTblEntry ShuffleTbl[] = {
6064 { TTI::SK_Splice, MVT::nxv16i8, 1 },
6065 { TTI::SK_Splice, MVT::nxv8i16, 1 },
6066 { TTI::SK_Splice, MVT::nxv4i32, 1 },
6067 { TTI::SK_Splice, MVT::nxv2i64, 1 },
6068 { TTI::SK_Splice, MVT::nxv2f16, 1 },
6069 { TTI::SK_Splice, MVT::nxv4f16, 1 },
6070 { TTI::SK_Splice, MVT::nxv8f16, 1 },
6071 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
6072 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
6073 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
6074 { TTI::SK_Splice, MVT::nxv2f32, 1 },
6075 { TTI::SK_Splice, MVT::nxv4f32, 1 },
6076 { TTI::SK_Splice, MVT::nxv2f64, 1 },
6077 };
6078
6079 // The code-generator is currently not able to handle scalable vectors
6080 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
6081 // it. This change will be removed when code-generation for these types is
6082 // sufficiently reliable.
6085
6086 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
6087 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
6088 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
6089 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
6090 : LT.second;
6091 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
6092 InstructionCost LegalizationCost = 0;
6093 if (Index < 0) {
6094 LegalizationCost =
6095 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
6097 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
6099 }
6100
6101 // Splices of predicate (i1) vectors are promoted when lowering; see
6102 // AArch64ISelLowering.cpp. The cost is therefore computed on the promoted type.
6103 if (LT.second.getScalarType() == MVT::i1) {
6104 LegalizationCost +=
6105 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
6107 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
6109 }
6110 const auto *Entry =
6111 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
6112 assert(Entry && "Illegal Type for Splice");
6113 LegalizationCost += Entry->Cost;
6114 return LegalizationCost * LT.first;
6115}
6116
6118 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
6120 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
6121 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
6123
6125 return Invalid;
6126
6127 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
6128 Opcode != Instruction::FAdd && Opcode != Instruction::FSub) ||
6129 OpAExtend == TTI::PR_None)
6130 return Invalid;
6131
6132 // Floating-point partial reductions are invalid if `reassoc` and `contract`
6133 // are not allowed.
6134 if (AccumType->isFloatingPointTy()) {
6135 assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
6136 if (!FMF->allowReassoc() || !FMF->allowContract())
6137 return Invalid;
6138 } else {
6139 assert(!FMF &&
6140 "FastMathFlags only apply to floating-point partial reductions");
6141 }
6142
6143 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
6144 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
6145 "Unexpected values for OpBExtend or InputTypeB");
6146
6147 // We only support multiply binary operations for now, and for muls we
6148 // require the types being extended to be the same.
6149 if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
6150 InputTypeA != InputTypeB))
6151 return Invalid;
6152
6153 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
6154 // USDot is natively supported with +i8mm. With plain +dotprod, SUMLA is
6155 // lowered to two udots plus an eor and a sub.
6156 if (IsUSDot && !ST->hasMatMulInt8() && !ST->hasDotProd())
6157 // FIXME: Remove this early bailout in favour of expand cost.
6158 return Invalid;
6159
6160 unsigned Ratio =
6161 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
6162 if (VF.getKnownMinValue() <= Ratio)
6163 return Invalid;
6164
6165 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
6166 VectorType *AccumVectorType =
6167 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
6168 // We don't yet support all kinds of legalization.
6169 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
6170 EVT::getEVT(AccumVectorType));
6171 switch (TC.first) {
6172 default:
6173 return Invalid;
6177 // The legalised type (e.g. after splitting) must be legal too.
6178 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
6180 return Invalid;
6181 break;
6182 }
6183
6184 std::pair<InstructionCost, MVT> AccumLT =
6185 getTypeLegalizationCost(AccumVectorType);
6186 std::pair<InstructionCost, MVT> InputLT =
6187 getTypeLegalizationCost(InputVectorType);
6188
6189 // Returns true if the subtarget supports the operation for a given type.
6190 auto IsSupported = [&](bool SVEPred, bool NEONPred) -> bool {
6191 return (ST->isSVEorStreamingSVEAvailable() && SVEPred) ||
6192 (AccumLT.second.isFixedLengthVector() &&
6193 AccumLT.second.getSizeInBits() <= 128 && ST->isNeonAvailable() &&
6194 NEONPred);
6195 };
6196
6197 bool IsSub = Opcode == Instruction::Sub || Opcode == Instruction::FSub;
6198 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
6199 // Integer partial sub-reductions that don't map to a specific instruction,
6200 // carry an extra cost for implementing a double negation:
6201 // partial_reduce_umls acc, lhs, rhs
6202 // <=> -partial_reduce_umla -acc, lhs, rhs
6203 InstructionCost INegCost = IsSub ? 2 * InputLT.first * TTI::TCC_Basic : 0;
6204
6205 if (AccumLT.second.getScalarType() == MVT::i32 &&
6206 InputLT.second.getScalarType() == MVT::i8) {
6207 // i8 -> i32 is natively supported with udot/sdot for both NEON and SVE.
6208 if (!IsUSDot && IsSupported(true, ST->hasDotProd()))
6209 return Cost + INegCost;
6210 // i8 -> i32 usdot requires +i8mm
6211 if (IsUSDot && IsSupported(ST->hasMatMulInt8(), ST->hasMatMulInt8()))
6212 return Cost + INegCost;
6213 // Without +i8mm, lower SUMLA via two udots plus an eor and a sub on plain
6214 // +dotprod targets. Note that this is only implemented for NEON, as all
6215 // modern CPUs with SVE also have +i8mm. Charge an extra factor for the
6216 // expansion.
6217 if (IsUSDot && IsSupported(false, ST->hasDotProd()))
6218 return Cost * 3 + INegCost;
6219 }
6220
6221 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
6222 // i16 -> i64 is natively supported for udot/sdot
6223 if (AccumLT.second.getScalarType() == MVT::i64 &&
6224 InputLT.second.getScalarType() == MVT::i16)
6225 return Cost + INegCost;
6226 // i16 -> i32 is natively supported with SVE2p1 udot/sdot.
6227 // For sub-reductions, we prefer using the *mlslb/t instructions.
6228 if (AccumLT.second.getScalarType() == MVT::i32 &&
6229 InputLT.second.getScalarType() == MVT::i16 &&
6230 (ST->hasSVE2p1() || ST->hasSME2()) && !IsSub)
6231 return Cost;
6232 // i8 -> i64 is supported with an extra level of extends
6233 if (AccumLT.second.getScalarType() == MVT::i64 &&
6234 InputLT.second.getScalarType() == MVT::i8)
6235 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
6236 // because it requires two extra extends on the inputs. But if we'd change
6237 // that now, a regular reduction would be cheaper because the costs of
6238 // the extends in the IR are still counted. This can be fixed
6239 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
6240 return Cost + INegCost;
6241 // i8 -> i16 is natively supported with SVE2p3 udot/sdot
6242 // For sub-reductions, we prefer using the *mlslb/t instructions.
6243 if (AccumLT.second.getScalarType() == MVT::i16 &&
6244 InputLT.second.getScalarType() == MVT::i8 &&
6245 (ST->hasSVE2p3() || ST->hasSME2p3()) && !IsSub)
6246 return Cost;
6247 }
6248
6249 // f16 -> f32 is natively supported for fdot using either
6250 // SVE or NEON instructions.
6251 if (Opcode == Instruction::FAdd && !IsSub &&
6252 IsSupported(ST->hasSME2() || ST->hasSVE2p1(), ST->hasF16F32DOT()) &&
6253 AccumLT.second.getScalarType() == MVT::f32 &&
6254 InputLT.second.getScalarType() == MVT::f16)
6255 return Cost;
6256
6257 // For a ratio of 2, we can use *mlal and *mlsl top/bottom instructions.
6258 if (Ratio == 2 && !IsUSDot) {
6259 MVT InVT = InputLT.second.getScalarType();
6260
6261 // SVE2 [us]ml[as]lb/t and NEON [us]ml[as]l(2)
6262 if (IsSupported(ST->hasSVE2() || ST->hasSME(), true) &&
6263 llvm::is_contained({MVT::i8, MVT::i16, MVT::i32}, InVT.SimpleTy))
6264 return Cost * 2;
6265
6266 // SVE2 fml[as]lb/t and NEON fml[as]l(2)
6267 if (IsSupported(ST->hasSVE2(), ST->hasFP16FML()) && InVT == MVT::f16)
6268 return Cost * 2;
6269
6270 // SME2/SVE2p1 bfmlslb/t
6271 if (IsSupported(ST->hasSVE2p1() || ST->hasSME2(), false) &&
6272 InVT == MVT::bf16 && IsSub)
6273 return Cost * 2;
6274
6275 // FP partial sub-reductions that don't map to a specific instruction,
6276 // carry an extra cost for implementing an extra negation:
6277 // partial_reduce_fmls acc, lhs, rhs
6278 // <=> partial_reduce_fmla acc, lhs, -rhs
6279 InstructionCost FNegCost = IsSub ? InputLT.first * TTI::TCC_Basic : 0;
6280
6281 // SVE and NEON bfmlalb/t
6282 if (IsSupported(ST->hasBF16(), ST->hasBF16()) && InVT == MVT::bf16)
6283 return Cost * 2 + FNegCost;
6284 }
6285
6286 return BaseT::getPartialReductionCost(Opcode, InputTypeA, InputTypeB,
6287 AccumType, VF, OpAExtend, OpBExtend,
6288 BinOp, CostKind, FMF);
6289}
6290
6293 VectorType *SrcTy, ArrayRef<int> Mask,
6294 TTI::TargetCostKind CostKind, int Index,
6296 const Instruction *CxtI) const {
6297 assert((Mask.empty() || DstTy->isScalableTy() ||
6298 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
6299 "Expected the Mask to match the return size if given");
6300 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
6301 "Expected the same scalar types");
6302 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
6303
6304 // If we have a Mask, and the LT is being legalized somehow, split the Mask
6305 // into smaller vectors and sum the cost of each shuffle.
6306 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
6307 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
6308 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
6309 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
6310 // Check for LD3/LD4 instructions, which are represented in llvm IR as
6311 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
6312 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
6313 // cost than just the load.
6314 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
6317 return std::max<InstructionCost>(1, LT.first / 4);
6318
6319 // Check for ST3/ST4 instructions, which are represented in llvm IR as
6320 // store(interleaving-shuffle). The shuffle cost could potentially be free,
6321 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
6322 // cost than just the store.
6323 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
6325 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
6327 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
6328 return LT.first;
6329
6330 unsigned TpNumElts = Mask.size();
6331 unsigned LTNumElts = LT.second.getVectorNumElements();
6332 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
6333 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
6334 LT.second.getVectorElementCount());
6336 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
6337 PreviousCosts;
6338 for (unsigned N = 0; N < NumVecs; N++) {
6339 SmallVector<int> NMask;
6340 // Split the existing mask into chunks of size LTNumElts. Track the source
6341 // sub-vectors to ensure the result has at most 2 inputs.
6342 unsigned Source1 = -1U, Source2 = -1U;
6343 unsigned NumSources = 0;
6344 for (unsigned E = 0; E < LTNumElts; E++) {
6345 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
6347 if (MaskElt < 0) {
6349 continue;
6350 }
6351
6352 // Calculate which source from the input this comes from and whether it
6353 // is new to us.
6354 unsigned Source = MaskElt / LTNumElts;
6355 if (NumSources == 0) {
6356 Source1 = Source;
6357 NumSources = 1;
6358 } else if (NumSources == 1 && Source != Source1) {
6359 Source2 = Source;
6360 NumSources = 2;
6361 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6362 NumSources++;
6363 }
6364
6365 // Add to the new mask. For the NumSources>2 case these are not correct,
6366 // but are only used for the modular lane number.
6367 if (Source == Source1)
6368 NMask.push_back(MaskElt % LTNumElts);
6369 else if (Source == Source2)
6370 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
6371 else
6372 NMask.push_back(MaskElt % LTNumElts);
6373 }
6374 // Check if we have already generated this sub-shuffle, which means we
6375 // will have already generated the output. For example a <16 x i32> splat
6376 // will be the same sub-splat 4 times, which only needs to be generated
6377 // once and reused.
6378 auto Result =
6379 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6380 // Check if it was already in the map (already costed).
6381 if (!Result.second)
6382 continue;
6383 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6384 // getShuffleCost. If not then cost it using the worst case as the number
6385 // of element moves into a new vector.
6386 InstructionCost NCost =
6387 NumSources <= 2
6388 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6390 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6391 CxtI)
6392 : LTNumElts;
6393 Result.first->second = NCost;
6394 Cost += NCost;
6395 }
6396 return Cost;
6397 }
6398
6399 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6400 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6401 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6402 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6403 // This currently only handles low or high extracts to prevent SLP vectorizer
6404 // regressions.
6405 // Note that SVE's ext instruction is destructive, but it can be fused with
6406 // a movprfx to act like a constructive instruction.
6407 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6408 if (LT.second.getFixedSizeInBits() >= 128 &&
6409 cast<FixedVectorType>(SubTp)->getNumElements() ==
6410 LT.second.getVectorNumElements() / 2) {
6411 if (Index == 0)
6412 return 0;
6413 if (Index == (int)LT.second.getVectorNumElements() / 2)
6414 return 1;
6415 }
6417 }
6418 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6419 // the code to handle length-changing shuffles.
6420 if (Kind == TTI::SK_InsertSubvector) {
6421 LT = getTypeLegalizationCost(DstTy);
6422 SrcTy = DstTy;
6423 }
6424
6425 // Check for identity masks, which we can treat as free for both fixed and
6426 // scalable vector paths.
6427 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6428 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6429 all_of(enumerate(Mask), [](const auto &M) {
6430 return M.value() < 0 || M.value() == (int)M.index();
6431 }))
6432 return 0;
6433
6434 // Segmented shuffle matching.
6435 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6436 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6437 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6439
6441 unsigned Segments =
6443 unsigned SegmentElts = VTy->getNumElements() / Segments;
6444
6445 // dupq zd.t, zn.t[idx]
6446 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6447 ST->isSVEorStreamingSVEAvailable() &&
6448 isDUPQMask(Mask, Segments, SegmentElts))
6449 return LT.first;
6450
6451 // mov zd.q, vn
6452 if (ST->isSVEorStreamingSVEAvailable() &&
6453 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6454 return LT.first;
6455 }
6456
6457 // Check for broadcast loads, which are supported by the LD1R instruction.
6458 // In terms of code-size, the shuffle vector is free when a load + dup get
6459 // folded into a LD1R. That's what we check and return here. For performance
6460 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6461 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6462 // that we model the load + dup sequence slightly higher because LD1R is a
6463 // high latency instruction.
6464 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6465 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6466 if (IsLoad && LT.second.isVector() &&
6467 isLegalBroadcastLoad(SrcTy->getElementType(),
6468 LT.second.getVectorElementCount()))
6469 return 0;
6470 }
6471
6472 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6473 // from the perfect shuffle tables.
6474 if (Mask.size() == 4 &&
6475 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6476 (SrcTy->getScalarSizeInBits() == 16 ||
6477 SrcTy->getScalarSizeInBits() == 32) &&
6478 all_of(Mask, [](int E) { return E < 8; }))
6479 return getPerfectShuffleCost(Mask);
6480
6481 // Check for other shuffles that are not SK_ kinds but we have native
6482 // instructions for, for example ZIP and UZP.
6483 unsigned Unused;
6484 if (LT.second.isFixedLengthVector() &&
6485 LT.second.getVectorNumElements() == Mask.size() &&
6486 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
6487 // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
6488 // mean that we can end up with shuffles that satisfy isTRNMask, but end
6489 // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
6490 Kind == TTI::SK_InsertSubvector) &&
6491 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6492 isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6493 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6494 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6495 LT.second.getVectorNumElements(), 16) ||
6496 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6497 LT.second.getVectorNumElements(), 32) ||
6498 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6499 LT.second.getVectorNumElements(), 64) ||
6500 // Check for non-zero lane splats
6501 all_of(drop_begin(Mask),
6502 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6503 return 1;
6504
6505 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6506 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6507 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6508 static const CostTblEntry ShuffleTbl[] = {
6509 // Broadcast shuffle kinds can be performed with 'dup'.
6510 {TTI::SK_Broadcast, MVT::v8i8, 1},
6511 {TTI::SK_Broadcast, MVT::v16i8, 1},
6512 {TTI::SK_Broadcast, MVT::v4i16, 1},
6513 {TTI::SK_Broadcast, MVT::v8i16, 1},
6514 {TTI::SK_Broadcast, MVT::v2i32, 1},
6515 {TTI::SK_Broadcast, MVT::v4i32, 1},
6516 {TTI::SK_Broadcast, MVT::v2i64, 1},
6517 {TTI::SK_Broadcast, MVT::v4f16, 1},
6518 {TTI::SK_Broadcast, MVT::v8f16, 1},
6519 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6520 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6521 {TTI::SK_Broadcast, MVT::v2f32, 1},
6522 {TTI::SK_Broadcast, MVT::v4f32, 1},
6523 {TTI::SK_Broadcast, MVT::v2f64, 1},
6524 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6525 // 'zip1/zip2' instructions.
6526 {TTI::SK_Transpose, MVT::v8i8, 1},
6527 {TTI::SK_Transpose, MVT::v16i8, 1},
6528 {TTI::SK_Transpose, MVT::v4i16, 1},
6529 {TTI::SK_Transpose, MVT::v8i16, 1},
6530 {TTI::SK_Transpose, MVT::v2i32, 1},
6531 {TTI::SK_Transpose, MVT::v4i32, 1},
6532 {TTI::SK_Transpose, MVT::v2i64, 1},
6533 {TTI::SK_Transpose, MVT::v4f16, 1},
6534 {TTI::SK_Transpose, MVT::v8f16, 1},
6535 {TTI::SK_Transpose, MVT::v4bf16, 1},
6536 {TTI::SK_Transpose, MVT::v8bf16, 1},
6537 {TTI::SK_Transpose, MVT::v2f32, 1},
6538 {TTI::SK_Transpose, MVT::v4f32, 1},
6539 {TTI::SK_Transpose, MVT::v2f64, 1},
6540 // Select shuffle kinds.
6541 // TODO: handle vXi8/vXi16.
6542 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6543 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6544 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6545 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6546 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6547 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6548 // PermuteSingleSrc shuffle kinds.
6549 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6550 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6551 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6552 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6553 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6554 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6555 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6556 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6557 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6558 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6559 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6560 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6561 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6562 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6563 // Reverse can be lowered with `rev`.
6564 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6565 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6566 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6567 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6568 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6569 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6570 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6571 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6572 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6573 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6574 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6575 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6576 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6577 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6578 // Splice can all be lowered as `ext`.
6579 {TTI::SK_Splice, MVT::v2i32, 1},
6580 {TTI::SK_Splice, MVT::v4i32, 1},
6581 {TTI::SK_Splice, MVT::v2i64, 1},
6582 {TTI::SK_Splice, MVT::v2f32, 1},
6583 {TTI::SK_Splice, MVT::v4f32, 1},
6584 {TTI::SK_Splice, MVT::v2f64, 1},
6585 {TTI::SK_Splice, MVT::v8f16, 1},
6586 {TTI::SK_Splice, MVT::v8bf16, 1},
6587 {TTI::SK_Splice, MVT::v8i16, 1},
6588 {TTI::SK_Splice, MVT::v16i8, 1},
6589 {TTI::SK_Splice, MVT::v4f16, 1},
6590 {TTI::SK_Splice, MVT::v4bf16, 1},
6591 {TTI::SK_Splice, MVT::v4i16, 1},
6592 {TTI::SK_Splice, MVT::v8i8, 1},
6593 // Broadcast shuffle kinds for scalable vectors
6594 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6595 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6596 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6597 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6598 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6599 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6600 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6601 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6602 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6603 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6604 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6605 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6606 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6607 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6608 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6609 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6610 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6611 // Handle the cases for vector.reverse with scalable vectors
6612 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6613 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6614 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6615 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6616 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6617 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6618 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6619 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6620 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6621 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6622 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6623 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6624 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6625 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6626 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6627 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6628 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6629 };
6630 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6631 return LT.first * Entry->Cost;
6632 }
6633
6634 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6635 return getSpliceCost(SrcTy, Index, CostKind);
6636
6637 // Inserting a subvector can often be done with either a D, S or H register
6638 // move, so long as the inserted vector is "aligned".
6639 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6640 LT.second.getSizeInBits() <= 128 && SubTp) {
6641 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6642 if (SubLT.second.isVector()) {
6643 int NumElts = LT.second.getVectorNumElements();
6644 int NumSubElts = SubLT.second.getVectorNumElements();
6645 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6646 return SubLT.first;
6647 }
6648 }
6649
6650 // Restore optimal kind.
6651 if (IsExtractSubvector)
6653 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6654 Args, CxtI);
6655}
6656
6659 const DominatorTree &DT) {
6660 const auto &Strides = DenseMap<Value *, const SCEV *>();
6661 for (BasicBlock *BB : TheLoop->blocks()) {
6662 // Scan the instructions in the block and look for addresses that are
6663 // consecutive and decreasing.
6664 for (Instruction &I : *BB) {
6665 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6667 Type *AccessTy = getLoadStoreType(&I);
6668 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6669 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6670 .value_or(0) < 0)
6671 return true;
6672 }
6673 }
6674 }
6675 return false;
6676}
6677
6679 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6681 // For cases like post-LTO vectorization, when we eventually know the trip
6682 // count, epilogue with fixed-width vectorization can be deleted if the trip
6683 // count is less than the epilogue iterations. That's why we prefer
6684 // fixed-width vectorization in epilogue in case of equal costs.
6685 if (IsEpilogue)
6686 return true;
6687 return ST->useFixedOverScalableIfEqualCost();
6688}
6689
6691 return ST->getEpilogueVectorizationMinVF();
6692}
6693
6695 if (!ST->hasSVE())
6696 return false;
6697
6698 // We don't currently support vectorisation with interleaving for SVE - with
6699 // such loops we're better off not using tail-folding. This gives us a chance
6700 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6701 if (TFI->IAI->hasGroups())
6702 return false;
6703
6705 if (TFI->LVL->getReductionVars().size())
6706 Required |= TailFoldingOpts::Reductions;
6707 if (TFI->LVL->getFixedOrderRecurrences().size())
6708 Required |= TailFoldingOpts::Recurrences;
6709
6710 // We call this to discover whether any load/store pointers in the loop have
6711 // negative strides. This will require extra work to reverse the loop
6712 // predicate, which may be expensive.
6715 *TFI->LVL->getDominatorTree()))
6716 Required |= TailFoldingOpts::Reverse;
6717 if (Required == TailFoldingOpts::Disabled)
6718 Required |= TailFoldingOpts::Simple;
6719
6720 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6721 Required))
6722 return false;
6723
6724 // Don't tail-fold for tight loops where we would be better off interleaving
6725 // with an unpredicated loop.
6726 unsigned NumInsns = 0;
6727 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6728 NumInsns += BB->size();
6729 }
6730
6731 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6732 return NumInsns >= SVETailFoldInsnThreshold;
6733}
6734
6737 StackOffset BaseOffset, bool HasBaseReg,
6738 int64_t Scale, unsigned AddrSpace) const {
6739 // Scaling factors are not free at all.
6740 // Operands | Rt Latency
6741 // -------------------------------------------
6742 // Rt, [Xn, Xm] | 4
6743 // -------------------------------------------
6744 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6745 // Rt, [Xn, Wm, <extend> #imm] |
6747 AM.BaseGV = BaseGV;
6748 AM.BaseOffs = BaseOffset.getFixed();
6749 AM.HasBaseReg = HasBaseReg;
6750 AM.Scale = Scale;
6751 AM.ScalableOffset = BaseOffset.getScalable();
6752 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6753 // Scale represents reg2 * scale, thus account for 1 if
6754 // it is not equal to 0 or 1.
6755 return AM.Scale != 0 && AM.Scale != 1;
6757}
6758
6760 const Instruction *I) const {
6762 // For the binary operators (e.g. or) we need to be more careful than
6763 // selects, here we only transform them if they are already at a natural
6764 // break point in the code - the end of a block with an unconditional
6765 // terminator.
6766 if (I->getOpcode() == Instruction::Or &&
6767 isa<UncondBrInst>(I->getNextNode()))
6768 return true;
6769
6770 if (I->getOpcode() == Instruction::Add ||
6771 I->getOpcode() == Instruction::Sub)
6772 return true;
6773 }
6775}
6776
6779 const TargetTransformInfo::LSRCost &C2) const {
6780 // AArch64 specific here is adding the number of instructions to the
6781 // comparison (though not as the first consideration, as some targets do)
6782 // along with changing the priority of the base additions.
6783 // TODO: Maybe a more nuanced tradeoff between instruction count
6784 // and number of registers? To be investigated at a later date.
6785 if (EnableLSRCostOpt)
6786 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6787 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6788 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6789 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6790
6792}
6793
6794static bool isSplatShuffle(Value *V) {
6795 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6796 return all_equal(Shuf->getShuffleMask());
6797 return false;
6798}
6799
6800/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6801/// or upper half of the vector elements.
6802static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6803 bool AllowSplat = false) {
6804 // Scalable types can't be extract shuffle vectors.
6805 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6806 return false;
6807
6808 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6809 auto *FullTy = FullV->getType();
6810 auto *HalfTy = HalfV->getType();
6811 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6812 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6813 };
6814
6815 auto extractHalf = [](Value *FullV, Value *HalfV) {
6816 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6817 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6818 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6819 };
6820
6821 ArrayRef<int> M1, M2;
6822 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6823 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6824 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6825 return false;
6826
6827 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6828 // it is not checked as an extract below.
6829 if (AllowSplat && isSplatShuffle(Op1))
6830 S1Op1 = nullptr;
6831 if (AllowSplat && isSplatShuffle(Op2))
6832 S2Op1 = nullptr;
6833
6834 // Check that the operands are half as wide as the result and we extract
6835 // half of the elements of the input vectors.
6836 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6837 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6838 return false;
6839
6840 // Check the mask extracts either the lower or upper half of vector
6841 // elements.
6842 int M1Start = 0;
6843 int M2Start = 0;
6844 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6845 if ((S1Op1 &&
6846 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6847 (S2Op1 &&
6848 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6849 return false;
6850
6851 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6852 (M2Start != 0 && M2Start != (NumElements / 2)))
6853 return false;
6854 if (S1Op1 && S2Op1 && M1Start != M2Start)
6855 return false;
6856
6857 return true;
6858}
6859
6860/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6861/// of the vector elements.
6862static bool areExtractExts(Value *Ext1, Value *Ext2) {
6863 auto areExtDoubled = [](Instruction *Ext) {
6864 return Ext->getType()->getScalarSizeInBits() ==
6865 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6866 };
6867
6868 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6869 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6870 !areExtDoubled(cast<Instruction>(Ext1)) ||
6871 !areExtDoubled(cast<Instruction>(Ext2)))
6872 return false;
6873
6874 return true;
6875}
6876
6877/// Check if Op could be used with vmull_high_p64 intrinsic.
6879 Value *VectorOperand = nullptr;
6880 ConstantInt *ElementIndex = nullptr;
6881 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6882 m_ConstantInt(ElementIndex))) &&
6883 ElementIndex->getValue() == 1 &&
6884 isa<FixedVectorType>(VectorOperand->getType()) &&
6885 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6886}
6887
6888/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6889static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6891}
6892
6894 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6895 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6896 if (!GEP || GEP->getNumOperands() != 2)
6897 return false;
6898
6899 Value *Base = GEP->getOperand(0);
6900 Value *Offsets = GEP->getOperand(1);
6901
6902 // We only care about scalar_base+vector_offsets.
6903 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6904 return false;
6905
6906 // Sink extends that would allow us to use 32-bit offset vectors.
6907 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6908 auto *OffsetsInst = cast<Instruction>(Offsets);
6909 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6910 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6911 Ops.push_back(&GEP->getOperandUse(1));
6912 }
6913
6914 // Sink the GEP.
6915 return true;
6916}
6917
6918/// We want to sink following cases:
6919/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6920/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6922 if (match(Op, m_VScale()))
6923 return true;
6924 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6926 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6927 return true;
6928 }
6929 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6931 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6932 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6933 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6934 return true;
6935 }
6936 return false;
6937}
6938
6939static bool isFNeg(Value *Op) { return match(Op, m_FNeg(m_Value())); }
6940
6941/// Check if sinking \p I's operands to I's basic block is profitable, because
6942/// the operands can be folded into a target instruction, e.g.
6943/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6947 switch (II->getIntrinsicID()) {
6948 case Intrinsic::aarch64_neon_smull:
6949 case Intrinsic::aarch64_neon_umull:
6950 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6951 /*AllowSplat=*/true)) {
6952 Ops.push_back(&II->getOperandUse(0));
6953 Ops.push_back(&II->getOperandUse(1));
6954 return true;
6955 }
6956 [[fallthrough]];
6957
6958 case Intrinsic::fma:
6959 case Intrinsic::fmuladd:
6960 if (isa<VectorType>(I->getType()) &&
6961 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6962 !ST->hasFullFP16())
6963 return false;
6964
6965 if (isFNeg(II->getOperand(0)))
6966 Ops.push_back(&II->getOperandUse(0));
6967 if (isFNeg(II->getOperand(1)))
6968 Ops.push_back(&II->getOperandUse(1));
6969
6970 [[fallthrough]];
6971 case Intrinsic::aarch64_neon_sqdmull:
6972 case Intrinsic::aarch64_neon_sqdmulh:
6973 case Intrinsic::aarch64_neon_sqrdmulh:
6974 // Sink splats for index lane variants
6975 if (isSplatShuffle(II->getOperand(0)))
6976 Ops.push_back(&II->getOperandUse(0));
6977 if (isSplatShuffle(II->getOperand(1)))
6978 Ops.push_back(&II->getOperandUse(1));
6979 return !Ops.empty();
6980 case Intrinsic::aarch64_neon_fmlal:
6981 case Intrinsic::aarch64_neon_fmlal2:
6982 case Intrinsic::aarch64_neon_fmlsl:
6983 case Intrinsic::aarch64_neon_fmlsl2:
6984 // Sink splats for index lane variants
6985 if (isSplatShuffle(II->getOperand(1)))
6986 Ops.push_back(&II->getOperandUse(1));
6987 if (isSplatShuffle(II->getOperand(2)))
6988 Ops.push_back(&II->getOperandUse(2));
6989 return !Ops.empty();
6990 case Intrinsic::aarch64_sve_ptest_first:
6991 case Intrinsic::aarch64_sve_ptest_last:
6992 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6993 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6994 Ops.push_back(&II->getOperandUse(0));
6995 return !Ops.empty();
6996 case Intrinsic::aarch64_sme_write_horiz:
6997 case Intrinsic::aarch64_sme_write_vert:
6998 case Intrinsic::aarch64_sme_writeq_horiz:
6999 case Intrinsic::aarch64_sme_writeq_vert: {
7000 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
7001 if (!Idx || Idx->getOpcode() != Instruction::Add)
7002 return false;
7003 Ops.push_back(&II->getOperandUse(1));
7004 return true;
7005 }
7006 case Intrinsic::aarch64_sme_read_horiz:
7007 case Intrinsic::aarch64_sme_read_vert:
7008 case Intrinsic::aarch64_sme_readq_horiz:
7009 case Intrinsic::aarch64_sme_readq_vert:
7010 case Intrinsic::aarch64_sme_ld1b_vert:
7011 case Intrinsic::aarch64_sme_ld1h_vert:
7012 case Intrinsic::aarch64_sme_ld1w_vert:
7013 case Intrinsic::aarch64_sme_ld1d_vert:
7014 case Intrinsic::aarch64_sme_ld1q_vert:
7015 case Intrinsic::aarch64_sme_st1b_vert:
7016 case Intrinsic::aarch64_sme_st1h_vert:
7017 case Intrinsic::aarch64_sme_st1w_vert:
7018 case Intrinsic::aarch64_sme_st1d_vert:
7019 case Intrinsic::aarch64_sme_st1q_vert:
7020 case Intrinsic::aarch64_sme_ld1b_horiz:
7021 case Intrinsic::aarch64_sme_ld1h_horiz:
7022 case Intrinsic::aarch64_sme_ld1w_horiz:
7023 case Intrinsic::aarch64_sme_ld1d_horiz:
7024 case Intrinsic::aarch64_sme_ld1q_horiz:
7025 case Intrinsic::aarch64_sme_st1b_horiz:
7026 case Intrinsic::aarch64_sme_st1h_horiz:
7027 case Intrinsic::aarch64_sme_st1w_horiz:
7028 case Intrinsic::aarch64_sme_st1d_horiz:
7029 case Intrinsic::aarch64_sme_st1q_horiz: {
7030 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
7031 if (!Idx || Idx->getOpcode() != Instruction::Add)
7032 return false;
7033 Ops.push_back(&II->getOperandUse(3));
7034 return true;
7035 }
7036 case Intrinsic::aarch64_neon_pmull:
7037 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
7038 return false;
7039 Ops.push_back(&II->getOperandUse(0));
7040 Ops.push_back(&II->getOperandUse(1));
7041 return true;
7042 case Intrinsic::aarch64_neon_pmull64:
7043 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
7044 II->getArgOperand(1)))
7045 return false;
7046 Ops.push_back(&II->getArgOperandUse(0));
7047 Ops.push_back(&II->getArgOperandUse(1));
7048 return true;
7049 case Intrinsic::masked_gather:
7050 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
7051 return false;
7052 Ops.push_back(&II->getArgOperandUse(0));
7053 return true;
7054 case Intrinsic::masked_scatter:
7055 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
7056 return false;
7057 Ops.push_back(&II->getArgOperandUse(1));
7058 return true;
7059 default:
7060 return false;
7061 }
7062 }
7063
7064 auto ShouldSinkCondition = [](Value *Cond,
7065 SmallVectorImpl<Use *> &Ops) -> bool {
7067 return false;
7069 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
7070 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
7071 return false;
7072 if (isa<CmpInst>(II->getOperand(0)))
7073 Ops.push_back(&II->getOperandUse(0));
7074 return true;
7075 };
7076
7077 switch (I->getOpcode()) {
7078 case Instruction::GetElementPtr:
7079 case Instruction::Add:
7080 case Instruction::Sub:
7081 // Sink vscales closer to uses for better isel
7082 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
7083 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
7084 Ops.push_back(&I->getOperandUse(Op));
7085 return true;
7086 }
7087 }
7088 break;
7089 case Instruction::Select: {
7090 if (!ShouldSinkCondition(I->getOperand(0), Ops))
7091 return false;
7092
7093 Ops.push_back(&I->getOperandUse(0));
7094 return true;
7095 }
7096 case Instruction::UncondBr:
7097 return false;
7098 case Instruction::CondBr: {
7099 if (!ShouldSinkCondition(cast<CondBrInst>(I)->getCondition(), Ops))
7100 return false;
7101
7102 Ops.push_back(&I->getOperandUse(0));
7103 return true;
7104 }
7105 case Instruction::FMul:
7106 // fmul with contract flag can be combined with fadd into fma.
7107 // Sinking fneg into this block enables fmls pattern.
7108 if (cast<FPMathOperator>(I)->hasAllowContract()) {
7109 if (isFNeg(I->getOperand(0)))
7110 Ops.push_back(&I->getOperandUse(0));
7111 if (isFNeg(I->getOperand(1)))
7112 Ops.push_back(&I->getOperandUse(1));
7113 }
7114 break;
7115
7116 // Type | BIC | ORN | EON
7117 // ----------------+-----------+-----------+-----------
7118 // scalar | Base | Base | Base
7119 // scalar w/shift | - | - | -
7120 // fixed vector | NEON/Base | NEON/Base | BSL2N/Base
7121 // scalable vector | SVE | - | BSL2N
7122 case Instruction::Xor:
7123 // EON only for scalars (possibly expanded fixed vectors)
7124 // and vectors using the SVE2/SME BSL2N instruction.
7125 if (I->getType()->isVectorTy() && ST->isNeonAvailable()) {
7126 bool HasBSL2N =
7127 ST->isSVEorStreamingSVEAvailable() && (ST->hasSVE2() || ST->hasSME());
7128 if (!HasBSL2N)
7129 break;
7130 }
7131 [[fallthrough]];
7132 case Instruction::And:
7133 case Instruction::Or:
7134 // Even though we could use the SVE2/SME BSL2N instruction,
7135 // it might pessimize with an extra MOV depending on register allocation.
7136 if (I->getOpcode() == Instruction::Or &&
7137 isa<ScalableVectorType>(I->getType()))
7138 break;
7139 // Shifts can be folded into scalar AND/ORR/EOR,
7140 // but not the non-negated operand of BIC/ORN/EON.
7141 if (!(I->getType()->isVectorTy() && ST->hasNEON()) &&
7143 break;
7144 for (auto &Op : I->operands()) {
7145 // (and/or/xor X, (not Y)) -> (bic/orn/eon X, Y)
7146 if (match(Op.get(), m_Not(m_Value()))) {
7147 Ops.push_back(&Op);
7148 return true;
7149 }
7150 // (and/or/xor X, (splat (not Y))) -> (bic/orn/eon X, (splat Y))
7151 if (match(Op.get(),
7153 m_Value(), m_ZeroMask()))) {
7154 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
7155 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
7156 Ops.push_back(&Not);
7157 Ops.push_back(&InsertElt);
7158 Ops.push_back(&Op);
7159 return true;
7160 }
7161 }
7162 break;
7163 default:
7164 break;
7165 }
7166
7167 if (!I->getType()->isVectorTy())
7168 return !Ops.empty();
7169
7170 switch (I->getOpcode()) {
7171 case Instruction::Sub:
7172 case Instruction::Add: {
7173 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
7174 return false;
7175
7176 // If the exts' operands extract either the lower or upper elements, we
7177 // can sink them too.
7178 auto Ext1 = cast<Instruction>(I->getOperand(0));
7179 auto Ext2 = cast<Instruction>(I->getOperand(1));
7180 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
7181 Ops.push_back(&Ext1->getOperandUse(0));
7182 Ops.push_back(&Ext2->getOperandUse(0));
7183 }
7184
7185 Ops.push_back(&I->getOperandUse(0));
7186 Ops.push_back(&I->getOperandUse(1));
7187
7188 return true;
7189 }
7190 case Instruction::Or: {
7191 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
7192 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
7193 if (ST->hasNEON()) {
7194 Instruction *OtherAnd, *IA, *IB;
7195 Value *MaskValue;
7196 // MainAnd refers to And instruction that has 'Not' as one of its operands
7197 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
7198 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
7199 m_Instruction(IA)))))) {
7200 if (match(OtherAnd,
7201 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
7202 Instruction *MainAnd = I->getOperand(0) == OtherAnd
7203 ? cast<Instruction>(I->getOperand(1))
7204 : cast<Instruction>(I->getOperand(0));
7205
7206 // Both Ands should be in same basic block as Or
7207 if (I->getParent() != MainAnd->getParent() ||
7208 I->getParent() != OtherAnd->getParent())
7209 return false;
7210
7211 // Non-mask operands of both Ands should also be in same basic block
7212 if (I->getParent() != IA->getParent() ||
7213 I->getParent() != IB->getParent())
7214 return false;
7215
7216 Ops.push_back(
7217 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
7218 Ops.push_back(&I->getOperandUse(0));
7219 Ops.push_back(&I->getOperandUse(1));
7220
7221 return true;
7222 }
7223 }
7224 }
7225
7226 return false;
7227 }
7228 case Instruction::Mul: {
7229 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
7230 auto *Ty = cast<VectorType>(V->getType());
7231 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7232 if (Ty->isScalableTy())
7233 return false;
7234
7235 // Indexed variants of Mul exist for i16 and i32 element types only.
7236 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
7237 };
7238
7239 int NumZExts = 0, NumSExts = 0;
7240 for (auto &Op : I->operands()) {
7241 // Make sure we are not already sinking this operand
7242 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7243 continue;
7244
7245 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
7246 auto *Ext = cast<Instruction>(Op);
7247 auto *ExtOp = Ext->getOperand(0);
7248 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
7249 Ops.push_back(&Ext->getOperandUse(0));
7250 Ops.push_back(&Op);
7251
7252 if (isa<SExtInst>(Ext)) {
7253 NumSExts++;
7254 } else {
7255 NumZExts++;
7256 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
7257 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
7258 I->getType()->getScalarSizeInBits())
7259 NumSExts++;
7260 }
7261
7262 continue;
7263 }
7264
7266 if (!Shuffle)
7267 continue;
7268
7269 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
7270 // operand and the s/zext can help create indexed s/umull. This is
7271 // especially useful to prevent i64 mul being scalarized.
7272 if (isSplatShuffle(Shuffle) &&
7273 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
7274 Ops.push_back(&Shuffle->getOperandUse(0));
7275 Ops.push_back(&Op);
7276 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
7277 NumSExts++;
7278 else
7279 NumZExts++;
7280 continue;
7281 }
7282
7283 Value *ShuffleOperand = Shuffle->getOperand(0);
7284 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
7285 if (!Insert)
7286 continue;
7287
7288 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
7289 if (!OperandInstr)
7290 continue;
7291
7292 ConstantInt *ElementConstant =
7293 dyn_cast<ConstantInt>(Insert->getOperand(2));
7294 // Check that the insertelement is inserting into element 0
7295 if (!ElementConstant || !ElementConstant->isZero())
7296 continue;
7297
7298 unsigned Opcode = OperandInstr->getOpcode();
7299 if (Opcode == Instruction::SExt)
7300 NumSExts++;
7301 else if (Opcode == Instruction::ZExt)
7302 NumZExts++;
7303 else {
7304 // If we find that the top bits are known 0, then we can sink and allow
7305 // the backend to generate a umull.
7306 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
7307 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
7308 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
7309 continue;
7310 NumZExts++;
7311 }
7312
7313 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
7314 // the And, just to hoist it again back to the load.
7315 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
7316 Ops.push_back(&Insert->getOperandUse(1));
7317 Ops.push_back(&Shuffle->getOperandUse(0));
7318 Ops.push_back(&Op);
7319 }
7320
7321 // It is profitable to sink if we found two of the same type of extends.
7322 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
7323 return true;
7324
7325 // Otherwise, see if we should sink splats for indexed variants.
7326 if (!ShouldSinkSplatForIndexedVariant(I))
7327 return false;
7328
7329 Ops.clear();
7330 if (isSplatShuffle(I->getOperand(0)))
7331 Ops.push_back(&I->getOperandUse(0));
7332 if (isSplatShuffle(I->getOperand(1)))
7333 Ops.push_back(&I->getOperandUse(1));
7334
7335 return !Ops.empty();
7336 }
7337 case Instruction::FMul: {
7338 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7339 if (I->getType()->isScalableTy())
7340 return !Ops.empty();
7341
7342 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
7343 !ST->hasFullFP16())
7344 return !Ops.empty();
7345
7346 // Sink splats for index lane variants
7347 if (isSplatShuffle(I->getOperand(0)))
7348 Ops.push_back(&I->getOperandUse(0));
7349 if (isSplatShuffle(I->getOperand(1)))
7350 Ops.push_back(&I->getOperandUse(1));
7351 return !Ops.empty();
7352 }
7353 default:
7354 return false;
7355 }
7356 return false;
7357}
static bool isAllActivePredicate(const SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use of the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI, SmallVectorImpl< StringRef > &Features)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool isFNeg(Value *Op)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static std::optional< Instruction * > instCombineSVEVectorMlaU(InstCombiner &IC, IntrinsicInst &II)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file defines a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
#define LLVM_DEBUG(...)
Definition Debug.h:119
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext)) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool preferTailFoldingOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
APInt getPriorityMask(const Function &F) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
InstructionCost getBranchMispredictPenalty() const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
unsigned countLeadingOnes() const
Definition APInt.h:1647
void negate()
Negate this APInt in place.
Definition APInt.h:1491
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1084
unsigned logBase2() const
Definition APInt.h:1784
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:254
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:745
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:756
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:839
bool isUnsigned() const
Definition InstrTypes.h:999
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:219
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
LLVM_ABI Constant * getSplatValue(bool AllowPoison=false) const
If all elements of the vector constant have the same value, return that value.
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:791
bool empty() const
Definition DenseMap.h:173
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:216
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noSignedZeros() const
Definition FMF.h:67
bool noInfs() const
Definition FMF.h:66
bool approxFunc() const
Definition FMF.h:70
bool allowContract() const
Definition FMF.h:69
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2669
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2657
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:599
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:619
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:604
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2052
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:534
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2366
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2581
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1789
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1491
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2284
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1958
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2691
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1971
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1474
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:614
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2357
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1178
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2900
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
bool isFixedLengthVector() const
MVT getVectorElementType() const
size_type size() const
Definition MapVector.h:58
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:891
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
An instruction for storing to memory.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:730
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:286
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:993
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:400
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:934
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
brc_match< Cond_t, match_bind< BasicBlock >, match_bind< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
LLVM_ABI Libcall getPOW(EVT RetVT)
getPOW - Return the POW_* value for the given types, or UNKNOWN_LIBCALL if there is none.
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ FSub
Subtraction of floats.
@ FAddChainWithSubs
A chain of fadds and fsubs.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2165
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:307
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:382
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
bool isFixedLengthVector() const
Definition ValueTypes.h:199
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:187
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:129
bool isVariant() const
Definition MCSchedule.h:150
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:264
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...