// Source: doxygen export of LLVM 23.0.0git —
// llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
215
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
249 return true;
250 }
251 }
252 return false;
253}
254
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.split(Features, ",");
261}
262
265 extractAttrFeatures(F, this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
271 extractAttrFeatures(F, this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
276 return F.hasFnAttribute("fmv-features");
277}
278
279const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
280 AArch64::FeatureExecuteOnly,
281};
282
284 const Function *Callee) const {
285 SMECallAttrs CallAttrs(*Caller, *Callee);
286
287 // Never inline a function explicitly marked as being streaming,
288 // into a non-streaming function. Assume it was marked as streaming
289 // for a reason.
290 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
292 return false;
293
294 // When inlining, we should consider the body of the function, not the
295 // interface.
296 if (CallAttrs.callee().hasStreamingBody()) {
297 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
298 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
299 }
300
301 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
302 return false;
303
304 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
305 CallAttrs.requiresPreservingZT0() ||
306 CallAttrs.requiresPreservingAllZAState()) {
307 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
308 return false;
309 }
310
311 const TargetMachine &TM = getTLI()->getTargetMachine();
312 const FeatureBitset &CallerBits =
313 TM.getSubtargetImpl(*Caller)->getFeatureBits();
314 const FeatureBitset &CalleeBits =
315 TM.getSubtargetImpl(*Callee)->getFeatureBits();
316 // Adjust the feature bitsets by inverting some of the bits. This is needed
317 // for target features that represent restrictions rather than capabilities,
318 // for example a "+execute-only" callee can be inlined into a caller without
319 // "+execute-only", but not vice versa.
320 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
321 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
322
323 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
324}
325
327 const Function *Callee,
328 ArrayRef<Type *> Types) const {
329 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
330 return false;
331
332 // We need to ensure that argument promotion does not attempt to promote
333 // pointers to fixed-length vector types larger than 128 bits like
334 // <8 x float> (and pointers to aggregate types which have such fixed-length
335 // vector type members) into the values of the pointees. Such vector types
336 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
337 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
338 // types can be safely treated as 128-bit NEON types and they cannot be
339 // distinguished in IR.
340 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
341 auto FVTy = dyn_cast<FixedVectorType>(Ty);
342 return FVTy &&
343 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
344 }))
345 return false;
346
347 return true;
348}
349
350unsigned
352 unsigned DefaultCallPenalty) const {
353 // This function calculates a penalty for executing Call in F.
354 //
355 // There are two ways this function can be called:
356 // (1) F:
357 // call from F -> G (the call here is Call)
358 //
359 // For (1), Call.getCaller() == F, so it will always return a high cost if
360 // a streaming-mode change is required (thus promoting the need to inline the
361 // function)
362 //
363 // (2) F:
364 // call from F -> G (the call here is not Call)
365 // G:
366 // call from G -> H (the call here is Call)
367 //
368 // For (2), if after inlining the body of G into F the call to H requires a
369 // streaming-mode change, and the call to G from F would also require a
370 // streaming-mode change, then there is benefit to do the streaming-mode
371 // change only once and avoid inlining of G into F.
372
373 SMEAttrs FAttrs(*F);
374 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
375
376 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
377 if (F == Call.getCaller()) // (1)
378 return CallPenaltyChangeSM * DefaultCallPenalty;
379 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
380 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
381 }
382
383 return DefaultCallPenalty;
384}
385
389
390 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
391 return true;
392
394 ST->isSVEorStreamingSVEAvailable() &&
395 !ST->disableMaximizeScalableBandwidth();
396}
397
398/// Calculate the cost of materializing a 64-bit value. This helper
399/// method might only calculate a fraction of a larger immediate. Therefore it
400/// is valid to return a cost of ZERO.
402 // Check if the immediate can be encoded within an instruction.
403 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
404 return 0;
405
406 if (Val < 0)
407 Val = ~Val;
408
409 // Calculate how many moves we will need to materialize this constant.
411 AArch64_IMM::expandMOVImm(Val, 64, Insn);
412 return Insn.size();
413}
414
415/// Calculate the cost of materializing the given constant.
419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 if (BitSize == 0)
423 return ~0U;
424
425 // Sign-extend all constants to a multiple of 64-bit.
426 APInt ImmVal = Imm;
427 if (BitSize & 0x3f)
428 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
429
430 // Split the constant into 64-bit chunks and calculate the cost for each
431 // chunk.
433 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
434 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
435 int64_t Val = Tmp.getSExtValue();
436 Cost += getIntImmCost(Val);
437 }
438 // We need at least one instruction to materialze the constant.
439 return std::max<InstructionCost>(1, Cost);
440}
441
443 const APInt &Imm, Type *Ty,
445 Instruction *Inst) const {
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 unsigned ImmIdx = ~0U;
455 switch (Opcode) {
456 default:
457 return TTI::TCC_Free;
458 case Instruction::GetElementPtr:
459 // Always hoist the base address of a GetElementPtr.
460 if (Idx == 0)
461 return 2 * TTI::TCC_Basic;
462 return TTI::TCC_Free;
463 case Instruction::Store:
464 ImmIdx = 0;
465 break;
466 case Instruction::Add:
467 case Instruction::Sub:
468 case Instruction::Mul:
469 case Instruction::UDiv:
470 case Instruction::SDiv:
471 case Instruction::URem:
472 case Instruction::SRem:
473 case Instruction::And:
474 case Instruction::Or:
475 case Instruction::Xor:
476 case Instruction::ICmp:
477 ImmIdx = 1;
478 break;
479 // Always return TCC_Free for the shift value of a shift instruction.
480 case Instruction::Shl:
481 case Instruction::LShr:
482 case Instruction::AShr:
483 if (Idx == 1)
484 return TTI::TCC_Free;
485 break;
486 case Instruction::Trunc:
487 case Instruction::ZExt:
488 case Instruction::SExt:
489 case Instruction::IntToPtr:
490 case Instruction::PtrToInt:
491 case Instruction::BitCast:
492 case Instruction::PHI:
493 case Instruction::Call:
494 case Instruction::Select:
495 case Instruction::Ret:
496 case Instruction::Load:
497 break;
498 }
499
500 if (Idx == ImmIdx) {
501 int NumConstants = (BitSize + 63) / 64;
503 return (Cost <= NumConstants * TTI::TCC_Basic)
504 ? static_cast<int>(TTI::TCC_Free)
505 : Cost;
506 }
508}
509
512 const APInt &Imm, Type *Ty,
514 assert(Ty->isIntegerTy());
515
516 unsigned BitSize = Ty->getPrimitiveSizeInBits();
517 // There is no cost model for constants with a bit size of 0. Return TCC_Free
518 // here, so that constant hoisting will ignore this constant.
519 if (BitSize == 0)
520 return TTI::TCC_Free;
521
522 // Most (all?) AArch64 intrinsics do not support folding immediates into the
523 // selected instruction, so we compute the materialization cost for the
524 // immediate directly.
525 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
527
528 switch (IID) {
529 default:
530 return TTI::TCC_Free;
531 case Intrinsic::sadd_with_overflow:
532 case Intrinsic::uadd_with_overflow:
533 case Intrinsic::ssub_with_overflow:
534 case Intrinsic::usub_with_overflow:
535 case Intrinsic::smul_with_overflow:
536 case Intrinsic::umul_with_overflow:
537 if (Idx == 1) {
538 int NumConstants = (BitSize + 63) / 64;
540 return (Cost <= NumConstants * TTI::TCC_Basic)
541 ? static_cast<int>(TTI::TCC_Free)
542 : Cost;
543 }
544 break;
545 case Intrinsic::experimental_stackmap:
546 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
547 return TTI::TCC_Free;
548 break;
549 case Intrinsic::experimental_patchpoint_void:
550 case Intrinsic::experimental_patchpoint:
551 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
552 return TTI::TCC_Free;
553 break;
554 case Intrinsic::experimental_gc_statepoint:
555 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
556 return TTI::TCC_Free;
557 break;
558 }
560}
561
563AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
564 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
565 if (TyWidth == 32 || TyWidth == 64)
567 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
568 return TTI::PSK_Software;
569}
570
571static bool isUnpackedVectorVT(EVT VecVT) {
572 return VecVT.isScalableVector() &&
574}
575
577 const IntrinsicCostAttributes &ICA) {
578 // We need to know at least the number of elements in the vector of buckets
579 // and the size of each element to update.
580 if (ICA.getArgTypes().size() < 2)
582
583 // Only interested in costing for the hardware instruction from SVE2.
584 if (!ST->hasSVE2())
586
587 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
588 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
589 unsigned TotalHistCnts = 1;
590
591 unsigned EltSize = EltTy->getScalarSizeInBits();
592 // Only allow (up to 64b) integers or pointers
593 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
595
596 // FIXME: We should be able to generate histcnt for fixed-length vectors
597 // using ptrue with a specific VL.
598 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
599 unsigned EC = VTy->getElementCount().getKnownMinValue();
600 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
602
603 // HistCnt only supports 32b and 64b element types
604 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
605
606 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
608
609 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
610 TotalHistCnts = EC / NaturalVectorWidth;
611
612 return InstructionCost(BaseHistCntCost * TotalHistCnts);
613 }
614
616}
617
621 // The code-generator is currently not able to handle scalable vectors
622 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
623 // it. This change will be removed when code-generation for these types is
624 // sufficiently reliable.
625 auto *RetTy = ICA.getReturnType();
626 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
627 if (VTy->getElementCount() == ElementCount::getScalable(1))
629
630 switch (ICA.getID()) {
631 case Intrinsic::experimental_vector_histogram_add: {
632 InstructionCost HistCost = getHistogramCost(ST, ICA);
633 // If the cost isn't valid, we may still be able to scalarize
634 if (HistCost.isValid())
635 return HistCost;
636 break;
637 }
638 case Intrinsic::umin:
639 case Intrinsic::umax:
640 case Intrinsic::smin:
641 case Intrinsic::smax: {
642 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
643 MVT::v8i16, MVT::v2i32, MVT::v4i32,
644 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
645 MVT::nxv2i64};
646 auto LT = getTypeLegalizationCost(RetTy);
647 // v2i64 types get converted to cmp+bif hence the cost of 2
648 if (LT.second == MVT::v2i64)
649 return LT.first * 2;
650 if (any_of(ValidMinMaxTys, equal_to(LT.second)))
651 return LT.first;
652 break;
653 }
654 case Intrinsic::scmp:
655 case Intrinsic::ucmp: {
656 static const CostTblEntry BitreverseTbl[] = {
657 {Intrinsic::scmp, MVT::i32, 3}, // cmp+cset+csinv
658 {Intrinsic::scmp, MVT::i64, 3}, // cmp+cset+csinv
659 {Intrinsic::scmp, MVT::v8i8, 3}, // cmgt+cmgt+sub
660 {Intrinsic::scmp, MVT::v16i8, 3}, // cmgt+cmgt+sub
661 {Intrinsic::scmp, MVT::v4i16, 3}, // cmgt+cmgt+sub
662 {Intrinsic::scmp, MVT::v8i16, 3}, // cmgt+cmgt+sub
663 {Intrinsic::scmp, MVT::v2i32, 3}, // cmgt+cmgt+sub
664 {Intrinsic::scmp, MVT::v4i32, 3}, // cmgt+cmgt+sub
665 {Intrinsic::scmp, MVT::v1i64, 3}, // cmgt+cmgt+sub
666 {Intrinsic::scmp, MVT::v2i64, 3}, // cmgt+cmgt+sub
667 };
668 const auto LT = getTypeLegalizationCost(RetTy);
669 const auto *Entry =
670 CostTableLookup(BitreverseTbl, Intrinsic::scmp, LT.second);
671 if (Entry)
672 return Entry->Cost * LT.first;
673 break;
674 }
675 case Intrinsic::sadd_sat:
676 case Intrinsic::ssub_sat:
677 case Intrinsic::uadd_sat:
678 case Intrinsic::usub_sat: {
679 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
680 MVT::v8i16, MVT::v2i32, MVT::v4i32,
681 MVT::v2i64};
682 auto LT = getTypeLegalizationCost(RetTy);
683 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
684 // need to extend the type, as it uses shr(qadd(shl, shl)).
685 unsigned Instrs =
686 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
687 if (any_of(ValidSatTys, equal_to(LT.second)))
688 return LT.first * Instrs;
689
691 uint64_t VectorSize = TS.getKnownMinValue();
692
693 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
694 return LT.first * Instrs;
695
696 break;
697 }
698 case Intrinsic::abs: {
699 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
700 MVT::v8i16, MVT::v2i32, MVT::v4i32,
701 MVT::v2i64, MVT::nxv16i8, MVT::nxv8i16,
702 MVT::nxv4i32, MVT::nxv2i64};
703 auto LT = getTypeLegalizationCost(RetTy);
704 if (any_of(ValidAbsTys, equal_to(LT.second)))
705 return LT.first;
706 break;
707 }
708 case Intrinsic::bswap: {
709 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
710 MVT::v4i32, MVT::v2i64};
711 auto LT = getTypeLegalizationCost(RetTy);
712 if (any_of(ValidAbsTys, equal_to(LT.second)) &&
713 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
714 return LT.first;
715 break;
716 }
717 case Intrinsic::fma:
718 case Intrinsic::fmuladd: {
719 // Given a fma or fmuladd, cost it the same as a fmul instruction which are
720 // usually the same for costs. TODO: Add fp16 and bf16 expansion costs.
721 Type *EltTy = RetTy->getScalarType();
722 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
723 (EltTy->isHalfTy() && ST->hasFullFP16()))
724 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
725 break;
726 }
727 case Intrinsic::stepvector: {
728 InstructionCost Cost = 1; // Cost of the `index' instruction
729 auto LT = getTypeLegalizationCost(RetTy);
730 // Legalisation of illegal vectors involves an `index' instruction plus
731 // (LT.first - 1) vector adds.
732 if (LT.first > 1) {
733 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
734 InstructionCost AddCost =
735 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
736 Cost += AddCost * (LT.first - 1);
737 }
738 return Cost;
739 }
740 case Intrinsic::vector_extract:
741 case Intrinsic::vector_insert: {
742 // If both the vector and subvector types are legal types and the index
743 // is 0, then this should be a no-op or simple operation; return a
744 // relatively low cost.
745
746 // If arguments aren't actually supplied, then we cannot determine the
747 // value of the index. We also want to skip predicate types.
748 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
750 break;
751
752 LLVMContext &C = RetTy->getContext();
753 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
754 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
755 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
756 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
757 // Skip this if either the vector or subvector types are unpacked
758 // SVE types; they may get lowered to stack stores and loads.
759 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
760 break;
761
763 getTLI()->getTypeConversion(C, SubVecVT);
765 getTLI()->getTypeConversion(C, VecVT);
766 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
767 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
768 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
769 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
770 return TTI::TCC_Free;
771 break;
772 }
773 case Intrinsic::bitreverse: {
774 static const CostTblEntry BitreverseTbl[] = {
775 {Intrinsic::bitreverse, MVT::i32, 1},
776 {Intrinsic::bitreverse, MVT::i64, 1},
777 {Intrinsic::bitreverse, MVT::v8i8, 1},
778 {Intrinsic::bitreverse, MVT::v16i8, 1},
779 {Intrinsic::bitreverse, MVT::v4i16, 2},
780 {Intrinsic::bitreverse, MVT::v8i16, 2},
781 {Intrinsic::bitreverse, MVT::v2i32, 2},
782 {Intrinsic::bitreverse, MVT::v4i32, 2},
783 {Intrinsic::bitreverse, MVT::v1i64, 2},
784 {Intrinsic::bitreverse, MVT::v2i64, 2},
785 };
786 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
787 const auto *Entry =
788 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
789 if (Entry) {
790 // Cost Model is using the legal type(i32) that i8 and i16 will be
791 // converted to +1 so that we match the actual lowering cost
792 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
793 TLI->getValueType(DL, RetTy, true) == MVT::i16)
794 return LegalisationCost.first * Entry->Cost + 1;
795
796 return LegalisationCost.first * Entry->Cost;
797 }
798 break;
799 }
800 case Intrinsic::ctpop: {
801 if (!ST->hasNEON()) {
802 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
803 return getTypeLegalizationCost(RetTy).first * 12;
804 }
805 static const CostTblEntry CtpopCostTbl[] = {
806 {ISD::CTPOP, MVT::v2i64, 4},
807 {ISD::CTPOP, MVT::v4i32, 3},
808 {ISD::CTPOP, MVT::v8i16, 2},
809 {ISD::CTPOP, MVT::v16i8, 1},
810 {ISD::CTPOP, MVT::i64, 4},
811 {ISD::CTPOP, MVT::v2i32, 3},
812 {ISD::CTPOP, MVT::v4i16, 2},
813 {ISD::CTPOP, MVT::v8i8, 1},
814 {ISD::CTPOP, MVT::i32, 5},
815 };
816 auto LT = getTypeLegalizationCost(RetTy);
817 MVT MTy = LT.second;
818 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
819 // Extra cost of +1 when illegal vector types are legalized by promoting
820 // the integer type.
821 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
822 RetTy->getScalarSizeInBits()
823 ? 1
824 : 0;
825 return LT.first * Entry->Cost + ExtraCost;
826 }
827 break;
828 }
829 case Intrinsic::sadd_with_overflow:
830 case Intrinsic::uadd_with_overflow:
831 case Intrinsic::ssub_with_overflow:
832 case Intrinsic::usub_with_overflow:
833 case Intrinsic::smul_with_overflow:
834 case Intrinsic::umul_with_overflow: {
835 static const CostTblEntry WithOverflowCostTbl[] = {
836 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
837 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
838 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
839 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
840 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
841 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
842 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
843 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
844 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
845 {Intrinsic::usub_with_overflow, MVT::i8, 3},
846 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
847 {Intrinsic::usub_with_overflow, MVT::i16, 3},
848 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
849 {Intrinsic::usub_with_overflow, MVT::i32, 1},
850 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
851 {Intrinsic::usub_with_overflow, MVT::i64, 1},
852 {Intrinsic::smul_with_overflow, MVT::i8, 5},
853 {Intrinsic::umul_with_overflow, MVT::i8, 4},
854 {Intrinsic::smul_with_overflow, MVT::i16, 5},
855 {Intrinsic::umul_with_overflow, MVT::i16, 4},
856 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
857 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
858 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
859 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
860 };
861 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
862 if (MTy.isSimple())
863 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
864 MTy.getSimpleVT()))
865 return Entry->Cost;
866 break;
867 }
868 case Intrinsic::fptosi_sat:
869 case Intrinsic::fptoui_sat: {
870 if (ICA.getArgTypes().empty())
871 break;
872 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
873 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
874 EVT MTy = TLI->getValueType(DL, RetTy);
875 // Check for the legal types, which are where the size of the input and the
876 // output are the same, or we are using cvt f64->i32 or f32->i64.
877 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
878 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
879 LT.second == MVT::v2f64)) {
880 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
881 (LT.second == MVT::f64 && MTy == MVT::i32) ||
882 (LT.second == MVT::f32 && MTy == MVT::i64)))
883 return LT.first;
884 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
885 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
886 MTy.getScalarSizeInBits() == 64)
887 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
888 }
889 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
890 // f32.
891 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
892 return LT.first + getIntrinsicInstrCost(
893 {ICA.getID(),
894 RetTy,
895 {ICA.getArgTypes()[0]->getWithNewType(
896 Type::getFloatTy(RetTy->getContext()))}},
897 CostKind);
898 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
899 (LT.second == MVT::f16 && MTy == MVT::i64) ||
900 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
901 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
902 return LT.first;
903 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
904 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
905 MTy.getScalarSizeInBits() == 32)
906 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
907 // Extending vector types v8f16->v8i32. These current scalarize but the
908 // codegen could be better.
909 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
910 MTy.getScalarSizeInBits() == 64)
911 return MTy.getVectorNumElements() * 3;
912
913 // If we can we use a legal convert followed by a min+max
914 if ((LT.second.getScalarType() == MVT::f32 ||
915 LT.second.getScalarType() == MVT::f64 ||
916 LT.second.getScalarType() == MVT::f16) &&
917 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
918 Type *LegalTy =
919 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
920 if (LT.second.isVector())
921 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
923 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
924 : Intrinsic::umin,
925 LegalTy, {LegalTy, LegalTy});
927 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
928 : Intrinsic::umax,
929 LegalTy, {LegalTy, LegalTy});
931 return LT.first * Cost +
932 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
933 : 1);
934 }
935 // Otherwise we need to follow the default expansion that clamps the value
936 // using a float min/max with a fcmp+sel for nan handling when signed.
937 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
938 RetTy = RetTy->getScalarType();
939 if (LT.second.isVector()) {
940 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
941 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
942 }
943 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
945 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
947 Cost +=
948 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
950 if (IsSigned) {
951 Type *CondTy = RetTy->getWithNewBitWidth(1);
952 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
954 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
956 }
957 return LT.first * Cost;
958 }
959 case Intrinsic::fshl:
960 case Intrinsic::fshr: {
961 if (ICA.getArgs().empty())
962 break;
963
964 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
965
966 // ROTR / ROTL is a funnel shift with equal first and second operand. For
967 // ROTR on integer registers (i32/i64) this can be done in a single ror
968 // instruction. A fshl with a non-constant shift uses a neg + ror.
969 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
970 (RetTy->getPrimitiveSizeInBits() == 32 ||
971 RetTy->getPrimitiveSizeInBits() == 64)) {
972 InstructionCost NegCost =
973 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
974 return 1 + NegCost;
975 }
976
977 // TODO: Add handling for fshl where third argument is not a constant.
978 if (!OpInfoZ.isConstant())
979 break;
980
981 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
982 if (OpInfoZ.isUniform()) {
983 static const CostTblEntry FshlTbl[] = {
984 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
985 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
986 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
987 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
988 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
989 // to avoid having to duplicate the costs.
990 const auto *Entry =
991 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
992 if (Entry)
993 return LegalisationCost.first * Entry->Cost;
994 }
995
996 auto TyL = getTypeLegalizationCost(RetTy);
997 if (!RetTy->isIntegerTy())
998 break;
999
1000 // Estimate cost manually, as types like i8 and i16 will get promoted to
1001 // i32 and CostTableLookup will ignore the extra conversion cost.
1002 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
1003 RetTy->getScalarSizeInBits() < 64) ||
1004 (RetTy->getScalarSizeInBits() % 64 != 0);
1005 unsigned ExtraCost = HigherCost ? 1 : 0;
1006 if (RetTy->getScalarSizeInBits() == 32 ||
1007 RetTy->getScalarSizeInBits() == 64)
1008 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
1009 // extr instruction.
1010 else if (HigherCost)
1011 ExtraCost = 1;
1012 else
1013 break;
1014 return TyL.first + ExtraCost;
1015 }
1016 case Intrinsic::get_active_lane_mask: {
1017 auto RetTy = cast<VectorType>(ICA.getReturnType());
1018 EVT RetVT = getTLI()->getValueType(DL, RetTy);
1019 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1020 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
1021 break;
1022
1023 if (RetTy->isScalableTy()) {
1024 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
1026 break;
1027
1028 auto LT = getTypeLegalizationCost(RetTy);
1029 InstructionCost Cost = LT.first;
1030 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1031 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1032 // nxv32i1 = get_active_lane_mask(base, idx) ->
1033 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1034 if (ST->hasSVE2p1() || ST->hasSME2()) {
1035 Cost /= 2;
1036 if (Cost == 1)
1037 return Cost;
1038 }
1039
1040 // If more than one whilelo intrinsic is required, include the extra cost
1041 // required by the saturating add & select required to increment the
1042 // start value after the first intrinsic call.
1043 Type *OpTy = ICA.getArgTypes()[0];
1044 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1045 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1046 Type *CondTy = OpTy->getWithNewBitWidth(1);
1047 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1049 return Cost + (SplitCost * (Cost - 1));
1050 } else if (!getTLI()->isTypeLegal(RetVT)) {
1051 // We don't have enough context at this point to determine if the mask
1052 // is going to be kept live after the block, which will force the vXi1
1053 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1054 // For now, we just assume the vectorizer created this intrinsic and
1055 // the result will be the input for a PHI. In this case the cost will
1056 // be extremely high for fixed-width vectors.
1057 // NOTE: getScalarizationOverhead returns a cost that's far too
1058 // pessimistic for the actual generated codegen. In reality there are
1059 // two instructions generated per lane.
1060 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1061 }
1062 break;
1063 }
1064 case Intrinsic::experimental_vector_match: {
1065 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1066 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1067 unsigned SearchSize = NeedleTy->getNumElements();
1068 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1069 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1070 // Neoverse V3, these are cheap operations with the same latency as a
1071 // vector ADD. In most cases, however, we also need to do an extra DUP.
1072 // For fixed-length vectors we currently need an extra five--six
1073 // instructions besides the MATCH.
1075 if (isa<FixedVectorType>(RetTy))
1076 Cost += 10;
1077 return Cost;
1078 }
1079 break;
1080 }
1081 case Intrinsic::cttz: {
1082 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1083 if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
1084 return LT.first * 2;
1085 if (LT.second == MVT::v4i16 || LT.second == MVT::v8i16 ||
1086 LT.second == MVT::v2i32 || LT.second == MVT::v4i32)
1087 return LT.first * 3;
1088 break;
1089 }
1090 case Intrinsic::experimental_cttz_elts: {
1091 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1092 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1093 // This will consist of a SVE brkb and a cntp instruction. These
1094 // typically have the same latency and half the throughput as a vector
1095 // add instruction.
1096 return 4;
1097 }
1098 break;
1099 }
1100 case Intrinsic::loop_dependence_raw_mask:
1101 case Intrinsic::loop_dependence_war_mask: {
1102 // The whilewr/rw instructions require SVE2 or SME.
1103 if (ST->hasSVE2() || ST->hasSME()) {
1104 EVT VecVT = getTLI()->getValueType(DL, RetTy);
1105 unsigned EltSizeInBytes =
1106 cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
1107 if (!is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) ||
1108 VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes))
1109 break;
1110 // For fixed-vector types we need to AND the mask with a ptrue vl<N>.
1111 return isa<FixedVectorType>(RetTy) ? 2 : 1;
1112 }
1113 break;
1114 }
1115 case Intrinsic::experimental_vector_extract_last_active:
1116 if (ST->isSVEorStreamingSVEAvailable()) {
1117 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1118 // This should turn into chained clastb instructions.
1119 return LegalCost;
1120 }
1121 break;
1122 case Intrinsic::pow: {
1123 // For scalar calls we know the target has the libcall, and for fixed-width
1124 // vectors we know for the worst case it can be scalarised.
1125 EVT VT = getTLI()->getValueType(DL, RetTy);
1126 RTLIB::Libcall LC = RTLIB::getPOW(VT);
1127 bool HasLibcall = getTLI()->getLibcallImpl(LC) != RTLIB::Unsupported;
1128 bool CanLowerWithLibcalls = !isa<ScalableVectorType>(RetTy) || HasLibcall;
1129
1130 // If we know that the call can be lowered with libcalls then it's safe to
1131 // reduce the costs in some cases. This is important for scalable vectors,
1132 // since we cannot scalarize the call in the absence of a vector math
1133 // library.
1134 if (CanLowerWithLibcalls && ICA.getInst() && !ICA.getArgs().empty()) {
1135 // If we know the fast math flags and the exponent is a constant then the
1136 // cost may be less for some exponents like 0.25 and 0.75.
1137 const Constant *ExpC = dyn_cast<Constant>(ICA.getArgs()[1]);
1138 if (ExpC && isa<VectorType>(ExpC->getType()))
1139 ExpC = ExpC->getSplatValue();
1140 if (auto *ExpF = dyn_cast_or_null<ConstantFP>(ExpC)) {
1141 // The argument must be a FP constant.
1142 bool Is025 = ExpF->getValueAPF().isExactlyValue(0.25);
1143 bool Is075 = ExpF->getValueAPF().isExactlyValue(0.75);
1144 FastMathFlags FMF = ICA.getInst()->getFastMathFlags();
1145 if ((Is025 || Is075) && FMF.noInfs() && FMF.approxFunc() &&
1146 (!Is025 || FMF.noSignedZeros())) {
1147 IntrinsicCostAttributes Attrs(Intrinsic::sqrt, RetTy, {RetTy}, FMF);
1149 if (Is025)
1150 return 2 * Sqrt;
1152 getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
1153 return (Sqrt * 2) + FMul;
1154 }
1155 // TODO: For 1/3 exponents we expect the cbrt call to be slightly
1156 // cheaper than pow.
1157 }
1158 }
1159
1160 if (HasLibcall)
1161 return getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
1162 break;
1163 }
1164 case Intrinsic::sqrt:
1165 case Intrinsic::fabs:
1166 case Intrinsic::ceil:
1167 case Intrinsic::floor:
1168 case Intrinsic::nearbyint:
1169 case Intrinsic::round:
1170 case Intrinsic::rint:
1171 case Intrinsic::roundeven:
1172 case Intrinsic::trunc:
1173 case Intrinsic::minnum:
1174 case Intrinsic::maxnum:
1175 case Intrinsic::minimum:
1176 case Intrinsic::maximum: {
1177 if (isa<ScalableVectorType>(RetTy) && ST->isSVEorStreamingSVEAvailable()) {
1178 auto LT = getTypeLegalizationCost(RetTy);
1179 return LT.first;
1180 }
1181 break;
1182 }
1183 default:
1184 break;
1185 }
1187}
1188
1189/// The function will remove redundant reinterprets casting in the presence
1190/// of the control flow
1191static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1192 IntrinsicInst &II) {
1194 auto RequiredType = II.getType();
1195
1196 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1197 assert(PN && "Expected Phi Node!");
1198
1199 // Don't create a new Phi unless we can remove the old one.
1200 if (!PN->hasOneUse())
1201 return std::nullopt;
1202
1203 for (Value *IncValPhi : PN->incoming_values()) {
1204 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1205 if (!Reinterpret ||
1206 Reinterpret->getIntrinsicID() !=
1207 Intrinsic::aarch64_sve_convert_to_svbool ||
1208 RequiredType != Reinterpret->getArgOperand(0)->getType())
1209 return std::nullopt;
1210 }
1211
1212 // Create the new Phi
1213 IC.Builder.SetInsertPoint(PN);
1214 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1215 Worklist.push_back(PN);
1216
1217 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1218 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1219 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1220 Worklist.push_back(Reinterpret);
1221 }
1222
1223 // Cleanup Phi Node and reinterprets
1224 return IC.replaceInstUsesWith(II, NPN);
1225}
1226
1227// A collection of properties common to SVE intrinsics that allow for combines
1228// to be written without needing to know the specific intrinsic.
1230 //
1231 // Helper routines for common intrinsic definitions.
1232 //
1233
1234 // e.g. llvm.aarch64.sve.add pg, op1, op2
1235 // with IID ==> llvm.aarch64.sve.add_u
1236 static SVEIntrinsicInfo
1243
1244 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1251
1252 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1258
1259 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1265
1266 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1267 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1268 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1269 return SVEIntrinsicInfo()
1272 }
1273
1274 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1275 // llvm.aarch64.sve.ld1 pg, ptr
1282
1283 // All properties relate to predication and thus having a general predicate
1284 // is the minimum requirement to say there is intrinsic info to act on.
// NOTE(review): mirrors hasGoverningPredicate() so callers can simply write
// `if (IInfo)` to test whether any predication info was recorded.
1285 explicit operator bool() const { return hasGoverningPredicate(); }
1286
1287 //
1288 // Properties relating to the governing predicate.
1289 //
1290
1292 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1293 }
1294
1296 assert(hasGoverningPredicate() && "Propery not set!");
1297 return GoverningPredicateIdx;
1298 }
1299
1301 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1302 GoverningPredicateIdx = Index;
1303 return *this;
1304 }
1305
1306 //
1307 // Properties relating to operations the intrinsic could be transformed into.
1308 // NOTE: This does not mean such a transformation is always possible, but the
1309 // knowledge makes it possible to reuse existing optimisations without needing
1310 // to embed specific handling for each intrinsic. For example, instruction
1311 // simplification can be used to optimise an intrinsic's active lanes.
1312 //
1313
1315 return UndefIntrinsic != Intrinsic::not_intrinsic;
1316 }
1317
1319 assert(hasMatchingUndefIntrinsic() && "Propery not set!");
1320 return UndefIntrinsic;
1321 }
1322
1324 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1325 UndefIntrinsic = IID;
1326 return *this;
1327 }
1328
// True when a plain IR opcode (e.g. Instruction::Add) can represent this
// intrinsic's operation on active lanes; 0 acts as the "unset" sentinel.
// NOTE(review): "IROpode" is the established spelling of this member in the
// surrounding code — renaming would break callers.
1329 bool hasMatchingIROpode() const { return IROpcode != 0; }
1330
// Returns the matching IR opcode; only valid once set (see assert).
1331 unsigned getMatchingIROpode() const {
1332 assert(hasMatchingIROpode() && "Propery not set!");
1333 return IROpcode;
1334 }
1335
1337 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1338 IROpcode = Opcode;
1339 return *this;
1340 }
1341
1342 //
1343 // Properties relating to the result of inactive lanes.
1344 //
1345
1347 return ResultLanes == InactiveLanesTakenFromOperand;
1348 }
1349
1351 assert(inactiveLanesTakenFromOperand() && "Propery not set!");
1352 return OperandIdxForInactiveLanes;
1353 }
1354
1356 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1357 ResultLanes = InactiveLanesTakenFromOperand;
1358 OperandIdxForInactiveLanes = Index;
1359 return *this;
1360 }
1361
1363 return ResultLanes == InactiveLanesAreNotDefined;
1364 }
1365
1367 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1368 ResultLanes = InactiveLanesAreNotDefined;
1369 return *this;
1370 }
1371
1373 return ResultLanes == InactiveLanesAreUnused;
1374 }
1375
1377 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1378 ResultLanes = InactiveLanesAreUnused;
1379 return *this;
1380 }
1381
1382 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1383 // inactiveLanesAreZeroed =
1384 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1385 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1386
1388 ResultIsZeroInitialized = true;
1389 return *this;
1390 }
1391
1392 //
1393 // The first operand of unary merging operations is typically only used to
1394 // set the result for inactive lanes. Knowing this allows us to deadcode the
1395 // operand when we can prove there are no inactive lanes.
1396 //
1397
1399 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1400 }
1401
1403 assert(hasOperandWithNoActiveLanes() && "Propery not set!");
1404 return OperandIdxWithNoActiveLanes;
1405 }
1406
1408 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1409 OperandIdxWithNoActiveLanes = Index;
1410 return *this;
1411 }
1412
1413private:
// Operand index of the governing predicate; max() means "not set".
1414 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1415
// Equivalent _u (inactive-lanes-undefined) intrinsic, if one exists.
1416 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
// Matching plain IR opcode (0 when none).
1417 unsigned IROpcode = 0;
1418
// How the result's inactive lanes are produced.
// NOTE(review): this listing omits the first enumerator — presumably the
// `Uninitialized` default state that `ResultLanes` is initialized to below —
// confirm against the full source.
1419 enum PredicationStyle {
1421 InactiveLanesTakenFromOperand,
1422 InactiveLanesAreNotDefined,
1423 InactiveLanesAreUnused
1424 } ResultLanes = Uninitialized;
1425
1426 bool ResultIsZeroInitialized = false;
// Operand supplying the inactive-lane values; max() means "not set".
1427 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
// Operand whose active lanes are never read; max() means "not set".
1428 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1429};
1430
1432 // Some SVE intrinsics do not use scalable vector types, but since they are
1433 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1434 if (!isa<ScalableVectorType>(II.getType()) &&
1435 all_of(II.args(), [&](const Value *V) {
1436 return !isa<ScalableVectorType>(V->getType());
1437 }))
1438 return SVEIntrinsicInfo();
1439
1440 Intrinsic::ID IID = II.getIntrinsicID();
1441 switch (IID) {
1442 default:
1443 break;
1444 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1445 case Intrinsic::aarch64_sve_fcvt_f16f32:
1446 case Intrinsic::aarch64_sve_fcvt_f16f64:
1447 case Intrinsic::aarch64_sve_fcvt_f32f16:
1448 case Intrinsic::aarch64_sve_fcvt_f32f64:
1449 case Intrinsic::aarch64_sve_fcvt_f64f16:
1450 case Intrinsic::aarch64_sve_fcvt_f64f32:
1451 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1452 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1453 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1454 case Intrinsic::aarch64_sve_fcvtzs:
1455 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1456 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1457 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1458 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1459 case Intrinsic::aarch64_sve_fcvtzu:
1460 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1461 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1462 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1463 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1464 case Intrinsic::aarch64_sve_scvtf:
1465 case Intrinsic::aarch64_sve_scvtf_f16i32:
1466 case Intrinsic::aarch64_sve_scvtf_f16i64:
1467 case Intrinsic::aarch64_sve_scvtf_f32i64:
1468 case Intrinsic::aarch64_sve_scvtf_f64i32:
1469 case Intrinsic::aarch64_sve_ucvtf:
1470 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1471 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1472 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1473 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1475
1476 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1477 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1478 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1479 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1481
1482 case Intrinsic::aarch64_sve_fabd:
1483 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1484 case Intrinsic::aarch64_sve_fadd:
1485 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1486 .setMatchingIROpcode(Instruction::FAdd);
1487 case Intrinsic::aarch64_sve_fdiv:
1488 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1489 .setMatchingIROpcode(Instruction::FDiv);
1490 case Intrinsic::aarch64_sve_fmax:
1491 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1492 case Intrinsic::aarch64_sve_fmaxnm:
1493 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1494 case Intrinsic::aarch64_sve_fmin:
1495 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1496 case Intrinsic::aarch64_sve_fminnm:
1497 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1498 case Intrinsic::aarch64_sve_fmla:
1499 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1500 case Intrinsic::aarch64_sve_fmls:
1501 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1502 case Intrinsic::aarch64_sve_fmul:
1503 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1504 .setMatchingIROpcode(Instruction::FMul);
1505 case Intrinsic::aarch64_sve_fmulx:
1506 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1507 case Intrinsic::aarch64_sve_fnmla:
1508 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1509 case Intrinsic::aarch64_sve_fnmls:
1510 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1511 case Intrinsic::aarch64_sve_fsub:
1512 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1513 .setMatchingIROpcode(Instruction::FSub);
1514 case Intrinsic::aarch64_sve_add:
1515 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1516 .setMatchingIROpcode(Instruction::Add);
1517 case Intrinsic::aarch64_sve_mla:
1518 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1519 case Intrinsic::aarch64_sve_mls:
1520 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1521 case Intrinsic::aarch64_sve_mul:
1522 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1523 .setMatchingIROpcode(Instruction::Mul);
1524 case Intrinsic::aarch64_sve_sabd:
1525 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1526 case Intrinsic::aarch64_sve_sdiv:
1527 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1528 .setMatchingIROpcode(Instruction::SDiv);
1529 case Intrinsic::aarch64_sve_smax:
1530 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1531 case Intrinsic::aarch64_sve_smin:
1532 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1533 case Intrinsic::aarch64_sve_smulh:
1534 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1535 case Intrinsic::aarch64_sve_sub:
1536 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1537 .setMatchingIROpcode(Instruction::Sub);
1538 case Intrinsic::aarch64_sve_uabd:
1539 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1540 case Intrinsic::aarch64_sve_udiv:
1541 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1542 .setMatchingIROpcode(Instruction::UDiv);
1543 case Intrinsic::aarch64_sve_umax:
1544 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1545 case Intrinsic::aarch64_sve_umin:
1546 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1547 case Intrinsic::aarch64_sve_umulh:
1548 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1549 case Intrinsic::aarch64_sve_asr:
1550 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1551 .setMatchingIROpcode(Instruction::AShr);
1552 case Intrinsic::aarch64_sve_lsl:
1553 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1554 .setMatchingIROpcode(Instruction::Shl);
1555 case Intrinsic::aarch64_sve_lsr:
1556 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1557 .setMatchingIROpcode(Instruction::LShr);
1558 case Intrinsic::aarch64_sve_and:
1559 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1560 .setMatchingIROpcode(Instruction::And);
1561 case Intrinsic::aarch64_sve_bic:
1562 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1563 case Intrinsic::aarch64_sve_eor:
1564 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1565 .setMatchingIROpcode(Instruction::Xor);
1566 case Intrinsic::aarch64_sve_orr:
1567 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1568 .setMatchingIROpcode(Instruction::Or);
1569 case Intrinsic::aarch64_sve_shsub:
1570 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_shsub_u);
1571 case Intrinsic::aarch64_sve_shsubr:
1573 case Intrinsic::aarch64_sve_sqrshl:
1574 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1575 case Intrinsic::aarch64_sve_sqshl:
1576 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1577 case Intrinsic::aarch64_sve_sqsub:
1578 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1579 case Intrinsic::aarch64_sve_srshl:
1580 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1581 case Intrinsic::aarch64_sve_uhsub:
1582 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsub_u);
1583 case Intrinsic::aarch64_sve_uhsubr:
1585 case Intrinsic::aarch64_sve_uqrshl:
1586 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1587 case Intrinsic::aarch64_sve_uqshl:
1588 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1589 case Intrinsic::aarch64_sve_uqsub:
1590 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1591 case Intrinsic::aarch64_sve_urshl:
1592 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1593
1594 case Intrinsic::aarch64_sve_add_u:
1596 Instruction::Add);
1597 case Intrinsic::aarch64_sve_and_u:
1599 Instruction::And);
1600 case Intrinsic::aarch64_sve_asr_u:
1602 Instruction::AShr);
1603 case Intrinsic::aarch64_sve_eor_u:
1605 Instruction::Xor);
1606 case Intrinsic::aarch64_sve_fadd_u:
1608 Instruction::FAdd);
1609 case Intrinsic::aarch64_sve_fdiv_u:
1611 Instruction::FDiv);
1612 case Intrinsic::aarch64_sve_fmul_u:
1614 Instruction::FMul);
1615 case Intrinsic::aarch64_sve_fsub_u:
1617 Instruction::FSub);
1618 case Intrinsic::aarch64_sve_lsl_u:
1620 Instruction::Shl);
1621 case Intrinsic::aarch64_sve_lsr_u:
1623 Instruction::LShr);
1624 case Intrinsic::aarch64_sve_mul_u:
1626 Instruction::Mul);
1627 case Intrinsic::aarch64_sve_orr_u:
1629 Instruction::Or);
1630 case Intrinsic::aarch64_sve_sdiv_u:
1632 Instruction::SDiv);
1633 case Intrinsic::aarch64_sve_sub_u:
1635 Instruction::Sub);
1636 case Intrinsic::aarch64_sve_udiv_u:
1638 Instruction::UDiv);
1639
1640 case Intrinsic::aarch64_sve_addqv:
1641 case Intrinsic::aarch64_sve_and_z:
1642 case Intrinsic::aarch64_sve_bic_z:
1643 case Intrinsic::aarch64_sve_brka_z:
1644 case Intrinsic::aarch64_sve_brkb_z:
1645 case Intrinsic::aarch64_sve_brkn_z:
1646 case Intrinsic::aarch64_sve_brkpa_z:
1647 case Intrinsic::aarch64_sve_brkpb_z:
1648 case Intrinsic::aarch64_sve_cntp:
1649 case Intrinsic::aarch64_sve_compact:
1650 case Intrinsic::aarch64_sve_eor_z:
1651 case Intrinsic::aarch64_sve_eorv:
1652 case Intrinsic::aarch64_sve_eorqv:
1653 case Intrinsic::aarch64_sve_nand_z:
1654 case Intrinsic::aarch64_sve_nor_z:
1655 case Intrinsic::aarch64_sve_orn_z:
1656 case Intrinsic::aarch64_sve_orr_z:
1657 case Intrinsic::aarch64_sve_orv:
1658 case Intrinsic::aarch64_sve_orqv:
1659 case Intrinsic::aarch64_sve_pnext:
1660 case Intrinsic::aarch64_sve_rdffr_z:
1661 case Intrinsic::aarch64_sve_saddv:
1662 case Intrinsic::aarch64_sve_uaddv:
1663 case Intrinsic::aarch64_sve_umaxv:
1664 case Intrinsic::aarch64_sve_umaxqv:
1665 case Intrinsic::aarch64_sve_cmpeq:
1666 case Intrinsic::aarch64_sve_cmpeq_wide:
1667 case Intrinsic::aarch64_sve_cmpge:
1668 case Intrinsic::aarch64_sve_cmpge_wide:
1669 case Intrinsic::aarch64_sve_cmpgt:
1670 case Intrinsic::aarch64_sve_cmpgt_wide:
1671 case Intrinsic::aarch64_sve_cmphi:
1672 case Intrinsic::aarch64_sve_cmphi_wide:
1673 case Intrinsic::aarch64_sve_cmphs:
1674 case Intrinsic::aarch64_sve_cmphs_wide:
1675 case Intrinsic::aarch64_sve_cmple_wide:
1676 case Intrinsic::aarch64_sve_cmplo_wide:
1677 case Intrinsic::aarch64_sve_cmpls_wide:
1678 case Intrinsic::aarch64_sve_cmplt_wide:
1679 case Intrinsic::aarch64_sve_cmpne:
1680 case Intrinsic::aarch64_sve_cmpne_wide:
1681 case Intrinsic::aarch64_sve_facge:
1682 case Intrinsic::aarch64_sve_facgt:
1683 case Intrinsic::aarch64_sve_fcmpeq:
1684 case Intrinsic::aarch64_sve_fcmpge:
1685 case Intrinsic::aarch64_sve_fcmpgt:
1686 case Intrinsic::aarch64_sve_fcmpne:
1687 case Intrinsic::aarch64_sve_fcmpuo:
1688 case Intrinsic::aarch64_sve_ld1:
1689 case Intrinsic::aarch64_sve_ld1_gather:
1690 case Intrinsic::aarch64_sve_ld1_gather_index:
1691 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1692 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1693 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1694 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1695 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1696 case Intrinsic::aarch64_sve_ld1q_gather_index:
1697 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1698 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1699 case Intrinsic::aarch64_sve_ld1ro:
1700 case Intrinsic::aarch64_sve_ld1rq:
1701 case Intrinsic::aarch64_sve_ld1udq:
1702 case Intrinsic::aarch64_sve_ld1uwq:
1703 case Intrinsic::aarch64_sve_ld2_sret:
1704 case Intrinsic::aarch64_sve_ld2q_sret:
1705 case Intrinsic::aarch64_sve_ld3_sret:
1706 case Intrinsic::aarch64_sve_ld3q_sret:
1707 case Intrinsic::aarch64_sve_ld4_sret:
1708 case Intrinsic::aarch64_sve_ld4q_sret:
1709 case Intrinsic::aarch64_sve_ldff1:
1710 case Intrinsic::aarch64_sve_ldff1_gather:
1711 case Intrinsic::aarch64_sve_ldff1_gather_index:
1712 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1713 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1714 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1715 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1716 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1717 case Intrinsic::aarch64_sve_ldnf1:
1718 case Intrinsic::aarch64_sve_ldnt1:
1719 case Intrinsic::aarch64_sve_ldnt1_gather:
1720 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1721 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1722 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1724
1725 case Intrinsic::aarch64_sve_prf:
1726 case Intrinsic::aarch64_sve_prfb_gather_index:
1727 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1728 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1729 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1730 case Intrinsic::aarch64_sve_prfd_gather_index:
1731 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1732 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1733 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1734 case Intrinsic::aarch64_sve_prfh_gather_index:
1735 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1736 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1737 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1738 case Intrinsic::aarch64_sve_prfw_gather_index:
1739 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1740 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1741 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1743
1744 case Intrinsic::aarch64_sve_st1_scatter:
1745 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1746 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1747 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1748 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1749 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1750 case Intrinsic::aarch64_sve_st1dq:
1751 case Intrinsic::aarch64_sve_st1q_scatter_index:
1752 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1753 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1754 case Intrinsic::aarch64_sve_st1wq:
1755 case Intrinsic::aarch64_sve_stnt1:
1756 case Intrinsic::aarch64_sve_stnt1_scatter:
1757 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1758 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1759 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1761 case Intrinsic::aarch64_sve_st2:
1762 case Intrinsic::aarch64_sve_st2q:
1764 case Intrinsic::aarch64_sve_st3:
1765 case Intrinsic::aarch64_sve_st3q:
1767 case Intrinsic::aarch64_sve_st4:
1768 case Intrinsic::aarch64_sve_st4q:
1770 }
1771
1772 return SVEIntrinsicInfo();
1773}
1774
// Return true if \p Pred is provably an all-active SVE predicate, i.e. a
// constant where every lane is set. Predicate casts that cannot clear any
// lane are looked through first.
// NOTE(review): the match() call heads on the lines below appear truncated in
// this view — presumably they match the svbool conversion intrinsics; verify
// against the full source.
static bool isAllActivePredicate(Value *Pred) {
  Value *UncastedPred;

  // Look through predicate casts that only remove lanes.
                      m_Value(UncastedPred)))) {
    auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
    Pred = UncastedPred;

                      m_Value(UncastedPred))))
      // If the predicate has the same or less lanes than the uncasted predicate
      // then we know the casting has no effect.
      if (OrigPredTy->getMinNumElements() <=
          cast<ScalableVectorType>(UncastedPred->getType())
              ->getMinNumElements())
        Pred = UncastedPred;
  }

  // All-active iff what remains is a constant with every lane set.
  auto *C = dyn_cast<Constant>(Pred);
  return C && C->isAllOnesValue();
}
1797
// Simplify `V` by only considering the operations that affect active lanes.
// This function should only return existing Values or newly created Constants.
static Value *stripInactiveLanes(Value *V, const Value *Pg) {
  // A sve.dup governed by the same predicate Pg writes a known constant to
  // exactly the lanes the caller cares about, so for those lanes it behaves
  // like a constant splat of that value.
  auto *Dup = dyn_cast<IntrinsicInst>(V);
  if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
      Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
        cast<VectorType>(V->getType())->getElementCount(),
        cast<Constant>(Dup->getOperand(2)));

  // Otherwise the value is returned unchanged.
  return V;
}
1810
// Try to simplify a predicated SVE binary-op intrinsic by treating it as its
// matching IR opcode on the active lanes only. On success the intrinsic is
// replaced (possibly with a select to preserve inactive lanes); returns
// std::nullopt when no simplification applies.
static std::optional<Instruction *>
                          const SVEIntrinsicInfo &IInfo) {
  const unsigned Opc = IInfo.getMatchingIROpode();
  assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");

  Value *Pg = II.getOperand(0);
  Value *Op1 = II.getOperand(1);
  Value *Op2 = II.getOperand(2);
  const DataLayout &DL = II.getDataLayout();

  // Canonicalise constants to the RHS.
      isa<Constant>(Op1) && !isa<Constant>(Op2)) {
    IC.replaceOperand(II, 1, Op2);
    IC.replaceOperand(II, 2, Op1);
    return &II;
  }

  // Only active lanes matter when simplifying the operation.
  Op1 = stripInactiveLanes(Op1, Pg);
  Op2 = stripInactiveLanes(Op2, Pg);

  // Preserve fast-math flags in the query when the intrinsic is an FP op.
  Value *SimpleII;
  if (auto FII = dyn_cast<FPMathOperator>(&II))
    SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
  else
    SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);

  // An SVE intrinsic's result is always defined. However, this is not the case
  // for its equivalent IR instruction (e.g. when shifting by an amount more
  // than the data's bitwidth). Simplifications to an undefined result must be
  // ignored to preserve the intrinsic's expected behaviour.
  if (!SimpleII || isa<UndefValue>(SimpleII))
    return std::nullopt;

  // Inactive lanes carry no defined contract, so a plain replacement is safe.
  if (IInfo.inactiveLanesAreNotDefined())
    return IC.replaceInstUsesWith(II, SimpleII);

  Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());

  // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
  if (SimpleII == Inactive)
    return IC.replaceInstUsesWith(II, SimpleII);

  // Inactive lanes must be preserved.
  SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
  return IC.replaceInstUsesWith(II, SimpleII);
}
1860
// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
// to operations with less strict inactive lane requirements.
static std::optional<Instruction *>
                     const SVEIntrinsicInfo &IInfo) {
  // Nothing to reason about without a governing predicate.
  if (!IInfo.hasGoverningPredicate())
    return std::nullopt;

  auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());

  // If there are no active lanes.
  if (match(OpPredicate, m_ZeroInt())) {
    // The result is fully determined by where inactive lanes come from.
    return IC.replaceInstUsesWith(
        II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));

    if (IInfo.inactiveLanesAreUnused()) {
      if (IInfo.resultIsZeroInitialized())

      // With unused inactive lanes and no active lanes, the call is dead.
      return IC.eraseInstFromFunction(II);
    }
  }

  // If there are no inactive lanes.
  if (isAllActivePredicate(OpPredicate)) {
    // Operands that only feed inactive lanes are irrelevant; mark them undef
    // so later folds are not blocked by them.
    if (IInfo.hasOperandWithNoActiveLanes()) {
      unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
      if (!isa<UndefValue>(II.getOperand(OpIdx)))
        return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
    }

    // Canonicalise to the _u ("undef inactive lanes") intrinsic variant.
    if (IInfo.hasMatchingUndefIntrinsic()) {
      auto *NewDecl = Intrinsic::getOrInsertDeclaration(
          II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
      II.setCalledFunction(NewDecl);
      return &II;
    }
  }

  // Operation specific simplifications.
  if (IInfo.hasMatchingIROpode() &&
    return simplifySVEIntrinsicBinOp(IC, II, IInfo);

  return std::nullopt;
}
1908
1909// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1910// => (binop (pred) (from_svbool _) (from_svbool _))
1911//
1912// The above transformation eliminates a `to_svbool` in the predicate
1913// operand of bitwise operation `binop` by narrowing the vector width of
1914// the operation. For example, it would convert a `<vscale x 16 x i1>
1915// and` into a `<vscale x 4 x i1> and`. This is profitable because
1916// to_svbool must zero the new lanes during widening, whereas
1917// from_svbool is free.
// See the block comment above: narrows a predicate bitwise op so the
// to_svbool feeding its governing predicate can be removed. The intrinsic
// signature line appears truncated in this view.
static std::optional<Instruction *>
  auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
  if (!BinOp)
    return std::nullopt;

  // Only the zeroing predicate logical operations are handled.
  auto IntrinsicID = BinOp->getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::aarch64_sve_and_z:
  case Intrinsic::aarch64_sve_bic_z:
  case Intrinsic::aarch64_sve_eor_z:
  case Intrinsic::aarch64_sve_nand_z:
  case Intrinsic::aarch64_sve_nor_z:
  case Intrinsic::aarch64_sve_orn_z:
  case Intrinsic::aarch64_sve_orr_z:
    break;
  default:
    return std::nullopt;
  }

  auto BinOpPred = BinOp->getOperand(0);
  auto BinOpOp1 = BinOp->getOperand(1);
  auto BinOpOp2 = BinOp->getOperand(2);

  // The governing predicate must itself be a widening to_svbool ...
  auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
  if (!PredIntr ||
      PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
    return std::nullopt;

  // ... whose source has exactly the narrow type this from_svbool produces.
  auto PredOp = PredIntr->getOperand(0);
  auto PredOpTy = cast<VectorType>(PredOp->getType());
  if (PredOpTy != II.getType())
    return std::nullopt;

  // Narrow both data operands, reusing one conversion when they are equal.
  SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
  auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
  NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  if (BinOpOp1 == BinOpOp2)
    NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  else
    NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
        Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));

  // Rebuild the same logical op at the narrow predicate width.
  auto NarrowedBinOp =
      IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
  return IC.replaceInstUsesWith(II, NarrowedBinOp);
}
1966
// Combine a convert.from.svbool by walking the chain of svbool conversions
// feeding it and replacing it with the earliest value of the same type, since
// intermediate widen/narrow pairs that never drop lanes are no-ops.
static std::optional<Instruction *>
  // If the reinterpret instruction operand is a PHI Node
  if (isa<PHINode>(II.getArgOperand(0)))
    return processPhiNode(IC, II);

  if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
    return BinOpCombine;

  // Ignore converts to/from svcount_t.
  if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
      isa<TargetExtType>(II.getType()))
    return std::nullopt;

  SmallVector<Instruction *, 32> CandidatesForRemoval;
  Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;

  const auto *IVTy = cast<VectorType>(II.getType());

  // Walk the chain of conversions.
  while (Cursor) {
    // If the type of the cursor has fewer lanes than the final result, zeroing
    // must take place, which breaks the equivalence chain.
    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
    if (CursorVTy->getElementCount().getKnownMinValue() <
        IVTy->getElementCount().getKnownMinValue())
      break;

    // If the cursor has the same type as I, it is a viable replacement.
    if (Cursor->getType() == IVTy)
      EarliestReplacement = Cursor;

    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);

    // If this is not an SVE conversion intrinsic, this is the end of the chain.
    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_to_svbool ||
                              IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_from_svbool))
      break;

    CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
    Cursor = IntrinsicCursor->getOperand(0);
  }

  // If no viable replacement in the conversion chain was found, there is
  // nothing to do.
  if (!EarliestReplacement)
    return std::nullopt;

  return IC.replaceInstUsesWith(II, EarliestReplacement);
}
2019
2020static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
2021 IntrinsicInst &II) {
2022 // svsel(ptrue, x, y) => x
2023 auto *OpPredicate = II.getOperand(0);
2024 if (isAllActivePredicate(OpPredicate))
2025 return IC.replaceInstUsesWith(II, II.getOperand(1));
2026
2027 auto Select =
2028 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
2029 return IC.replaceInstUsesWith(II, Select);
2030}
2031
// Fold predicated sve.dup into simpler IR when the predicate is known:
// an all-active predicate becomes a splat; a vl1 ptrue becomes an
// insertelement into the passthru at lane 0.
static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
                                                      IntrinsicInst &II) {
  Value *Pg = II.getOperand(1);

  // sve.dup(V, all_active, X) ==> splat(X)
  if (isAllActivePredicate(Pg)) {
    auto *RetTy = cast<ScalableVectorType>(II.getType());
    Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
                                                II.getArgOperand(2));
    return IC.replaceInstUsesWith(II, Splat);
  }

  // Bail unless the predicate matches the vl1 pattern (match head appears
  // truncated in this view).
                  m_SpecificInt(AArch64SVEPredPattern::vl1))))
    return std::nullopt;

  // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
  Value *Insert = IC.Builder.CreateInsertElement(
      II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
  return IC.replaceInstUsesWith(II, Insert);
}
2053
2054static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
2055 IntrinsicInst &II) {
2056 // Replace DupX with a regular IR splat.
2057 auto *RetTy = cast<ScalableVectorType>(II.getType());
2058 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2059 II.getArgOperand(0));
2060 Splat->takeName(&II);
2061 return IC.replaceInstUsesWith(II, Splat);
2062}
2063
// Fold an all-active sve.cmpne against zero of a dupq'd constant vector into
// a ptrue of the widest predicate granularity that reproduces the same
// predicate bit pattern. Bails (std::nullopt) whenever the pattern cannot be
// expressed exactly as a single ptrue.
static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();

  // Only handled when every lane participates.
  if (!isAllActivePredicate(II.getArgOperand(0)))
    return std::nullopt;

  // Check that we have a compare of zero..
  auto *SplatValue =
  if (!SplatValue || !SplatValue->isZero())
    return std::nullopt;

  // ..against a dupq
  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!DupQLane ||
      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
    return std::nullopt;

  // Where the dupq is a lane 0 replicate of a vector insert
  auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
  if (!DupQLaneIdx || !DupQLaneIdx->isZero())
    return std::nullopt;

  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
  if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
    return std::nullopt;

  // Where the vector insert is a fixed constant vector insert into undef at
  // index zero
  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
    return std::nullopt;

  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
    return std::nullopt;

  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
  if (!ConstVec)
    return std::nullopt;

  // The fixed vector must fill one 128-bit block of the scalable result.
  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
    return std::nullopt;

  unsigned NumElts = VecTy->getNumElements();
  unsigned PredicateBits = 0;

  // Expand intrinsic operands to a 16-bit byte level predicate
  for (unsigned I = 0; I < NumElts; ++I) {
    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
    if (!Arg)
      return std::nullopt;
    if (!Arg->isZero())
      PredicateBits |= 1 << (I * (16 / NumElts));
  }

  // If all bits are zero bail early with an empty predicate
  if (PredicateBits == 0) {
    auto *PFalse = Constant::getNullValue(II.getType());
    PFalse->takeName(&II);
    return IC.replaceInstUsesWith(II, PFalse);
  }

  // Calculate largest predicate type used (where byte predicate is largest)
  unsigned Mask = 8;
  for (unsigned I = 0; I < 16; ++I)
    if ((PredicateBits & (1 << I)) != 0)
      Mask |= (I % 8);

  // Isolate the lowest set bit: that is the element size (in bytes).
  unsigned PredSize = Mask & -Mask;
  auto *PredType = ScalableVectorType::get(
      Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));

  // Ensure all relevant bits are set
  for (unsigned I = 0; I < 16; I += PredSize)
    if ((PredicateBits & (1 << I)) == 0)
      return std::nullopt;

  // Materialise the equivalent ptrue at the derived granularity and cast it
  // back to the original predicate type via svbool.
  auto *PTruePat =
      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                           {PredType}, {PTruePat});
  auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
  auto *ConvertFromSVBool =
      IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
                                 {II.getType()}, {ConvertToSVBool});

  ConvertFromSVBool->takeName(&II);
  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
}
2156
// Simplify sve.lasta/sve.lastb: fold through splats and binops of splats, and
// turn extractions at a statically-known lane into extractelement.
static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *Pg = II.getArgOperand(0);
  Value *Vec = II.getArgOperand(1);
  auto IntrinsicID = II.getIntrinsicID();
  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;

  // lastX(splat(X)) --> X
  if (auto *SplatVal = getSplatValue(Vec))
    return IC.replaceInstUsesWith(II, SplatVal);

  // If x and/or y is a splat value then:
  // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
  Value *LHS, *RHS;
  if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
    if (isSplatValue(LHS) || isSplatValue(RHS)) {
      auto *OldBinOp = cast<BinaryOperator>(Vec);
      auto OpC = OldBinOp->getOpcode();
      auto *NewLHS =
          IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
      auto *NewRHS =
          IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
      return IC.replaceInstUsesWith(II, NewBinOp);
    }
  }

  // lasta with a null predicate wraps around to the first lane.
  auto *C = dyn_cast<Constant>(Pg);
  if (IsAfter && C && C->isNullValue()) {
    // The intrinsic is extracting lane 0 so use an extract instead.
    auto *IdxTy = Type::getInt64Ty(II.getContext());
    auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
    Extract->insertBefore(II.getIterator());
    Extract->takeName(&II);
    return IC.replaceInstUsesWith(II, Extract);
  }

  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
  if (!IntrPG)
    return std::nullopt;

  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();

  // Can the intrinsic's predicate be converted to a known constant index?
  unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
  if (!MinNumElts)
    return std::nullopt;

  unsigned Idx = MinNumElts - 1;
  // Increment the index if extracting the element after the last active
  // predicate element.
  if (IsAfter)
    ++Idx;

  // Ignore extracts whose index is larger than the known minimum vector
  // length. NOTE: This is an artificial constraint where we prefer to
  // maintain what the user asked for until an alternative is proven faster.
  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
  if (Idx >= PgVTy->getMinNumElements())
    return std::nullopt;

  // The intrinsic is extracting a fixed lane so use an extract instead.
  auto *IdxTy = Type::getInt64Ty(II.getContext());
  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
  Extract->insertBefore(II.getIterator());
  Extract->takeName(&II);
  return IC.replaceInstUsesWith(II, Extract);
}
2230
2231static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2232 IntrinsicInst &II) {
2233 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2234 // integer variant across a variety of micro-architectures. Replace scalar
2235 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2236 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2237 // depending on the micro-architecture, but has been observed as generally
2238 // being faster, particularly when the CLAST[AB] op is a loop-carried
2239 // dependency.
2240 Value *Pg = II.getArgOperand(0);
2241 Value *Fallback = II.getArgOperand(1);
2242 Value *Vec = II.getArgOperand(2);
2243 Type *Ty = II.getType();
2244
2245 if (!Ty->isIntegerTy())
2246 return std::nullopt;
2247
2248 Type *FPTy;
2249 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2250 default:
2251 return std::nullopt;
2252 case 16:
2253 FPTy = IC.Builder.getHalfTy();
2254 break;
2255 case 32:
2256 FPTy = IC.Builder.getFloatTy();
2257 break;
2258 case 64:
2259 FPTy = IC.Builder.getDoubleTy();
2260 break;
2261 }
2262
2263 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2264 auto *FPVTy = VectorType::get(
2265 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2266 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2267 auto *FPII = IC.Builder.CreateIntrinsic(
2268 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2269 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2270 return IC.replaceInstUsesWith(II, FPIItoInt);
2271}
2272
2273static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2274 IntrinsicInst &II) {
2275 LLVMContext &Ctx = II.getContext();
2276 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2277 // can work with RDFFR_PP for ptest elimination.
2278 auto *AllPat =
2279 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2280 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2281 {II.getType()}, {AllPat});
2282 auto *RDFFR =
2283 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2284 RDFFR->takeName(&II);
2285 return IC.replaceInstUsesWith(II, RDFFR);
2286}
2287
// Fold an SVE element-count intrinsic (cntb/cnth/cntw/cntd style, NumElts
// supplied by the caller) into either a scalable element-count expression for
// the "all" pattern, or a constant when the pattern's count is statically
// known to fit. The signature line appears truncated in this view.
static std::optional<Instruction *>
  const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();

  if (Pattern == AArch64SVEPredPattern::all) {
        II.getType(), ElementCount::getScalable(NumElts));
    Cnt->takeName(&II);
    return IC.replaceInstUsesWith(II, Cnt);
  }

  unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);

  // Only fold when the pattern's element count is known and guaranteed to be
  // covered by the intrinsic's element count.
  return MinNumElts && NumElts >= MinNumElts
             ? std::optional<Instruction *>(IC.replaceInstUsesWith(
                   II, ConstantInt::get(II.getType(), MinNumElts)))
             : std::nullopt;
}
2306
// In streaming mode the SME counting intrinsic has the same value as the
// corresponding SVE count with the "all" pattern, so it can be replaced
// outright. Name/signature and the replacement expression appear truncated
// in this view.
static std::optional<Instruction *>
                      const AArch64Subtarget *ST) {
  // The equivalence only holds while in streaming mode.
  if (!ST->isStreaming())
    return std::nullopt;

  // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
  // with SVEPredPattern::all
  Value *Cnt =
  Cnt->takeName(&II);
  return IC.replaceInstUsesWith(II, Cnt);
}
2320
// Canonicalise and simplify sve.ptest_{any,first,last} so later passes can
// eliminate the ptest via flag-setting instruction forms.
static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  Value *PgVal = II.getArgOperand(0);
  Value *OpVal = II.getArgOperand(1);

  // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
  // Later optimizations prefer this form.
  if (PgVal == OpVal &&
      (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
       II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
    Value *Ops[] = {PgVal, OpVal};
    Type *Tys[] = {PgVal->getType()};

    auto *PTest =
        IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }

  // NOTE(review): the definitions of Pg/Op (presumably dyn_casts of
  // PgVal/OpVal to IntrinsicInst) appear truncated in this view.

  if (!Pg || !Op)
    return std::nullopt;

  Intrinsic::ID OpIID = Op->getIntrinsicID();

  // Both operands widened from the same narrow type: ptest at the narrow
  // width instead and drop the conversions.
  if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
      Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
    Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
    Type *Tys[] = {Pg->getArgOperand(0)->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);

    PTest->takeName(&II);
    return IC.replaceInstUsesWith(II, PTest);
  }

  // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
  // Later optimizations may rewrite sequence to use the flag-setting variant
  // of instruction X to remove PTEST.
  if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
      ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
       (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
       (OpIID == Intrinsic::aarch64_sve_and_z) ||
       (OpIID == Intrinsic::aarch64_sve_bic_z) ||
       (OpIID == Intrinsic::aarch64_sve_eor_z) ||
       (OpIID == Intrinsic::aarch64_sve_nand_z) ||
       (OpIID == Intrinsic::aarch64_sve_nor_z) ||
       (OpIID == Intrinsic::aarch64_sve_orn_z) ||
       (OpIID == Intrinsic::aarch64_sve_orr_z))) {
    Value *Ops[] = {Pg->getArgOperand(0), Pg};
    Type *Tys[] = {Pg->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }

  return std::nullopt;
}
2388
// Fuse a MulOpc intrinsic feeding one operand of II into the single FuseOpc
// multiply-accumulate intrinsic. MergeIntoAddendOp selects which of II's data
// operands is the addend (operand 1) versus the multiply (operand 2).
// Signature and match lines appear truncated in this view.
template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
                                bool MergeIntoAddendOp) {
  Value *P = II.getOperand(0);
  Value *MulOp0, *MulOp1, *AddendOp, *Mul;
  if (MergeIntoAddendOp) {
    AddendOp = II.getOperand(1);
    Mul = II.getOperand(2);
  } else {
    AddendOp = II.getOperand(2);
    Mul = II.getOperand(1);
  }

  // The multiply must share II's governing predicate (match head truncated).
                  m_Value(MulOp1))))
    return std::nullopt;

  // Do not duplicate the multiply if it has other users.
  if (!Mul->hasOneUse())
    return std::nullopt;

  Instruction *FMFSource = nullptr;
  if (II.getType()->isFPOrFPVectorTy()) {
    llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
    // Stop the combine when the flags on the inputs differ in case dropping
    // flags would lead to us missing out on more beneficial optimizations.
    if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
      return std::nullopt;
    if (!FAddFlags.allowContract())
      return std::nullopt;
    FMFSource = &II;
  }

  // Operand order of the fused intrinsic depends on which side was the addend.
  CallInst *Res;
  if (MergeIntoAddendOp)
    Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
                                     {P, AddendOp, MulOp0, MulOp1}, FMFSource);
  else
    Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
                                     {P, MulOp0, MulOp1, AddendOp}, FMFSource);

  return IC.replaceInstUsesWith(II, Res);
}
2432
// Lower a predicated SVE load intrinsic to either a plain load (all-active
// predicate) or a generic masked.load with a zero passthru. Signature line
// appears truncated in this view.
static std::optional<Instruction *>
  Value *Pred = II.getOperand(0);
  Value *PtrOp = II.getOperand(1);
  Type *VecTy = II.getType();

  // All lanes active: an ordinary load is equivalent.
  if (isAllActivePredicate(Pred)) {
    LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
    Load->copyMetadata(II);
    return IC.replaceInstUsesWith(II, Load);
  }

  // Otherwise use the target-independent masked load, zeroing inactive lanes.
  CallInst *MaskedLoad =
      IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
                                  Pred, ConstantAggregateZero::get(VecTy));
  MaskedLoad->copyMetadata(II);
  return IC.replaceInstUsesWith(II, MaskedLoad);
}
2451
// Lower a predicated SVE store intrinsic to either a plain store (all-active
// predicate) or a generic masked.store. Signature line appears truncated in
// this view.
static std::optional<Instruction *>
  Value *VecOp = II.getOperand(0);
  Value *Pred = II.getOperand(1);
  Value *PtrOp = II.getOperand(2);

  // All lanes active: an ordinary store is equivalent.
  if (isAllActivePredicate(Pred)) {
    StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
    Store->copyMetadata(II);
    return IC.eraseInstFromFunction(II);
  }

  CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
      VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
  MaskedStore->copyMetadata(II);
  return IC.eraseInstFromFunction(II);
}
2469
  // Map an unpredicated ("_u") SVE FP intrinsic onto the equivalent IR binary
  // opcode; BinaryOpsEnd acts as the "no mapping" sentinel. (The function
  // signature line appears truncated in this view.)
  switch (Intrinsic) {
  case Intrinsic::aarch64_sve_fmul_u:
    return Instruction::BinaryOps::FMul;
  case Intrinsic::aarch64_sve_fadd_u:
    return Instruction::BinaryOps::FAdd;
  case Intrinsic::aarch64_sve_fsub_u:
    return Instruction::BinaryOps::FSub;
  default:
    return Instruction::BinaryOpsEnd;
  }
}
2482
// Replace an all-active predicated SVE FP binary intrinsic with the plain IR
// binary operator, preserving fast-math flags. Signature line appears
// truncated in this view.
static std::optional<Instruction *>
  // Bail due to missing support for ISD::STRICT_ scalable vector operations.
  if (II.isStrictFP())
    return std::nullopt;

  auto *OpPredicate = II.getOperand(0);
  auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
  if (BinOpCode == Instruction::BinaryOpsEnd ||
      !isAllActivePredicate(OpPredicate))
    return std::nullopt;
  auto BinOp = IC.Builder.CreateBinOpFMF(
      BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
  return IC.replaceInstUsesWith(II, BinOp);
}
2498
2499static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2500 IntrinsicInst &II) {
2501 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2502 Intrinsic::aarch64_sve_mla>(
2503 IC, II, true))
2504 return MLA;
2505 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2506 Intrinsic::aarch64_sve_mad>(
2507 IC, II, false))
2508 return MAD;
2509 return std::nullopt;
2510}
2511
// Fuse a predicated/unpredicated FP multiply feeding this add into fmla/fmad.
// Signature line appears truncated in this view.
static std::optional<Instruction *>
  if (auto FMLA =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmla>(IC, II,
                                                                         true))
    return FMLA;
  if (auto FMAD =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmad>(IC, II,
                                                                         false))
    return FMAD;
  if (auto FMLA =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                            Intrinsic::aarch64_sve_fmla>(IC, II,
                                                                         true))
    return FMLA;
  return std::nullopt;
}
2531
// Fuse an FP multiply feeding this add into fmla/fmad/fmla_u; failing that,
// fall back to the generic all-active binop lowering. Signature line appears
// truncated in this view.
static std::optional<Instruction *>
  if (auto FMLA =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmla>(IC, II,
                                                                         true))
    return FMLA;
  if (auto FMAD =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmad>(IC, II,
                                                                         false))
    return FMAD;
  if (auto FMLA_U =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                            Intrinsic::aarch64_sve_fmla_u>(
              IC, II, true))
    return FMLA_U;
  return instCombineSVEVectorBinOp(IC, II);
}
2551
// Fuse an FP multiply feeding this subtract into fmls/fnmsb. Signature line
// appears truncated in this view.
static std::optional<Instruction *>
  if (auto FMLS =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmls>(IC, II,
                                                                         true))
    return FMLS;
  if (auto FMSB =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fnmsb>(
              IC, II, false))
    return FMSB;
  if (auto FMLS =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                            Intrinsic::aarch64_sve_fmls>(IC, II,
                                                                         true))
    return FMLS;
  return std::nullopt;
}
2571
// Fuse an FP multiply feeding this subtract into fmls/fnmsb/fmls_u; failing
// that, fall back to the generic all-active binop lowering. Signature line
// appears truncated in this view.
static std::optional<Instruction *>
  if (auto FMLS =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmls>(IC, II,
                                                                         true))
    return FMLS;
  if (auto FMSB =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fnmsb>(
              IC, II, false))
    return FMSB;
  if (auto FMLS_U =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                            Intrinsic::aarch64_sve_fmls_u>(
              IC, II, true))
    return FMLS_U;
  return instCombineSVEVectorBinOp(IC, II);
}
2591
2592static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2593 IntrinsicInst &II) {
2594 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2595 Intrinsic::aarch64_sve_mls>(
2596 IC, II, true))
2597 return MLS;
2598 return std::nullopt;
2599}
2600
2601static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2602 IntrinsicInst &II) {
2603 Value *UnpackArg = II.getArgOperand(0);
2604 auto *RetTy = cast<ScalableVectorType>(II.getType());
2605 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2606 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2607
2608 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2609 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2610 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2611 ScalarArg =
2612 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2613 Value *NewVal =
2614 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2615 NewVal->takeName(&II);
2616 return IC.replaceInstUsesWith(II, NewVal);
2617 }
2618
2619 return std::nullopt;
2620}
2621static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2622 IntrinsicInst &II) {
2623 auto *OpVal = II.getOperand(0);
2624 auto *OpIndices = II.getOperand(1);
2625 VectorType *VTy = cast<VectorType>(II.getType());
2626
2627 // Check whether OpIndices is a constant splat value < minimal element count
2628 // of result.
2629 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2630 if (!SplatValue ||
2631 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2632 return std::nullopt;
2633
2634 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2635 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2636 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2637 auto *VectorSplat =
2638 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2639
2640 VectorSplat->takeName(&II);
2641 return IC.replaceInstUsesWith(II, VectorSplat);
2642}
2643
// Fold uzp1 of the two svbool-converted halves of a predicate pair into a
// direct concatenation built with insertvector. Parts of the match/type
// conditions appear truncated in this view.
static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *A, *B;
  Type *RetTy = II.getType();
  constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
  constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;

  // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
  // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
  if ((match(II.getArgOperand(0),
       match(II.getArgOperand(1),
      (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
       match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
    auto *TyA = cast<ScalableVectorType>(A->getType());
    if (TyA == B->getType() &&
      // Build <A, B>: insert A at element 0, then B immediately after A.
      auto *SubVec = IC.Builder.CreateInsertVector(
          RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
      auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
                                                      TyA->getMinNumElements());
      ConcatVec->takeName(&II);
      return IC.replaceInstUsesWith(II, ConcatVec);
    }
  }

  return std::nullopt;
}
2673
// zip of matching uzp halves reconstructs one of the original vectors.
// The inner match patterns appear truncated in this view.
static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
                                                      IntrinsicInst &II) {
  // zip1(uzp1(A, B), uzp2(A, B)) --> A
  // zip2(uzp1(A, B), uzp2(A, B)) --> B
  Value *A, *B;
  if (match(II.getArgOperand(0),
                        m_Specific(A), m_Specific(B))))
    // zip1 yields the first original vector, zip2 the second.
    return IC.replaceInstUsesWith(
        II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));

  return std::nullopt;
}
2688
// Rewrite a unit-stride gather as a contiguous masked load. Signature and
// match lines appear truncated in this view.
static std::optional<Instruction *>
  Value *Mask = II.getOperand(0);
  Value *BasePtr = II.getOperand(1);
  Value *Index = II.getOperand(2);
  Type *Ty = II.getType();
  Value *PassThru = ConstantAggregateZero::get(Ty);

  // Contiguous gather => masked load.
  // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
  // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
  Value *IndexBase;
                  m_Value(IndexBase), m_SpecificInt(1)))) {
    Align Alignment =
        BasePtr->getPointerAlignment(II.getDataLayout());

    // The base-plus-starting-index address of the contiguous region.
    Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
                                      BasePtr, IndexBase);
    CallInst *MaskedLoad =
        IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
    MaskedLoad->takeName(&II);
    return IC.replaceInstUsesWith(II, MaskedLoad);
  }

  return std::nullopt;
}
2716
2717static std::optional<Instruction *>
2719 Value *Val = II.getOperand(0);
2720 Value *Mask = II.getOperand(1);
2721 Value *BasePtr = II.getOperand(2);
2722 Value *Index = II.getOperand(3);
2723 Type *Ty = Val->getType();
2724
2725 // Contiguous scatter => masked store.
2726 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2727 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2728 Value *IndexBase;
2730 m_Value(IndexBase), m_SpecificInt(1)))) {
2731 Align Alignment =
2732 BasePtr->getPointerAlignment(II.getDataLayout());
2733
2734 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2735 BasePtr, IndexBase);
2736 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2737
2738 return IC.eraseInstFromFunction(II);
2739 }
2740
2741 return std::nullopt;
2742}
2743
2744static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2745 IntrinsicInst &II) {
2747 Value *Pred = II.getOperand(0);
2748 Value *Vec = II.getOperand(1);
2749 Value *DivVec = II.getOperand(2);
2750
2751 Value *SplatValue = getSplatValue(DivVec);
2752 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2753 if (!SplatConstantInt)
2754 return std::nullopt;
2755
2756 APInt Divisor = SplatConstantInt->getValue();
2757 const int64_t DivisorValue = Divisor.getSExtValue();
2758 if (DivisorValue == -1)
2759 return std::nullopt;
2760 if (DivisorValue == 1)
2761 IC.replaceInstUsesWith(II, Vec);
2762
2763 if (Divisor.isPowerOf2()) {
2764 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2765 auto ASRD = IC.Builder.CreateIntrinsic(
2766 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2767 return IC.replaceInstUsesWith(II, ASRD);
2768 }
2769 if (Divisor.isNegatedPowerOf2()) {
2770 Divisor.negate();
2771 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2772 auto ASRD = IC.Builder.CreateIntrinsic(
2773 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2774 auto NEG = IC.Builder.CreateIntrinsic(
2775 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2776 return IC.replaceInstUsesWith(II, NEG);
2777 }
2778
2779 return std::nullopt;
2780}
2781
2782bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2783 size_t VecSize = Vec.size();
2784 if (VecSize == 1)
2785 return true;
2786 if (!isPowerOf2_64(VecSize))
2787 return false;
2788 size_t HalfVecSize = VecSize / 2;
2789
2790 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2791 RHS != Vec.end(); LHS++, RHS++) {
2792 if (*LHS != nullptr && *RHS != nullptr) {
2793 if (*LHS == *RHS)
2794 continue;
2795 else
2796 return false;
2797 }
2798 if (!AllowPoison)
2799 return false;
2800 if (*LHS == nullptr && *RHS != nullptr)
2801 *LHS = *RHS;
2802 }
2803
2804 Vec.resize(HalfVecSize);
2805 SimplifyValuePattern(Vec, AllowPoison);
2806 return true;
2807}
2808
2809// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2810// to dupqlane(f64(C)) where C is A concatenated with B
2811static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2812 IntrinsicInst &II) {
2813 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2814 if (!match(II.getOperand(0),
2816 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2817 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2818 return std::nullopt;
2819 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2820
2821 // Insert the scalars into a container ordered by InsertElement index
2822 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2823 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2824 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2825 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2826 CurrentInsertElt = InsertElt->getOperand(0);
2827 }
2828
2829 bool AllowPoison =
2830 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2831 if (!SimplifyValuePattern(Elts, AllowPoison))
2832 return std::nullopt;
2833
2834 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2835 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2836 for (size_t I = 0; I < Elts.size(); I++) {
2837 if (Elts[I] == nullptr)
2838 continue;
2839 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2840 IC.Builder.getInt64(I));
2841 }
2842 if (InsertEltChain == nullptr)
2843 return std::nullopt;
2844
2845 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2846 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2847 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2848 // be narrowed back to the original type.
2849 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2850 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2851 IIScalableTy->getMinNumElements() /
2852 PatternWidth;
2853
2854 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2855 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2856 auto *WideShuffleMaskTy =
2857 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2858
2859 auto InsertSubvector = IC.Builder.CreateInsertVector(
2860 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2861 uint64_t(0));
2862 auto WideBitcast =
2863 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2864 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2865 auto WideShuffle = IC.Builder.CreateShuffleVector(
2866 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2867 auto NarrowBitcast =
2868 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2869
2870 return IC.replaceInstUsesWith(II, NarrowBitcast);
2871}
2872
2873static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2874 IntrinsicInst &II) {
2875 Value *A = II.getArgOperand(0);
2876 Value *B = II.getArgOperand(1);
2877 if (A == B)
2878 return IC.replaceInstUsesWith(II, A);
2879
2880 return std::nullopt;
2881}
2882
2883static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2884 IntrinsicInst &II) {
2885 Value *Pred = II.getOperand(0);
2886 Value *Vec = II.getOperand(1);
2887 Value *Shift = II.getOperand(2);
2888
2889 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2890 Value *AbsPred, *MergedValue;
2892 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2894 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2895
2896 return std::nullopt;
2897
2898 // Transform is valid if any of the following are true:
2899 // * The ABS merge value is an undef or non-negative
2900 // * The ABS predicate is all active
2901 // * The ABS predicate and the SRSHL predicates are the same
2902 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2903 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2904 return std::nullopt;
2905
2906 // Only valid when the shift amount is non-negative, otherwise the rounding
2907 // behaviour of SRSHL cannot be ignored.
2908 if (!match(Shift, m_NonNegative()))
2909 return std::nullopt;
2910
2911 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2912 {II.getType()}, {Pred, Vec, Shift});
2913
2914 return IC.replaceInstUsesWith(II, LSL);
2915}
2916
2917static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2918 IntrinsicInst &II) {
2919 Value *Vec = II.getOperand(0);
2920
2921 if (getSplatValue(Vec) == II.getOperand(1))
2922 return IC.replaceInstUsesWith(II, Vec);
2923
2924 return std::nullopt;
2925}
2926
2927static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2928 IntrinsicInst &II) {
2929 // If this barrier is post-dominated by identical one we can remove it
2930 auto *NI = II.getNextNode();
2931 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2932 auto CanSkipOver = [](Instruction *I) {
2933 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2934 };
2935 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2936 auto *NIBB = NI->getParent();
2937 NI = NI->getNextNode();
2938 if (!NI) {
2939 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2940 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2941 else
2942 break;
2943 }
2944 }
2945 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2946 if (NextII && II.isIdenticalTo(NextII))
2947 return IC.eraseInstFromFunction(II);
2948
2949 return std::nullopt;
2950}
2951
2952static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2953 IntrinsicInst &II) {
2954 return IC.replaceInstUsesWith(
2955 II,
2956 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2957 {II.getType(), II.getOperand(0)->getType()},
2958 {II.getOperand(0), II.getOperand(1)}));
2959}
2960
2961static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2962 IntrinsicInst &II) {
2964 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2965 return std::nullopt;
2966}
2967
2968static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2970 unsigned NumBits) {
2971 Value *Passthru = II.getOperand(0);
2972 Value *Pg = II.getOperand(1);
2973 Value *Op = II.getOperand(2);
2974
2975 // Convert UXT[BHW] to AND.
2976 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2977 auto *Ty = cast<VectorType>(II.getType());
2978 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2979 auto *Mask = ConstantInt::get(Ty, MaskValue);
2980 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2981 {Pg, Op, Mask});
2982 return IC.replaceInstUsesWith(II, And);
2983 }
2984
2985 return std::nullopt;
2986}
2987
2988static std::optional<Instruction *>
2990 SMEAttrs FnSMEAttrs(*II.getFunction());
2991 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2992 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2993 return IC.replaceInstUsesWith(
2994 II, ConstantInt::getBool(II.getType(), IsStreaming));
2995 return std::nullopt;
2996}
2997
2998std::optional<Instruction *>
3000 IntrinsicInst &II) const {
3002 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
3003 return I;
3004
3005 Intrinsic::ID IID = II.getIntrinsicID();
3006 switch (IID) {
3007 default:
3008 break;
3009 case Intrinsic::aarch64_dmb:
3010 return instCombineDMB(IC, II);
3011 case Intrinsic::aarch64_neon_fmaxnm:
3012 case Intrinsic::aarch64_neon_fminnm:
3013 return instCombineMaxMinNM(IC, II);
3014 case Intrinsic::aarch64_sve_convert_from_svbool:
3015 return instCombineConvertFromSVBool(IC, II);
3016 case Intrinsic::aarch64_sve_dup:
3017 return instCombineSVEDup(IC, II);
3018 case Intrinsic::aarch64_sve_dup_x:
3019 return instCombineSVEDupX(IC, II);
3020 case Intrinsic::aarch64_sve_cmpne:
3021 case Intrinsic::aarch64_sve_cmpne_wide:
3022 return instCombineSVECmpNE(IC, II);
3023 case Intrinsic::aarch64_sve_rdffr:
3024 return instCombineRDFFR(IC, II);
3025 case Intrinsic::aarch64_sve_lasta:
3026 case Intrinsic::aarch64_sve_lastb:
3027 return instCombineSVELast(IC, II);
3028 case Intrinsic::aarch64_sve_clasta_n:
3029 case Intrinsic::aarch64_sve_clastb_n:
3030 return instCombineSVECondLast(IC, II);
3031 case Intrinsic::aarch64_sve_cntd:
3032 return instCombineSVECntElts(IC, II, 2);
3033 case Intrinsic::aarch64_sve_cntw:
3034 return instCombineSVECntElts(IC, II, 4);
3035 case Intrinsic::aarch64_sve_cnth:
3036 return instCombineSVECntElts(IC, II, 8);
3037 case Intrinsic::aarch64_sve_cntb:
3038 return instCombineSVECntElts(IC, II, 16);
3039 case Intrinsic::aarch64_sme_cntsd:
3040 return instCombineSMECntsd(IC, II, ST);
3041 case Intrinsic::aarch64_sve_ptest_any:
3042 case Intrinsic::aarch64_sve_ptest_first:
3043 case Intrinsic::aarch64_sve_ptest_last:
3044 return instCombineSVEPTest(IC, II);
3045 case Intrinsic::aarch64_sve_fadd:
3046 return instCombineSVEVectorFAdd(IC, II);
3047 case Intrinsic::aarch64_sve_fadd_u:
3048 return instCombineSVEVectorFAddU(IC, II);
3049 case Intrinsic::aarch64_sve_fmul_u:
3050 return instCombineSVEVectorBinOp(IC, II);
3051 case Intrinsic::aarch64_sve_fsub:
3052 return instCombineSVEVectorFSub(IC, II);
3053 case Intrinsic::aarch64_sve_fsub_u:
3054 return instCombineSVEVectorFSubU(IC, II);
3055 case Intrinsic::aarch64_sve_add:
3056 return instCombineSVEVectorAdd(IC, II);
3057 case Intrinsic::aarch64_sve_add_u:
3058 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3059 Intrinsic::aarch64_sve_mla_u>(
3060 IC, II, true);
3061 case Intrinsic::aarch64_sve_sub:
3062 return instCombineSVEVectorSub(IC, II);
3063 case Intrinsic::aarch64_sve_sub_u:
3064 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3065 Intrinsic::aarch64_sve_mls_u>(
3066 IC, II, true);
3067 case Intrinsic::aarch64_sve_tbl:
3068 return instCombineSVETBL(IC, II);
3069 case Intrinsic::aarch64_sve_uunpkhi:
3070 case Intrinsic::aarch64_sve_uunpklo:
3071 case Intrinsic::aarch64_sve_sunpkhi:
3072 case Intrinsic::aarch64_sve_sunpklo:
3073 return instCombineSVEUnpack(IC, II);
3074 case Intrinsic::aarch64_sve_uzp1:
3075 return instCombineSVEUzp1(IC, II);
3076 case Intrinsic::aarch64_sve_zip1:
3077 case Intrinsic::aarch64_sve_zip2:
3078 return instCombineSVEZip(IC, II);
3079 case Intrinsic::aarch64_sve_ld1_gather_index:
3080 return instCombineLD1GatherIndex(IC, II);
3081 case Intrinsic::aarch64_sve_st1_scatter_index:
3082 return instCombineST1ScatterIndex(IC, II);
3083 case Intrinsic::aarch64_sve_ld1:
3084 return instCombineSVELD1(IC, II, DL);
3085 case Intrinsic::aarch64_sve_st1:
3086 return instCombineSVEST1(IC, II, DL);
3087 case Intrinsic::aarch64_sve_sdiv:
3088 return instCombineSVESDIV(IC, II);
3089 case Intrinsic::aarch64_sve_sel:
3090 return instCombineSVESel(IC, II);
3091 case Intrinsic::aarch64_sve_srshl:
3092 return instCombineSVESrshl(IC, II);
3093 case Intrinsic::aarch64_sve_dupq_lane:
3094 return instCombineSVEDupqLane(IC, II);
3095 case Intrinsic::aarch64_sve_insr:
3096 return instCombineSVEInsr(IC, II);
3097 case Intrinsic::aarch64_sve_whilelo:
3098 return instCombineWhilelo(IC, II);
3099 case Intrinsic::aarch64_sve_ptrue:
3100 return instCombinePTrue(IC, II);
3101 case Intrinsic::aarch64_sve_uxtb:
3102 return instCombineSVEUxt(IC, II, 8);
3103 case Intrinsic::aarch64_sve_uxth:
3104 return instCombineSVEUxt(IC, II, 16);
3105 case Intrinsic::aarch64_sve_uxtw:
3106 return instCombineSVEUxt(IC, II, 32);
3107 case Intrinsic::aarch64_sme_in_streaming_mode:
3108 return instCombineInStreamingMode(IC, II);
3109 }
3110
3111 return std::nullopt;
3112}
3113
3115 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3116 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3117 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3118 SimplifyAndSetOp) const {
3119 switch (II.getIntrinsicID()) {
3120 default:
3121 break;
3122 case Intrinsic::aarch64_neon_fcvtxn:
3123 case Intrinsic::aarch64_neon_rshrn:
3124 case Intrinsic::aarch64_neon_sqrshrn:
3125 case Intrinsic::aarch64_neon_sqrshrun:
3126 case Intrinsic::aarch64_neon_sqshrn:
3127 case Intrinsic::aarch64_neon_sqshrun:
3128 case Intrinsic::aarch64_neon_sqxtn:
3129 case Intrinsic::aarch64_neon_sqxtun:
3130 case Intrinsic::aarch64_neon_uqrshrn:
3131 case Intrinsic::aarch64_neon_uqshrn:
3132 case Intrinsic::aarch64_neon_uqxtn:
3133 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3134 break;
3135 }
3136
3137 return std::nullopt;
3138}
3139
3141 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3143}
3144
3147 switch (K) {
3149 return TypeSize::getFixed(64);
3151 if (ST->useSVEForFixedLengthVectors() &&
3152 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3153 return TypeSize::getFixed(
3154 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3155 else if (ST->isNeonAvailable())
3156 return TypeSize::getFixed(128);
3157 else
3158 return TypeSize::getFixed(0);
3160 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3162 return TypeSize::getScalable(128);
3163 else
3164 return TypeSize::getScalable(0);
3165 }
3166 llvm_unreachable("Unsupported register kind");
3167}
3168
3169bool AArch64TTIImpl::isSingleExtWideningInstruction(
3170 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3171 Type *SrcOverrideTy) const {
3172 // A helper that returns a vector type from the given type. The number of
3173 // elements in type Ty determines the vector width.
3174 auto toVectorTy = [&](Type *ArgTy) {
3175 return VectorType::get(ArgTy->getScalarType(),
3176 cast<VectorType>(DstTy)->getElementCount());
3177 };
3178
3179 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3180 // i32, i64]. SVE doesn't generally have the same set of instructions to
3181 // perform an extend with the add/sub/mul. There are SMULLB style
3182 // instructions, but they operate on top/bottom, requiring some sort of lane
3183 // interleaving to be used with zext/sext.
3184 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3185 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3186 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3187 return false;
3188
3189 Type *SrcTy = SrcOverrideTy;
3190 switch (Opcode) {
3191 case Instruction::Add: // UADDW(2), SADDW(2).
3192 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3193 // The second operand needs to be an extend
3194 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3195 if (!SrcTy)
3196 SrcTy =
3197 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3198 break;
3199 }
3200
3201 if (Opcode == Instruction::Sub)
3202 return false;
3203
3204 // UADDW(2), SADDW(2) can be commutted.
3205 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3206 if (!SrcTy)
3207 SrcTy =
3208 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3209 break;
3210 }
3211 return false;
3212 }
3213 default:
3214 return false;
3215 }
3216
3217 // Legalize the destination type and ensure it can be used in a widening
3218 // operation.
3219 auto DstTyL = getTypeLegalizationCost(DstTy);
3220 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3221 return false;
3222
3223 // Legalize the source type and ensure it can be used in a widening
3224 // operation.
3225 assert(SrcTy && "Expected some SrcTy");
3226 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3227 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3228 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3229 return false;
3230
3231 // Get the total number of vector elements in the legalized types.
3232 InstructionCost NumDstEls =
3233 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3234 InstructionCost NumSrcEls =
3235 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3236
3237 // Return true if the legalized types have the same number of vector elements
3238 // and the destination element type size is twice that of the source type.
3239 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3240}
3241
3242Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3244 Type *SrcOverrideTy) const {
3245 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3246 Opcode != Instruction::Mul)
3247 return nullptr;
3248
3249 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3250 // i32, i64]. SVE doesn't generally have the same set of instructions to
3251 // perform an extend with the add/sub/mul. There are SMULLB style
3252 // instructions, but they operate on top/bottom, requiring some sort of lane
3253 // interleaving to be used with zext/sext.
3254 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3255 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3256 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3257 return nullptr;
3258
3259 auto getScalarSizeWithOverride = [&](const Value *V) {
3260 if (SrcOverrideTy)
3261 return SrcOverrideTy->getScalarSizeInBits();
3262 return cast<Instruction>(V)
3263 ->getOperand(0)
3264 ->getType()
3265 ->getScalarSizeInBits();
3266 };
3267
3268 unsigned MaxEltSize = 0;
3269 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3270 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3271 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3272 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3273 MaxEltSize = std::max(EltSize0, EltSize1);
3274 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3275 isa<SExtInst, ZExtInst>(Args[1])) {
3276 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3277 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3278 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3279 // enough.
3280 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3281 return nullptr;
3282 MaxEltSize = DstEltSize / 2;
3283 } else if (Opcode == Instruction::Mul &&
3284 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3285 // If one of the operands is a Zext and the other has enough zero bits
3286 // to be treated as unsigned, we can still generate a umull, meaning the
3287 // zext is free.
3288 KnownBits Known =
3289 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3290 if (Args[0]->getType()->getScalarSizeInBits() -
3291 Known.Zero.countLeadingOnes() >
3292 DstTy->getScalarSizeInBits() / 2)
3293 return nullptr;
3294
3295 MaxEltSize =
3296 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3297 } else
3298 return nullptr;
3299
3300 if (MaxEltSize * 2 > DstEltSize)
3301 return nullptr;
3302
3303 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3304 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3305 return nullptr;
3306 return ExtTy;
3307}
3308
3309// s/urhadd instructions implement the following pattern, making the
3310// extends free:
3311// %x = add ((zext i8 -> i16), 1)
3312// %y = (zext i8 -> i16)
3313// trunc i16 (lshr (add %x, %y), 1) -> i8
3314//
3316 Type *Src) const {
3317 // The source should be a legal vector type.
3318 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3319 (Src->isScalableTy() && !ST->hasSVE2()))
3320 return false;
3321
3322 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3323 return false;
3324
3325 // Look for trunc/shl/add before trying to match the pattern.
3326 const Instruction *Add = ExtUser;
3327 auto *AddUser =
3328 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3329 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3330 Add = AddUser;
3331
3332 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3333 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3334 return false;
3335
3336 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3337 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3338 Src->getScalarSizeInBits() !=
3339 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3340 return false;
3341
3342 // Try to match the whole pattern. Ext could be either the first or second
3343 // m_ZExtOrSExt matched.
3344 Instruction *Ex1, *Ex2;
3345 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3346 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3347 return false;
3348
3349 // Ensure both extends are of the same type
3350 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3351 Ex1->getOpcode() == Ex2->getOpcode())
3352 return true;
3353
3354 return false;
3355}
3356
3358 Type *Src,
3361 const Instruction *I) const {
3362 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3363 assert(ISD && "Invalid opcode");
3364 // If the cast is observable, and it is used by a widening instruction (e.g.,
3365 // uaddl, saddw, etc.), it may be free.
3366 if (I && I->hasOneUser()) {
3367 auto *SingleUser = cast<Instruction>(*I->user_begin());
3368 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3369 if (Type *ExtTy = isBinExtWideningInstruction(
3370 SingleUser->getOpcode(), Dst, Operands,
3371 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3372 // The cost from Src->Src*2 needs to be added if required, the cost from
3373 // Src*2->ExtTy is free.
3374 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3375 Type *DoubleSrcTy =
3376 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3377 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3379 }
3380
3381 return 0;
3382 }
3383
3384 if (isSingleExtWideningInstruction(
3385 SingleUser->getOpcode(), Dst, Operands,
3386 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3387 // For adds only count the second operand as free if both operands are
3388 // extends but not the same operation. (i.e both operands are not free in
3389 // add(sext, zext)).
3390 if (SingleUser->getOpcode() == Instruction::Add) {
3391 if (I == SingleUser->getOperand(1) ||
3392 (isa<CastInst>(SingleUser->getOperand(1)) &&
3393 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3394 return 0;
3395 } else {
3396 // Others are free so long as isSingleExtWideningInstruction
3397 // returned true.
3398 return 0;
3399 }
3400 }
3401
3402 // The cast will be free for the s/urhadd instructions
3403 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3404 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3405 return 0;
3406 }
3407
3408 EVT SrcTy = TLI->getValueType(DL, Src);
3409 EVT DstTy = TLI->getValueType(DL, Dst);
3410
3411 if (!SrcTy.isSimple() || !DstTy.isSimple())
3412 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3413
3414 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3415 // we use fcvtx under SVE2. Give them invalid costs.
3416 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3417 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3418 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3420
3421 static const TypeConversionCostTblEntry BF16Tbl[] = {
3422 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3423 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3424 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3425 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3426 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3427 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3428 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3429 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3430 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3431 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3432 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3433 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3434 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3435 };
3436
3437 if (ST->hasBF16())
3438 if (const auto *Entry = ConvertCostTableLookup(
3439 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3440 return Entry->Cost;
3441
3442 // We have to estimate a cost of fixed length operation upon
3443 // SVE registers(operations) with the number of registers required
3444 // for a fixed type to be represented upon SVE registers.
3445 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3446 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3447 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3448 ST->useSVEForFixedLengthVectors(WiderTy)) {
3449 std::pair<InstructionCost, MVT> LT =
3450 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3451 unsigned NumElements =
3452 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3453 return LT.first *
3455 Opcode,
3456 ScalableVectorType::get(Dst->getScalarType(), NumElements),
3457 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3458 CostKind, I);
3459 }
3460
3461 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3462 // The cost of unpacking twice is artificially increased for now in order
3463 // to avoid regressions against NEON, which will use tbl instructions directly
3464 // instead of multiple layers of [s|u]unpk[lo|hi].
3465 // We use the unpacks in cases where the destination type is illegal and
3466 // requires splitting of the input, even if the input type itself is legal.
3467 const unsigned int SVE_EXT_COST = 1;
3468 const unsigned int SVE_FCVT_COST = 1;
3469 const unsigned int SVE_UNPACK_ONCE = 4;
3470 const unsigned int SVE_UNPACK_TWICE = 16;
3471
3472 static const TypeConversionCostTblEntry ConversionTbl[] = {
3473 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3474 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3475 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3476 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3477 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3478 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3479 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3480 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3481 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3482 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3483 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3484 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3485 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3486 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3487 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3488 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3489 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3490 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3491 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3492 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3493
3494 // Truncations on nxvmiN
3495 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3496 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3497 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3498 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3499 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3500 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3501 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3502 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3503 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3504 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3505 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3506 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3507 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3508 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3509 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3510 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3511 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3512 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3513 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3514 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3515 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3516 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3517 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3518 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3519 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3520 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3521 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3522 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3523 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3524 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3525 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3526 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3527 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3528
3529 // The number of shll instructions for the extension.
3530 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3531 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3532 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3533 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3534 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3535 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3536 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3537 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3538 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3539 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3540 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3541 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3542 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3543 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3544 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3545 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3546
3547 // FP Ext and trunc
3548 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3549 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3550 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3551 // FP16
3552 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3553 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3554 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3555 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3556 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3557 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3558 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3559 // BF16 (uses shift)
3560 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3561 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3562 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3563 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3564 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3565 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3566 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3567 // FP Ext and trunc
3568 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3569 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3570 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3571 // FP16
3572 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3573 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3574 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3575 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3576 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3577 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3578 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3579 // BF16 (more complex, with +bf16 is handled above)
3580 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3581 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3582 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3583 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3584 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3585 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3586 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3587 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3588
3589 // LowerVectorINT_TO_FP:
3590 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3591 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3592 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3593 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3594 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3595 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3596
3597 // SVE: to nxv2f16
3598 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3599 SVE_EXT_COST + SVE_FCVT_COST},
3600 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3601 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3602 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3603 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3604 SVE_EXT_COST + SVE_FCVT_COST},
3605 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3606 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3607 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3608
3609 // SVE: to nxv4f16
3610 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3611 SVE_EXT_COST + SVE_FCVT_COST},
3612 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3613 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3614 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3615 SVE_EXT_COST + SVE_FCVT_COST},
3616 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3617 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3618
3619 // SVE: to nxv8f16
3620 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3621 SVE_EXT_COST + SVE_FCVT_COST},
3622 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3623 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3624 SVE_EXT_COST + SVE_FCVT_COST},
3625 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3626
3627 // SVE: to nxv16f16
3628 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3629 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3630 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3631 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3632
3633 // Complex: to v2f32
3634 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3635 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3636 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3637 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3638
3639 // SVE: to nxv2f32
3640 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3641 SVE_EXT_COST + SVE_FCVT_COST},
3642 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3643 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3644 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3645 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3646 SVE_EXT_COST + SVE_FCVT_COST},
3647 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3648 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3649 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3650
3651 // Complex: to v4f32
3652 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3653 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3654 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3655 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3656
3657 // SVE: to nxv4f32
3658 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3659 SVE_EXT_COST + SVE_FCVT_COST},
3660 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3661 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3662 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3663 SVE_EXT_COST + SVE_FCVT_COST},
3664 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3665 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3666
3667 // Complex: to v8f32
3668 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3669 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3670 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3671 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3672
3673 // SVE: to nxv8f32
3674 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3675 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3676 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3677 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3678 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3679 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3680 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3681 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3682
3683 // SVE: to nxv16f32
3684 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3685 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3686 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3687 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3688
3689 // Complex: to v16f32
3690 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3691 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3692
3693 // Complex: to v2f64
3694 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3695 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3696 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3697 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3698 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3699 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3700
3701 // SVE: to nxv2f64
3702 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3703 SVE_EXT_COST + SVE_FCVT_COST},
3704 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3705 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3706 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3707 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3708 SVE_EXT_COST + SVE_FCVT_COST},
3709 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3710 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3711 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3712
3713 // Complex: to v4f64
3714 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3715 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3716
3717 // SVE: to nxv4f64
3718 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3719 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3720 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3721 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3722 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3723 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3724 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3725 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3726 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3727 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3728 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3729 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3730
3731 // SVE: to nxv8f64
3732 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3733 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3734 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3735 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3736 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3737 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3738 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3739 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3740
3741 // LowerVectorFP_TO_INT
3742 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3743 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3744 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3745 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3746 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3747 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3748
3749 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3750 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3751 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3752 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3753 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3754 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3755 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3756
3757 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3758 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3759 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3760 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3761 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3762
3763 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3764 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3765 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3766 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3767 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3768 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3769 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3770
3771 // Complex, from nxv2f32.
3772 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3773 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3774 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3775 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3776 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3777 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3778 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3779 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3780
3781 // Complex, from nxv2f64.
3782 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3783 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3784 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3785 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3786 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3787 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3788 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3789 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3790 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3791 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3792
3793 // Complex, from nxv4f32.
3794 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3795 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3796 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3797 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3798 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3799 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3800 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3801 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3802 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3803 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3804
3805 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3806 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3807 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3808 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3809 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3810
3811 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3812 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3813 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3814 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3815 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3816 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3817 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3818
3819 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3820 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3821 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3822 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3823 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3824
3825 // Complex, from nxv8f16.
3826 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3827 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3828 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3829 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3830 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3831 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3832 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3833 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3834 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3835 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3836
3837 // Complex, from nxv4f16.
3838 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3839 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3840 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3841 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3842 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3843 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3844 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3845 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3846
3847 // Complex, from nxv2f16.
3848 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3849 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3850 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3851 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3852 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3853 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3854 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3855 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3856
3857 // Truncate from nxvmf32 to nxvmf16.
3858 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3859 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3860 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3861
3862 // Truncate from nxvmf32 to nxvmbf16.
3863 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3864 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3865 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3866
3867 // Truncate from nxvmf64 to nxvmf16.
3868 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3869 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3870 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3871
3872 // Truncate from nxvmf64 to nxvmbf16.
3873 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3874 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3875 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3876
3877 // Truncate from nxvmf64 to nxvmf32.
3878 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3879 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3880 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3881
3882 // Extend from nxvmf16 to nxvmf32.
3883 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3884 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3885 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3886
3887 // Extend from nxvmbf16 to nxvmf32.
3888 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3889 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3890 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3891
3892 // Extend from nxvmf16 to nxvmf64.
3893 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3894 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3895 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3896
3897 // Extend from nxvmbf16 to nxvmf64.
3898 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3899 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3900 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3901
3902 // Extend from nxvmf32 to nxvmf64.
3903 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3904 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3905 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3906
3907 // Bitcasts from float to integer
3908 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3909 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3910 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3911
3912 // Bitcasts from integer to float
3913 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3914 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3915 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3916
3917 // Add cost for extending to illegal -too wide- scalable vectors.
3918 // zero/sign extend are implemented by multiple unpack operations,
3919 // where each operation has a cost of 1.
3920 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3921 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3922 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3923 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3924 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3925 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3926
3927 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3928 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3929 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3930 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3931 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3932 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3933 };
3934
3935 if (const auto *Entry = ConvertCostTableLookup(
3936 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3937 return Entry->Cost;
3938
3939 static const TypeConversionCostTblEntry FP16Tbl[] = {
3940 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3941 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3942 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3943 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3944 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3945 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3946 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3947 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3948 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3949 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3950 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3951 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3952 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3953 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3954 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3955 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3956 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3957 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3958 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3959 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3960 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3961 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3962 };
3963
3964 if (ST->hasFullFP16())
3965 if (const auto *Entry = ConvertCostTableLookup(
3966 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3967 return Entry->Cost;
3968
3969 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3970 // double-rounding issues.
3971 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3972 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3974 return cast<FixedVectorType>(Dst)->getNumElements() *
3975 getCastInstrCost(Opcode, Dst->getScalarType(),
3976 Src->getScalarType(), CCH, CostKind) +
3978 true, CostKind) +
3980 false, CostKind);
3981
3982 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3984 ST->isSVEorStreamingSVEAvailable() &&
3985 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3987 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3989 // The standard behaviour in the backend for these cases is to split the
3990 // extend up into two parts:
3991 // 1. Perform an extending load or masked load up to the legal type.
3992 // 2. Extend the loaded data to the final type.
3993 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3994 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3996 Opcode, LegalTy, Src, CCH, CostKind, I);
3998 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3999 return Part1 + Part2;
4000 }
4001
4002 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
4003 // but we also want to include the TTI::CastContextHint::Masked case too.
4004 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
4006 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
4008
4009 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
4010}
4011
4014 VectorType *VecTy, unsigned Index,
4016
 // NOTE(review): this listing drops the signature/opening lines (4012-4013,
 // 4015). From the body: this computes the cost of extracting element `Index`
 // from `VecTy` and sign-/zero-extending it to `Dst`; on AArch64 the extend is
 // often folded into the element move (smov/umov), making it free.
4017 // Make sure we were given a valid extend opcode.
4018 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
4019 "Invalid opcode");
4020
4021 // We are extending an element we extract from a vector, so the source type
4022 // of the extend is the element type of the vector.
4023 auto *Src = VecTy->getElementType();
4024
4025 // Sign- and zero-extends are for integer types only.
4026 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
4027
4028 // Get the cost for the extract. We compute the cost (if any) for the extend
4029 // below.
4030 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
4031 CostKind, Index, nullptr, nullptr);
4032
4033 // Legalize the types.
4034 auto VecLT = getTypeLegalizationCost(VecTy);
4035 auto DstVT = TLI->getValueType(DL, Dst);
4036 auto SrcVT = TLI->getValueType(DL, Src);
4037
4038 // If the resulting type is still a vector and the destination type is legal,
4039 // we may get the extension for free. If not, get the default cost for the
4040 // extend.
4041 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
4042 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4043 CostKind);
4044
4045 // The destination type should be larger than the element type. If not, get
4046 // the default cost for the extend.
4047 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
4048 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4049 CostKind);
4050
4051 switch (Opcode) {
4052 default:
4053 llvm_unreachable("Opcode should be either SExt or ZExt");
4054
4055 // For sign-extends, we only need a smov, which performs the extension
4056 // automatically.
4057 case Instruction::SExt:
4058 return Cost;
4059
4060 // For zero-extends, the extend is performed automatically by a umov unless
4061 // the destination type is i64 and the element type is i8 or i16.
4062 case Instruction::ZExt:
 // Falls out of the switch (instead of returning) only for the 64-bit dest
 // with sub-32-bit source case, which then pays the full cast cost below.
4063 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
4064 return Cost;
4065 }
4066
4067 // If we are unable to perform the extend for free, get the default cost.
4068 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4069 CostKind);
4070}
4071
4074 const Instruction *I) const {
 // NOTE(review): a guard line (4075) is missing from this listing; as shown,
 // the first `return` would be unconditional and make the assert/second
 // return unreachable. Presumably the missing line makes the PHI/branch
 // cost of 0/1 apply only for non-throughput cost kinds — confirm upstream.
4076 return Opcode == Instruction::PHI ? 0 : 1;
4077 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
4078 // Branches are assumed to be predicted.
4079 return 0;
4080}
4081
// Shared cost model behind all getVectorInstrCost() overloads for
// insert-element / extract-element.
//  - `Index` == -1U means the lane is unknown; lane-specific discounts are
//    then skipped.
//  - Exactly one of `I` (the concrete instruction) or `Scalar` (+
//    `ScalarUserAndIdx`, tuples of scalar value / user / lane) provides the
//    use-def context used to detect extracts that fuse with a scalar fmul.
//  - `VIC` flags contexts such as an insert fed by a load (lowered to LD1).
4082InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
4083 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4084 const Instruction *I, Value *Scalar,
4085 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4086 TTI::VectorInstrContext VIC) const {
4087 assert(Val->isVectorTy() && "This must be a vector type");
4088
4089 if (Index != -1U) {
4090 // Legalize the type.
4091 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4092
4093 // This type is legalized to a scalar type.
4094 if (!LT.second.isVector())
4095 return 0;
4096
4097 // The type may be split. For fixed-width vectors we can normalize the
4098 // index to the new type.
4099 if (LT.second.isFixedLengthVector()) {
4100 unsigned Width = LT.second.getVectorNumElements();
4101 Index = Index % Width;
4102 }
4103
4104 // The element at index zero is already inside the vector.
4105 // - For a insert-element or extract-element
4106 // instruction that extracts integers, an explicit FPR -> GPR move is
4107 // needed. So it has non-zero cost.
4108 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
4109 return 0;
4110
4111 // This is recognising a LD1 single-element structure to one lane of one
4112 // register instruction. I.e., if this is an `insertelement` instruction,
4113 // and its second operand is a load, then we will generate a LD1, which
4114 // are expensive instructions on some uArchs.
4115 if (VIC == TTI::VectorInstrContext::Load) {
4116 if (ST->hasFastLD1Single())
4117 return 0;
4118 return CostKind == TTI::TCK_CodeSize
4119 ? 0
 // NOTE(review): listing gap — the ':' arm of this conditional (line
 // 4120) is missing from this dump.
4121 }
4122
4123 // i1 inserts and extract will include an extra cset or cmp of the vector
4124 // value. Increase the cost by 1 to account.
4125 if (Val->getScalarSizeInBits() == 1)
4126 return CostKind == TTI::TCK_CodeSize
4127 ? 2
4128 : ST->getVectorInsertExtractBaseCost() + 1;
4129
4130 // FIXME:
4131 // If the extract-element and insert-element instructions could be
4132 // simplified away (e.g., could be combined into users by looking at use-def
4133 // context), they have no cost. This is not done in the first place for
4134 // compile-time considerations.
4135 }
4136
4137 // In case of Neon, if there exists extractelement from lane != 0 such that
4138 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4139 // 2. extractelement result feeds into fmul.
4140 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4141 // equivalent to 0.
4142 // then the extractelement can be merged with fmul in the backend and it
4143 // incurs no cost.
4144 // e.g.
4145 // define double @foo(<2 x double> %a) {
4146 // %1 = extractelement <2 x double> %a, i32 0
4147 // %2 = extractelement <2 x double> %a, i32 1
4148 // %res = fmul double %1, %2
4149 // ret double %res
4150 // }
4151 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4152 auto ExtractCanFuseWithFmul = [&]() {
4153 // We bail out if the extract is from lane 0.
4154 if (Index == 0)
4155 return false;
4156
4157 // Check if the scalar element type of the vector operand of ExtractElement
4158 // instruction is one of the allowed types.
4159 auto IsAllowedScalarTy = [&](const Type *T) {
4160 return T->isFloatTy() || T->isDoubleTy() ||
4161 (T->isHalfTy() && ST->hasFullFP16());
4162 };
4163
4164 // Check if the extractelement user is scalar fmul.
4165 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4166 // Check if the user is scalar fmul.
4167 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4168 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4169 !BO->getType()->isVectorTy();
4170 };
4171
4172 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4173 // certain scalar type and a certain vector register width.
4174 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4175 auto RegWidth =
 // NOTE(review): listing gap — the register-width query (line 4176) is
 // missing from this dump; `RegWidth` is the fixed vector register width
 // in bits, per its use below.
4177 .getFixedValue();
4178 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4179 };
4180
4181 // Check if the type constraints on input vector type and result scalar type
4182 // of extractelement instruction are satisfied.
4183 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4184 return false;
4185
4186 if (Scalar) {
4187 DenseMap<User *, unsigned> UserToExtractIdx;
4188 for (auto *U : Scalar->users()) {
4189 if (!IsUserFMulScalarTy(U))
4190 return false;
4191 // Recording entry for the user is important. Index value is not
4192 // important.
4193 UserToExtractIdx[U];
4194 }
4195 if (UserToExtractIdx.empty())
4196 return false;
 // NOTE(review): the structured binding's `U` below is shadowed by the
 // inner loop variable `U`; only `S` and `L` from the tuple are used.
4197 for (auto &[S, U, L] : ScalarUserAndIdx) {
4198 for (auto *U : S->users()) {
4199 if (UserToExtractIdx.contains(U)) {
4200 auto *FMul = cast<BinaryOperator>(U);
4201 auto *Op0 = FMul->getOperand(0);
4202 auto *Op1 = FMul->getOperand(1);
 // NOTE(review): this condition is a tautology — it is true for every
 // possible Op0/Op1, so the lane is recorded unconditionally. It looks
 // like it intended to exclude (or include) the case where exactly one
 // operand is S — confirm intent upstream before relying on it.
4203 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4204 UserToExtractIdx[U] = L;
4205 break;
4206 }
4207 }
4208 }
4209 }
4210 for (auto &[U, L] : UserToExtractIdx) {
4211 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4212 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4213 return false;
4214 }
4215 } else {
4216 const auto *EE = cast<ExtractElementInst>(I);
4217
4218 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4219 if (!IdxOp)
4220 return false;
4221
4222 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4223 if (!IsUserFMulScalarTy(U))
4224 return false;
4225
4226 // Check if the other operand of extractelement is also extractelement
4227 // from lane equivalent to 0.
4228 const auto *BO = cast<BinaryOperator>(U);
4229 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4230 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4231 if (OtherEE) {
4232 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4233 if (!IdxOp)
4234 return false;
4235 return IsExtractLaneEquivalentToZero(
4236 cast<ConstantInt>(OtherEE->getIndexOperand())
4237 ->getValue()
4238 .getZExtValue(),
4239 OtherEE->getType()->getScalarSizeInBits());
4240 }
4241 return true;
4242 });
4243 }
4244 return true;
4245 };
4246
4247 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4248 ExtractCanFuseWithFmul())
4249 return 0;
4250
4251 // All other insert/extracts cost this much.
4252 return CostKind == TTI::TCK_CodeSize ? 1
4253 : ST->getVectorInsertExtractBaseCost();
4255
4257 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4258 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
 // NOTE(review): the signature's first line (4256) is missing from this
 // listing; this is the operand-value overload of getVectorInstrCost.
4259 // Treat insert at lane 0 into a poison vector as having zero cost. This
4260 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
4261 // single dup) are treated as cheap.
4262 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4263 isa<PoisonValue>(Op0))
4264 return 0;
 // Delegate to the shared helper with no instruction or scalar-user context.
4265 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr,
4266 nullptr, {}, VIC);
4267}
4268
// Overload taking the scalar being inserted plus its (user, lane) pairs, used
// by SLP-style costing where the surrounding users are known.
// NOTE(review): the signature's first line is missing from this dump.
4270 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4271 Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4272 TTI::VectorInstrContext VIC) const {
4273 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4274 ScalarUserAndIdx, VIC);
4275}
4276
// Overload taking an existing IR Instruction; forwards it to the helper so
// context-sensitive folds (e.g. extract feeding scalar fmul) can apply.
// NOTE(review): the first signature line(s) are missing from this dump.
4279 TTI::TargetCostKind CostKind, unsigned Index,
4280 TTI::VectorInstrContext VIC) const {
4281 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I,
4282 nullptr, {}, VIC);
4283}
4284
// Cost of a vector-element access addressed from the end of the vector
// (e.g. extracting the last element of a scalable vector).
// NOTE(review): the signature lines are missing from this dump, and the
// statement under the FixedVectorType check is truncated (presumably a
// delegation to the BaseT implementation) — confirm against upstream.
4288 unsigned Index) const {
4289 if (isa<FixedVectorType>(Val))
4291 Index);
4292
4293 // This typically requires both while and lastb instructions in order
4294 // to extract the last element. If this is in a loop the while
4295 // instruction can at least be hoisted out, although it will consume a
4296 // predicate register. The cost should be more expensive than the base
4297 // extract cost, which is 2 for most CPUs.
4298 return CostKind == TTI::TCK_CodeSize
4299 ? 2
4300 : ST->getVectorInsertExtractBaseCost() + 1;
4301}
4302
// Overhead of scalarizing a vector operation: one insert/extract per demanded
// element, each costed at the subtarget's base insert/extract cost.
// NOTE(review): the signature's first line and one or two leading statements
// (original lines 4303/4307-4308) are missing from this dump.
4304 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4305 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4306 TTI::VectorInstrContext VIC) const {
// Floating-point element types fall back to the generic implementation.
4309 if (Ty->getElementType()->isFloatingPointTy())
4310 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4311 CostKind);
4312 unsigned VecInstCost =
4313 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
// One vector instruction per demanded element, per direction requested.
4314 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4315}
4316
// Returns the cost of performing an fp16/bf16 operation by promoting to f32
// (fpext operands, run InstCost on the promoted type, optionally fptrunc the
// result), or std::nullopt when the operation is natively supported and no
// promotion is needed.
// NOTE(review): a parameter line of the signature and the continuation lines
// of the two getCastInstrCost calls (original lines 4318, 4331, 4337) are
// missing from this dump — confirm against the upstream file.
4317 std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4319 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4320 std::function<InstructionCost(Type *)> InstCost) const {
// Only fp16/bf16 scalar types ever need promotion.
4321 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4322 return std::nullopt;
// fp16 is native with +fullfp16.
4323 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4324 return std::nullopt;
4325 // If we have +sve-b16b16 the operation can be promoted to SVE.
4326 if (CanUseSVE && ST->hasSVEB16B16() && ST->isNonStreamingSVEorSME2Available())
4327 return std::nullopt;
4328
4329 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4330 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
// If neither operand is a constant, both must be extended (double the fpext).
4332 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4333 Cost *= 2;
4334 Cost += InstCost(PromotedTy);
4335 if (IncludeTrunc)
4336 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4338 return Cost;
4339}
4340
// AArch64 cost model for arithmetic instructions (div/rem/mul, shifts,
// bitwise ops, and FP arithmetic), with special-cased costs for constant
// divisors, widening multiplies, fp16/bf16 promotion and SVE lowering.
// NOTE(review): this dump is missing several source lines — among them the
// Op1Info/Op2Info parameter line of the signature, the invalid-cost return
// under the <vscale x 1 x eltty> check, the cost-kind guard before the first
// BaseT fallback, and the declarations begun at original lines 4465, 4553
// and 4588. Verify against the upstream file before editing.
4342 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4344 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4345
4346 // The code-generator is currently not able to handle scalable vectors
4347 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4348 // it. This change will be removed when code-generation for these types is
4349 // sufficiently reliable.
4350 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4351 if (VTy->getElementCount() == ElementCount::getScalable(1))
4353
4354 // TODO: Handle more cost kinds.
4356 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4357 Op2Info, Args, CxtI);
4358
4359 // Legalize the type.
4360 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4361 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4362
4363 // Increase the cost for half and bfloat types if not architecturally
4364 // supported.
4365 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4366 ISD == ISD::FDIV || ISD == ISD::FREM)
4367 if (auto PromotedCost = getFP16BF16PromoteCost(
4368 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4369 // There is not native support for fdiv/frem even with +sve-b16b16.
4370 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4371 [&](Type *PromotedTy) {
4372 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4373 Op1Info, Op2Info);
4374 }))
4375 return *PromotedCost;
4376
4377 // If the operation is a widening instruction (smull or umull) and both
4378 // operands are extends the cost can be cheaper by considering that the
4379 // operation will operate on the narrowest type size possible (double the
4380 // largest input size) and a further extend.
4381 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4382 if (ExtTy != Ty)
4383 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4384 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4386 return LT.first;
4387 }
4388
// Per-opcode costing; default falls back to the generic implementation.
4389 switch (ISD) {
4390 default:
4391 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4392 Op2Info);
4393 case ISD::SREM:
4394 case ISD::SDIV:
4395 /*
4396 Notes for sdiv/srem specific costs:
4397 1. This only considers the cases where the divisor is constant, uniform and
4398 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4399 result in some form of (ldr + adrp), corresponding to constant vectors, or
4400 scalarization of the division operation.
4401 2. Constant divisors, either negative in whole or partially, don't result in
4402 significantly different codegen as compared to positive constant divisors.
4403 So, we don't consider negative divisors separately.
4404 3. If the codegen is significantly different with SVE, it has been indicated
4405 using comments at appropriate places.
4406
4407 sdiv specific cases:
4408 -----------------------------------------------------------------------
4409 codegen | pow-of-2 | Type
4410 -----------------------------------------------------------------------
4411 add + cmp + csel + asr | Y | i64
4412 add + cmp + csel + asr | Y | i32
4413 -----------------------------------------------------------------------
4414
4415 srem specific cases:
4416 -----------------------------------------------------------------------
4417 codegen | pow-of-2 | Type
4418 -----------------------------------------------------------------------
4419 negs + and + and + csneg | Y | i64
4420 negs + and + and + csneg | Y | i32
4421 -----------------------------------------------------------------------
4422
4423 other sdiv/srem cases:
4424 -------------------------------------------------------------------------
4425 common codegen | + srem | + sdiv | pow-of-2 | Type
4426 -------------------------------------------------------------------------
4427 smulh + asr + add + add | - | - | N | i64
4428 smull + lsr + add + add | - | - | N | i32
4429 usra | and + sub | sshr | Y | <2 x i64>
4430 2 * (scalar code) | - | - | N | <2 x i64>
4431 usra | bic + sub | sshr + neg | Y | <4 x i32>
4432 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4433 + sshr + usra | | | |
4434 -------------------------------------------------------------------------
4435 */
4436 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4437 InstructionCost AddCost =
4438 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4439 Op1Info.getNoProps(), Op2Info.getNoProps());
4440 InstructionCost AsrCost =
4441 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4442 Op1Info.getNoProps(), Op2Info.getNoProps());
4443 InstructionCost MulCost =
4444 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4445 Op1Info.getNoProps(), Op2Info.getNoProps());
4446 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4447 // have similar cost.
4448 auto VT = TLI->getValueType(DL, Ty);
4449 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4450 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4451 // Neg can be folded into the asr instruction.
4452 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4453 : (3 * AsrCost + AddCost);
4454 } else {
4455 return MulCost + AsrCost + 2 * AddCost;
4456 }
4457 } else if (VT.isVector()) {
4458 InstructionCost UsraCost = 2 * AsrCost;
4459 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4460 // Division with scalable types corresponds to native 'asrd'
4461 // instruction when SVE is available.
4462 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4463
4464 // One more for the negation in SDIV
4466 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4467 if (Ty->isScalableTy() && ST->hasSVE())
4468 Cost += 2 * AsrCost;
4469 else {
4470 Cost +=
4471 UsraCost +
4472 (ISD == ISD::SDIV
4473 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4474 : 2 * AddCost);
4475 }
4476 return Cost;
4477 } else if (LT.second == MVT::v2i64) {
4478 return VT.getVectorNumElements() *
4479 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4480 Op1Info.getNoProps(),
4481 Op2Info.getNoProps());
4482 } else {
4483 // When SVE is available, we get:
4484 // smulh + lsr + add/sub + asr + add/sub.
4485 if (Ty->isScalableTy() && ST->hasSVE())
4486 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4487 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4488 }
4489 }
4490 }
4491 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4492 LT.second.isFixedLengthVector()) {
4493 // FIXME: When the constant vector is non-uniform, this may result in
4494 // loading the vector from constant pool or in some cases, may also result
4495 // in scalarization. For now, we are approximating this with the
4496 // scalarization cost.
4497 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4498 CostKind, -1, nullptr, nullptr);
4499 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4500 CostKind, -1, nullptr, nullptr);
4501 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4502 return ExtractCost + InsertCost +
4503 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4504 CostKind, Op1Info.getNoProps(),
4505 Op2Info.getNoProps());
4506 }
4507 [[fallthrough]];
4508 case ISD::UDIV:
4509 case ISD::UREM: {
4510 auto VT = TLI->getValueType(DL, Ty);
4511 if (Op2Info.isConstant()) {
4512 // If the operand is a power of 2 we can use the shift or and cost.
4513 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4514 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4515 Op1Info.getNoProps(),
4516 Op2Info.getNoProps());
4517 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4518 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4519 Op1Info.getNoProps(),
4520 Op2Info.getNoProps());
4521
4522 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4523 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4524 // The MULHU will be expanded to UMULL for the types not listed below,
4525 // and will become a pair of UMULL+MULL2 for 128bit vectors.
4526 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4527 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4528 LT.second == MVT::nxv16i8;
4529 bool Is128bit = LT.second.is128BitVector();
4530
4531 InstructionCost MulCost =
4532 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4533 Op1Info.getNoProps(), Op2Info.getNoProps());
4534 InstructionCost AddCost =
4535 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4536 Op1Info.getNoProps(), Op2Info.getNoProps());
4537 InstructionCost ShrCost =
4538 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4539 Op1Info.getNoProps(), Op2Info.getNoProps());
4540 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4541 (HasMULH ? 0 : ShrCost) + // UMULL shift
4542 AddCost * 2 + ShrCost;
4543 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4544 }
4545 }
4546
4547 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4548 // emitted by the backend even when those functions are not declared in the
4549 // module.
4550 if (!VT.isVector() && VT.getSizeInBits() > 64)
4551 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4552
4554 Opcode, Ty, CostKind, Op1Info, Op2Info);
4555 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4556 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4557 // SDIV/UDIV operations are lowered using SVE, then we can have less
4558 // costs.
4559 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4560 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4561 static const CostTblEntry DivTbl[]{
4562 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4563 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4564 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4565 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4566 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4567 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4568
4569 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4570 if (nullptr != Entry)
4571 return Entry->Cost;
4572 }
4573 // For 8/16-bit elements, the cost is higher because the type
4574 // requires promotion and possibly splitting:
4575 if (LT.second.getScalarType() == MVT::i8)
4576 Cost *= 8;
4577 else if (LT.second.getScalarType() == MVT::i16)
4578 Cost *= 4;
4579 return Cost;
4580 } else {
4581 // If one of the operands is a uniform constant then the cost for each
4582 // element is Cost for insertion, extraction and division.
4583 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4584 // operation with scalar type
4585 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4586 (Op2Info.isConstant() && Op2Info.isUniform())) {
4587 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4589 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4590 return (4 + DivCost) * VTy->getNumElements();
4591 }
4592 }
4593 // On AArch64, without SVE, vector divisions are expanded
4594 // into scalar divisions of each pair of elements.
4595 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4596 -1, nullptr, nullptr);
4597 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4598 nullptr, nullptr);
4599 }
4600
4601 // TODO: if one of the arguments is scalar, then it's not necessary to
4602 // double the cost of handling the vector elements.
4603 Cost += Cost;
4604 }
4605 return Cost;
4606 }
4607 case ISD::MUL:
4608 // When SVE is available, then we can lower the v2i64 operation using
4609 // the SVE mul instruction, which has a lower cost.
4610 if (LT.second == MVT::v2i64 && ST->hasSVE())
4611 return LT.first;
4612
4613 // When SVE is not available, there is no MUL.2d instruction,
4614 // which means mul <2 x i64> is expensive as elements are extracted
4615 // from the vectors and the muls scalarized.
4616 // As getScalarizationOverhead is a bit too pessimistic, we
4617 // estimate the cost for a i64 vector directly here, which is:
4618 // - four 2-cost i64 extracts,
4619 // - two 2-cost i64 inserts, and
4620 // - two 1-cost muls.
4621 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
4622 // LT.first = 2 the cost is 28.
4623 if (LT.second != MVT::v2i64)
4624 return LT.first;
4625 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4626 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4627 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4628 nullptr, nullptr) *
4629 2 +
4630 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4631 nullptr, nullptr));
4632 case ISD::ADD:
4633 case ISD::XOR:
4634 case ISD::OR:
4635 case ISD::AND:
4636 case ISD::SRL:
4637 case ISD::SRA:
4638 case ISD::SHL:
4639 // These nodes are marked as 'custom' for combining purposes only.
4640 // We know that they are legal. See LowerAdd in ISelLowering.
4641 return LT.first;
4642
4643 case ISD::FNEG:
4644 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4645 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4646 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4647 CxtI &&
4648 ((CxtI->hasOneUse() &&
4649 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4650 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4651 return 0;
4652 [[fallthrough]];
4653 case ISD::FADD:
4654 case ISD::FSUB:
4655 if (!Ty->getScalarType()->isFP128Ty())
4656 return LT.first;
4657 [[fallthrough]];
4658 case ISD::FMUL:
4659 case ISD::FDIV:
4660 // These nodes are marked as 'custom' just to lower them to SVE.
4661 // We know said lowering will incur no additional cost.
4662 if (!Ty->getScalarType()->isFP128Ty())
4663 return 2 * LT.first;
4664
4665 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4666 Op2Info);
4667 case ISD::FREM:
4668 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4669 // those functions are not declared in the module.
4670 if (!Ty->isVectorTy())
4671 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4672 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4673 Op2Info);
4674 }
4675}
4676
// Cost of computing an address for a memory access. Non-constant-strided
// vector address computation is penalized because the extra micro-ops are not
// folded into the addressing mode.
// NOTE(review): the signature's first lines (return type, function name, the
// PtrTy and ScalarEvolution parameters) are missing from this dump.
4679 const SCEV *Ptr,
4681 // Address computations in vectorized code with non-consecutive addresses will
4682 // likely result in more instructions compared to scalar code where the
4683 // computation can more often be merged into the index mode. The resulting
4684 // extra micro-ops can significantly decrease throughput.
4685 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4686 int MaxMergeDistance = 64;
4687
4688 if (PtrTy->isVectorTy() && SE &&
4689 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4690 return NumVectorInstToHideOverhead;
4691
4692 // In many cases the address computation is not merged into the instruction
4693 // addressing mode.
4694 return 1;
4695}
4696
4697 /// Check whether Opcode1 has less throughput according to the scheduling
4698 /// model than Opcode2.
/// Returns false conservatively when the subtarget has no per-instruction
/// scheduling model or either scheduling class is invalid.
/// NOTE(review): the signature's first line and the second operand of the
/// final comparison (original lines 4699 and 4720, presumably
/// `MCSchedModel::getReciprocalThroughput(*ST, *SCD2);`) are missing from
/// this dump — confirm against the upstream file.
4700 unsigned Opcode1, unsigned Opcode2) const {
4701 const MCSchedModel &Sched = ST->getSchedModel();
4702 const TargetInstrInfo *TII = ST->getInstrInfo();
4703 if (!Sched.hasInstrSchedModel())
4704 return false;
4705
4706 const MCSchedClassDesc *SCD1 =
4707 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4708 const MCSchedClassDesc *SCD2 =
4709 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4710 // We cannot handle variant scheduling classes without an MI. If we need to
4711 // support them for any of the instructions we query the information of we
4712 // might need to add a way to resolve them without a MI or not use the
4713 // scheduling info.
4714 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4715 "Cannot handle variant scheduling classes without an MI");
4716 if (!SCD1->isValid() || !SCD2->isValid())
4717 return false;
4718
// Higher reciprocal throughput means fewer instructions per cycle, i.e.
// lower throughput for Opcode1.
4719 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4721}
4722
// AArch64 cost model for compare and select instructions, including special
// handling of vector selects, fp16/bf16 compare promotion, libcall-modelled
// fp compares, and icmp-with-and folds.
// NOTE(review): this dump is missing several source lines — the
// CostKind/Op1Info parameter line of the signature, the declarations begun
// at original lines 4787 and 4791, lines 4793-4795 inside the promotion
// lambda, and part of the condition at line 4822. Verify against upstream.
4724 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4726 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4727 // We don't lower some vector selects well that are wider than the register
4728 // width. TODO: Improve this with different cost kinds.
4729 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4730 // We would need this many instructions to hide the scalarization happening.
4731 const int AmortizationCost = 20;
4732
4733 // If VecPred is not set, check if we can get a predicate from the context
4734 // instruction, if its type matches the requested ValTy.
4735 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4736 CmpPredicate CurrentPred;
4737 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4738 m_Value())))
4739 VecPred = CurrentPred;
4740 }
4741 // Check if we have a compare/select chain that can be lowered using
4742 // a (F)CMxx & BFI pair.
4743 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4744 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4745 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4746 VecPred == CmpInst::FCMP_UNE) {
4747 static const auto ValidMinMaxTys = {
4748 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4749 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4750 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4751
4752 auto LT = getTypeLegalizationCost(ValTy);
4753 if (any_of(ValidMinMaxTys, equal_to(LT.second)) ||
4754 (ST->hasFullFP16() &&
4755 any_of(ValidFP16MinMaxTys, equal_to(LT.second))))
4756 return LT.first;
4757 }
4758
// Hand-tuned costs for selects on vector types that legalize poorly.
4759 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4760 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4761 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4762 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4763 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4764 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4765 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4766 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4767 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4768 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4769 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4770 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4771
4772 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4773 EVT SelValTy = TLI->getValueType(DL, ValTy);
4774 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4775 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4776 SelCondTy.getSimpleVT(),
4777 SelValTy.getSimpleVT()))
4778 return Entry->Cost;
4779 }
4780 }
4781
4782 if (Opcode == Instruction::FCmp) {
4783 if (auto PromotedCost = getFP16BF16PromoteCost(
4784 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4785 // TODO: Consider costing SVE FCMPs.
4786 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4788 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4789 CostKind, Op1Info, Op2Info);
4790 if (isa<VectorType>(PromotedTy))
4792 Instruction::Trunc,
4796 return Cost;
4797 }))
4798 return *PromotedCost;
4799
4800 auto LT = getTypeLegalizationCost(ValTy);
4801 // Model unknown fp compares as a libcall.
4802 if (LT.second.getScalarType() != MVT::f64 &&
4803 LT.second.getScalarType() != MVT::f32 &&
4804 LT.second.getScalarType() != MVT::f16)
4805 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4806 {ValTy, ValTy}, CostKind);
4807
4808 // Some comparison operators require expanding to multiple compares + or.
4809 unsigned Factor = 1;
4810 if (!CondTy->isVectorTy() &&
4811 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4812 Factor = 2; // fcmp with 2 selects
4813 else if (isa<FixedVectorType>(ValTy) &&
4814 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4815 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4816 Factor = 3; // fcmxx+fcmyy+or
4817 else if (isa<ScalableVectorType>(ValTy) &&
4818 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4819 Factor = 3; // fcmxx+fcmyy+or
4820
// Penalize SVE predicate compares that the scheduling model says have
// lower throughput than the equivalent NEON compare.
4821 if (isa<ScalableVectorType>(ValTy) &&
4823 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4824 AArch64::FCMEQv4f32))
4825 Factor *= 2;
4826
4827 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4828 }
4829
4830 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4831 // icmp(and, 0) as free, as we can make use of ands, but only if the
4832 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4833 // providing it will not cause performance regressions.
4834 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4835 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4836 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4837 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4838 if (match(I->getOperand(1), m_Zero()))
4839 return 0;
4840
4841 // x >= 1 / x < 1 -> x > 0 / x <= 0
4842 if (match(I->getOperand(1), m_One()) &&
4843 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4844 return 0;
4845
4846 // x <= -1 / x > -1 -> x > 0 / x <= 0
4847 if (match(I->getOperand(1), m_AllOnes()) &&
4848 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4849 return 0;
4850 }
4851
4852 // The base case handles scalable vectors fine for now, since it treats the
4853 // cost as 1 * legalization cost.
4854 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4855 Op1Info, Op2Info, I);
4856}
4857
// Configure inline expansion of memcmp on AArch64: overlapping loads allowed,
// load sizes 8/4/2/1 bytes, with tail expansions of 3/5/6 bytes.
// NOTE(review): the return-type line and the declaration of `Options`
// (original lines 4858 and 4860, presumably
// `TTI::MemCmpExpansionOptions Options;`) are missing from this dump.
4859 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
// With strict alignment, misaligned loads are expensive, so return the
// default (disabled) options.
4861 if (ST->requiresStrictAlign()) {
4862 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4863 // a bunch of instructions when strict align is enabled.
4864 return Options;
4865 }
4866 Options.AllowOverlappingLoads = true;
4867 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4868 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4869 // TODO: Though vector loads usually perform well on AArch64, in some targets
4870 // they may wake up the FP unit, which raises the power consumption. Perhaps
4871 // they could be used with no holds barred (-O3).
4872 Options.LoadSizes = {8, 4, 2, 1};
4873 Options.AllowedTailExpansions = {3, 5, 6};
4874 return Options;
4875}
4876
// NOTE(review): the signature line for this function is missing from this
// dump; the body is a boolean subtarget query that returns true iff SVE is
// available (likely `prefersVectorizedAddressing` — confirm against
// upstream before relying on the name).
4878 return ST->hasSVE();
4879}
4880
// Dispatch memory-intrinsic costing by intrinsic ID: gathers/scatters go to
// getGatherScatterOpCost, masked loads/stores/expandloads to
// getMaskedMemoryOpCost.
// NOTE(review): the signature lines and the fall-through return after the
// switch (original lines 4881-4883 and 4893, presumably delegating to the
// BaseT implementation) are missing from this dump.
4884 switch (MICA.getID()) {
4885 case Intrinsic::masked_scatter:
4886 case Intrinsic::masked_gather:
4887 return getGatherScatterOpCost(MICA, CostKind);
4888 case Intrinsic::masked_load:
4889 case Intrinsic::masked_expandload:
4890 case Intrinsic::masked_store:
4891 return getMaskedMemoryOpCost(MICA, CostKind);
4892 }
4894}
4895
// Cost of masked load/store/expandload intrinsics, assuming SVE lowering.
// NOTE(review): this dump is missing the signature lines and several
// single-line statements under the guard conditions (original lines
// 4896-4898, 4902, 4905-4906, 4910-4911, 4917-4918, 4922-4923 — most are
// presumably `return InstructionCost::getInvalid();` or a BaseT fallback).
// Verify against the upstream file before editing.
4899 Type *Src = MICA.getDataType();
4900
// NEON fixed vectors have no native masked memory ops.
4901 if (useNeonVector(Src))
4903 auto LT = getTypeLegalizationCost(Src);
4904 if (!LT.first.isValid())
4906
4907 // Return an invalid cost for element types that we are unable to lower.
4908 auto *VT = cast<VectorType>(Src);
4909 if (VT->getElementType()->isIntegerTy(1))
4911
4912 // The code-generator is currently not able to handle scalable vectors
4913 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4914 // it. This change will be removed when code-generation for these types is
4915 // sufficiently reliable.
4916 if (VT->getElementCount() == ElementCount::getScalable(1))
4918
4919 InstructionCost MemOpCost = LT.first;
4920 if (MICA.getID() == Intrinsic::masked_expandload) {
4921 if (!isLegalMaskedExpandLoad(Src, MICA.getAlignment()))
4923
4924 // Operation will be split into expand of masked.load
4925 MemOpCost *= 2;
4926 }
4927
4928 // If we need to split the memory operation, we will also need to split the
4929 // mask. This will likely lead to overestimating the cost in some cases if
4930 // multiple memory operations use the same mask, but we often don't have
4931 // enough context to figure that out here.
4932 //
4933 // If the elements being loaded are bytes then the mask will already be split,
4934 // since the number of bits in a P register matches the number of bytes in a
4935 // Z register.
4936 if (LT.first > 1 && LT.second.getScalarSizeInBits() > 8)
4937 return MemOpCost * 2;
4938
4939 return MemOpCost;
4940}
4941
4942// This function returns gather/scatter overhead either from
4943// user-provided value or specialized values per-target from \p ST.
4944static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4945 const AArch64Subtarget *ST) {
4946 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4947 "Should be called on only load or stores.");
4948 switch (Opcode) {
4949 case Instruction::Load:
4950 if (SVEGatherOverhead.getNumOccurrences() > 0)
4951 return SVEGatherOverhead;
4952 return ST->getGatherOverhead();
4953 break;
4954 case Instruction::Store:
4955 if (SVEScatterOverhead.getNumOccurrences() > 0)
4956 return SVEScatterOverhead;
4957 return ST->getScatterOverhead();
4958 break;
4959 default:
4960 llvm_unreachable("Shouldn't have reached here");
4961 }
4962}
4963
// Cost of a gather/scatter intrinsic: per-element scalar memory-op cost,
// scaled by the SVE gather/scatter overhead and the legalized element count.
// NOTE(review): the signature lines and the invalid-cost returns under the
// guard conditions (original lines 4964-4966, 4978, 4982, 4988, 4995 —
// presumably `return InstructionCost::getInvalid();`) are missing from this
// dump. Verify against the upstream file before editing.
4967
// vp_gather / masked_gather cost as loads; everything else as stores.
4968 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
4969 MICA.getID() == Intrinsic::vp_gather)
4970 ? Instruction::Load
4971 : Instruction::Store;
4972
4973 Type *DataTy = MICA.getDataType();
4974 Align Alignment = MICA.getAlignment();
4975 const Instruction *I = MICA.getInst();
4976
4977 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4979 auto *VT = cast<VectorType>(DataTy);
4980 auto LT = getTypeLegalizationCost(DataTy);
4981 if (!LT.first.isValid())
4983
4984 // Return an invalid cost for element types that we are unable to lower.
4985 if (!LT.second.isVector() ||
4986 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4987 VT->getElementType()->isIntegerTy(1))
4989
4990 // The code-generator is currently not able to handle scalable vectors
4991 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4992 // it. This change will be removed when code-generation for these types is
4993 // sufficiently reliable.
4994 if (VT->getElementCount() == ElementCount::getScalable(1))
4996
4997 ElementCount LegalVF = LT.second.getVectorElementCount();
4998 InstructionCost MemOpCost =
4999 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
5000 {TTI::OK_AnyValue, TTI::OP_None}, I);
5001 // Add on an overhead cost for using gathers/scatters.
5002 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
5003 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
5004}
5005
// True when the type is a fixed-width vector that will be lowered with NEON
// rather than SVE (i.e. SVE-for-fixed-length is not in use).
// NOTE(review): the signature line (presumably
// `bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {`) is missing
// from this dump — confirm against the upstream file.
5007 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
5008}
5009
// AArch64 cost model for plain loads and stores, including penalties for
// slow misaligned 128-bit stores, truncating/extending NEON accesses, and a
// breakdown of non-power-of-2 NEON vectors into power-of-2 pieces.
// NOTE(review): this dump is missing several source lines — the first
// signature lines and a CostKind parameter line (originals 5009-5010, 5013),
// the invalid-cost return at 5024, the predicate-multiple check tail and its
// return at 5035-5036, the cost-kind guards at 5039 and 5041-5042, and the
// `Cost` accumulator declaration at 5084. Verify against upstream.
5011 Align Alignment,
5012 unsigned AddressSpace,
5014 TTI::OperandValueInfo OpInfo,
5015 const Instruction *I) const {
5016 EVT VT = TLI->getValueType(DL, Ty, true);
5017 // Type legalization can't handle structs
5018 if (VT == MVT::Other)
5019 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
5020 CostKind);
5021
5022 auto LT = getTypeLegalizationCost(Ty);
5023 if (!LT.first.isValid())
5025
5026 // The code-generator is currently not able to handle scalable vectors
5027 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5028 // it. This change will be removed when code-generation for these types is
5029 // sufficiently reliable.
5030 // We also only support full register predicate loads and stores.
5031 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5032 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
5033 (VTy->getElementType()->isIntegerTy(1) &&
5034 !VTy->getElementCount().isKnownMultipleOf(
5037
5038 // TODO: consider latency as well for TCK_SizeAndLatency.
5040 return LT.first;
5043 return 1;
5044
5045 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
5046 LT.second.is128BitVector() && Alignment < Align(16)) {
5047 // Unaligned stores are extremely inefficient. We don't split all
5048 // unaligned 128-bit stores because the negative impact that has shown in
5049 // practice on inlined block copy code.
5050 // We make such stores expensive so that we will only vectorize if there
5051 // are 6 other instructions getting vectorized.
5052 const int AmortizationCost = 6;
5053
5054 return LT.first * 2 * AmortizationCost;
5055 }
5056
5057 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
5058 if (Ty->isPtrOrPtrVectorTy())
5059 return LT.first;
5060
5061 if (useNeonVector(Ty)) {
5062 // Check truncating stores and extending loads.
5063 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
5064 // v4i8 types are lowered to scalar a load/store and sshll/xtn.
5065 if (VT == MVT::v4i8)
5066 return 2;
5067 // Otherwise we need to scalarize.
5068 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
5069 }
5070 EVT EltVT = VT.getVectorElementType();
5071 unsigned EltSize = EltVT.getScalarSizeInBits();
5072 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
5073 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
5074 return LT.first;
5075 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
5076 // widening to v4i8, which produces suboptimal results.
5077 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
5078 return LT.first;
5079
5080 // Check non-power-of-2 loads/stores for legal vector element types with
5081 // NEON. Non-power-of-2 memory ops will get broken down to a set of
5082 // operations on smaller power-of-2 ops, including ld1/st1.
5083 LLVMContext &C = Ty->getContext();
5085 SmallVector<EVT> TypeWorklist;
5086 TypeWorklist.push_back(VT);
// Recursively split a non-power-of-2 element count into the largest
// power-of-2 prefix plus the remainder, costing 1 per power-of-2 piece.
5087 while (!TypeWorklist.empty()) {
5088 EVT CurrVT = TypeWorklist.pop_back_val();
5089 unsigned CurrNumElements = CurrVT.getVectorNumElements();
5090 if (isPowerOf2_32(CurrNumElements)) {
5091 Cost += 1;
5092 continue;
5093 }
5094
5095 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
5096 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
5097 TypeWorklist.push_back(
5098 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
5099 }
5100 return Cost;
5101 }
5102
5103 return LT.first;
5104}
5105
5107 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
5108 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5109 bool UseMaskForCond, bool UseMaskForGaps) const {
5110 assert(Factor >= 2 && "Invalid interleave factor");
5111 auto *VecVTy = cast<VectorType>(VecTy);
5112
5113 if (VecTy->isScalableTy() && !ST->hasSVE())
5115
5116 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
5117 // only have lowering for power-of-2 factors.
5118 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
5119 // InterleavedAccessPass for ld3/st3
5120 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
5122
5123 // Vectorization for masked interleaved accesses is only enabled for scalable
5124 // VF.
5125 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
5127
     // Fast path: the access can be matched directly to ldN/stN when there are
     // no gaps and each per-lane sub-vector is a legal interleaved-access type.
5128 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5129 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
5130 auto *SubVecTy =
5131 VectorType::get(VecVTy->getElementType(),
5132 VecVTy->getElementCount().divideCoefficientBy(Factor));
5133
5134 // ldN/stN only support legal vector types of size 64 or 128 in bits.
5135 // Accesses having vector types that are a multiple of 128 bits can be
5136 // matched to more than one ldN/stN instruction.
5137 bool UseScalable;
5138 if (MinElts % Factor == 0 &&
5139 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
5140 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
5141 }
5142
     // Otherwise defer to the generic (scalarizing) cost model.
5143 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5144 Alignment, AddressSpace, CostKind,
5145 UseMaskForCond, UseMaskForGaps);
5146}
5147
     // For every 128-bit vector type live across the call, charge one store
     // (spill before the call) plus one load (refill after it).
5152 for (auto *I : Tys) {
5153 if (!I->isVectorTy())
5154 continue;
5155 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
5156 128)
5157 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
5158 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
5159 }
5160 return Cost;
5161}
5162
5164 Align Alignment) const {
5165 // Neon types should be scalarised when we are not choosing to use SVE.
5166 if (useNeonVector(DataTy))
5167 return false;
5168
5169 // Return true only if we are able to lower using the SVE2p2/SME2p2
5170 // expand instruction. Either SVE2p2 with SVE available, or SME2p2 in
5171 // (streaming) SVE mode, is sufficient.
5171 return (ST->isSVEAvailable() && ST->hasSVE2p2()) ||
5172 (ST->isSVEorStreamingSVEAvailable() && ST->hasSME2p2());
5173}
5174
     // Delegate straight to the subtarget's tuned maximum interleave factor.
5176 return ST->getMaxInterleaveFactor();
5177}
5178
5179// For Falkor, we want to avoid having too many strided loads in a loop since
5180// that can exhaust the HW prefetcher resources. We adjust the unroller
5181// MaxCount preference below to attempt to ensure unrolling doesn't create too
5182// many strided loads.
5183static void
5186 enum { MaxStridedLoads = 7 };
     // Count loads whose address is a loop-varying affine AddRec, i.e. loads
     // the HW prefetcher would track as a stride. Stops counting early once
     // the result can no longer change the MaxCount decision below.
5187 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5188 int StridedLoads = 0;
5189 // FIXME? We could make this more precise by looking at the CFG and
5190 // e.g. not counting loads in each side of an if-then-else diamond.
5191 for (const auto BB : L->blocks()) {
5192 for (auto &I : *BB) {
5193 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5194 if (!LMemI)
5195 continue;
5196
5197 Value *PtrValue = LMemI->getPointerOperand();
5198 if (L->isLoopInvariant(PtrValue))
5199 continue;
5200
5201 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5202 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV)
5203 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5204 continue;
5205
5206 // FIXME? We could take pairing of unrolled load copies into account
5207 // by looking at the AddRec, but we would probably have to limit this
5208 // to loops with no stores or other memory optimization barriers.
5209 ++StridedLoads;
5210 // We've seen enough strided loads that seeing more won't make a
5211 // difference.
5212 if (StridedLoads > MaxStridedLoads / 2)
5213 return StridedLoads;
5214 }
5215 }
5216 return StridedLoads;
5217 };
5218
5219 int StridedLoads = countStridedLoads(L, SE);
5220 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5221 << " strided loads\n");
5222 // Pick the largest power of 2 unroll count that won't result in too many
5223 // strided loads. Log2_32 rounds down, so this is the largest power of 2
5224 // less than or equal to MaxStridedLoads / StridedLoads.
5224 if (StridedLoads) {
5225 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5226 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5227 << UP.MaxCount << '\n');
5228 }
5229}
5230
5231// This function returns true if the loop:
5232// 1. Has a valid cost, and
5233// 2. Has a cost within the supplied budget.
5234// Otherwise it returns false.
5236 InstructionCost Budget,
5237 unsigned *FinalSize) {
5238 // Estimate the size of the loop.
5239 InstructionCost LoopCost = 0;
5240
5241 for (auto *BB : L->getBlocks()) {
5242 for (auto &I : *BB) {
5243 SmallVector<const Value *, 4> Operands(I.operand_values());
5244 InstructionCost Cost =
5245 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5246 // This can happen with intrinsics that don't currently have a cost model
5247 // or for some operations that require SVE.
5248 if (!Cost.isValid())
5249 return false;
5250
5251 LoopCost += Cost;
       // Bail out as soon as the running total exceeds the budget; the exact
       // final size is irrelevant once we know it is over.
5252 if (LoopCost > Budget)
5253 return false;
5254 }
5255 }
5256
     // FinalSize is optional; when non-null it receives the computed size.
5257 if (FinalSize)
5258 *FinalSize = LoopCost.getValue();
5259 return true;
5260}
5261
5263 const AArch64TTIImpl &TTI) {
5264 // Only consider loops with unknown trip counts for which we can determine
5265 // a symbolic expression. Multi-exit loops with small known trip counts will
5266 // likely be unrolled anyway.
5267 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5269 return false;
5270
5271 // It might not be worth unrolling loops with low max trip counts. Restrict
5272 // this to max trip counts > 32 for now.
5273 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5274 if (MaxTC > 0 && MaxTC <= 32)
5275 return false;
5276
5277 // Make sure the loop size is <= 5.
5278 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5279 return false;
5280
5281 // Small search loops with multiple exits can be highly beneficial to unroll.
5282 // We only care about loops with exactly two exiting blocks, although each
5283 // block could jump to the same exit block.
5284 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5285 if (Blocks.size() != 2)
5286 return false;
5287
     // Every block in the loop must pass the predicate below (the condition
     // text itself sits on an elided line) for unrolling to remain attractive.
5288 if (any_of(Blocks, [](BasicBlock *BB) {
5290 }))
5291 return false;
5292
5293 return true;
5294}
5295
5296/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5297/// OOO engine's wide instruction window and various predictors.
5298static void
5301 const AArch64TTIImpl &TTI) {
5302 // Limit loops with structure that is highly likely to benefit from runtime
5303 // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
5304 // likely with complex control flow). Note that the heuristics here may be
5305 // overly conservative and we err on the side of avoiding runtime unrolling
5306 // rather than unroll excessively. They are all subject to further refinement.
5307 if (!L->isInnermost() || L->getNumBlocks() > 8)
5308 return;
5309
5310 // Loops with multiple exits are handled by common code.
5311 if (!L->getExitBlock())
5312 return;
5313
5314 // Check if the loop contains any reductions that could be parallelized when
5315 // unrolling. If so, enable partial unrolling, if the trip count is known to
5316 // be a multiple of 2.
5317 bool HasParellelizableReductions =
5318 L->getNumBlocks() == 1 &&
5319 any_of(L->getHeader()->phis(),
5320 [&SE, L](PHINode &Phi) {
5321 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5322 }) &&
5323 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5324 if (HasParellelizableReductions &&
5325 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5326 UP.Partial = true;
5327 UP.MaxCount = 4;
5328 UP.AddAdditionalAccumulators = true;
5329 }
5330
5331 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5333 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5334 SE.getSmallConstantMaxTripCount(L) <= 32))
5335 return;
5336
     // Already-vectorized loops gain little from further runtime unrolling.
5337 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5338 return;
5339
5341 return;
5342
5343 // Limit to loops with trip counts that are cheap to expand.
5344 UP.SCEVExpansionBudget = 1;
5345
5346 if (HasParellelizableReductions) {
5347 UP.Runtime = true;
5349 UP.AddAdditionalAccumulators = true;
5350 }
5351
5352 // Try to unroll small loops, of few-blocks with low budget, if they have
5353 // load/store dependencies, to expose more parallel memory access streams,
5354 // or if they do little work inside a block (i.e. load -> X -> store pattern).
5355 BasicBlock *Header = L->getHeader();
5356 BasicBlock *Latch = L->getLoopLatch();
5357 if (Header == Latch) {
5358 // Estimate the size of the loop.
5359 unsigned Size;
5360 unsigned Width = 10;
5361 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5362 return;
5363
5364 // Try to find an unroll count that maximizes the use of the instruction
5365 // window, i.e. trying to fetch as many instructions per cycle as possible.
5366 unsigned MaxInstsPerLine = 16;
5367 unsigned UC = 1;
5368 unsigned BestUC = 1;
5369 unsigned SizeWithBestUC = BestUC * Size;
     // Candidate counts are capped at 8 and at a total unrolled size of 48.
5370 while (UC <= 8) {
5371 unsigned SizeWithUC = UC * Size;
5372 if (SizeWithUC > 48)
5373 break;
5374 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5375 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5376 BestUC = UC;
5377 SizeWithBestUC = BestUC * Size;
5378 }
5379 UC++;
5380 }
5381
5382 if (BestUC == 1)
5383 return;
5384
     // Collect loop-varying loaded values (plus their first in-loop users)
     // and loop-varying stores, to detect load -> ... -> store chains.
5385 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5387 for (auto *BB : L->blocks()) {
5388 for (auto &I : *BB) {
5390 if (!Ptr)
5391 continue;
5392 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5393 if (SE.isLoopInvariant(PtrSCEV, L))
5394 continue;
5395 if (isa<LoadInst>(&I)) {
5396 LoadedValuesPlus.insert(&I);
5397 // Include in-loop 1st users of loaded values.
5398 for (auto *U : I.users())
5399 if (L->contains(cast<Instruction>(U)))
5400 LoadedValuesPlus.insert(U);
5401 } else
5402 Stores.push_back(cast<StoreInst>(&I));
5403 }
5404 }
5405
     // Only unroll when at least one store consumes a (user of a) loaded value.
5406 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5407 return LoadedValuesPlus.contains(SI->getOperand(0));
5408 }))
5409 return;
5410
5411 UP.Runtime = true;
5412 UP.DefaultUnrollRuntimeCount = BestUC;
5413 return;
5414 }
5415
5416 // Try to runtime-unroll loops with early-continues depending on loop-varying
5417 // loads; this helps with branch-prediction for the early-continues.
5418 auto *Term = dyn_cast<CondBrInst>(Header->getTerminator());
5420 if (!Term || Preds.size() == 1 || !llvm::is_contained(Preds, Header) ||
5421 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5422 return;
5423
     // Bounded (depth <= 8) recursive walk of the operand graph looking for an
     // in-loop load feeding the branch condition.
5424 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5425 [&](Instruction *I, unsigned Depth) -> bool {
5426 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5427 return false;
5428
5429 if (isa<LoadInst>(I))
5430 return true;
5431
5432 return any_of(I->operands(), [&](Value *V) {
5433 auto *I = dyn_cast<Instruction>(V);
5434 return I && DependsOnLoopLoad(I, Depth + 1);
5435 });
5436 };
5437 CmpPredicate Pred;
5438 Instruction *I;
5439 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5440 m_Value())) &&
5441 DependsOnLoopLoad(I, 0)) {
5442 UP.Runtime = true;
5443 }
5444}
5445
5448 OptimizationRemarkEmitter *ORE) const {
5449 // Enable partial unrolling and runtime unrolling.
5450 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5451
5452 UP.UpperBound = true;
5453
5454 // For inner loop, it is more likely to be a hot one, and the runtime check
5455 // can be promoted out from LICM pass, so the overhead is less, let's try
5456 // a larger threshold to unroll more loops.
5457 if (L->getLoopDepth() > 1)
5458 UP.PartialThreshold *= 2;
5459
5460 // Disable partial & runtime unrolling on -Os.
5462
5463 // Scan the loop: don't unroll loops with calls as this could prevent
5464 // inlining. Don't unroll auto-vectorized loops either, though do allow
5465 // unrolling of the scalar remainder.
5466 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5468 for (auto *BB : L->getBlocks()) {
5469 for (auto &I : *BB) {
5470 // Both auto-vectorized loops and the scalar remainder have the
5471 // isvectorized attribute, so differentiate between them by the presence
5472 // of vector instructions.
5473 if (IsVectorized && I.getType()->isVectorTy())
5474 return;
5475 if (isa<CallBase>(I)) {
       // Calls that fold away (not lowered to real calls) are harmless.
5478 if (!isLoweredToCall(F))
5479 continue;
5480 return;
5481 }
5482
5483 SmallVector<const Value *, 4> Operands(I.operand_values());
5484 Cost += getInstructionCost(&I, Operands,
5486 }
5487 }
5488
5489 // Apply subtarget-specific unrolling preferences.
5490 if (ST->isAppleMLike())
5491 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5492 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5495
5496 // If this is a small, multi-exit loop similar to something like std::find,
5497 // then there is typically a performance improvement achieved by unrolling.
5498 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5499 UP.RuntimeUnrollMultiExit = true;
5500 UP.Runtime = true;
5501 // Limit unroll count.
5503 // Allow slightly more costly trip-count expansion to catch search loops
5504 // with pointer inductions.
5505 UP.SCEVExpansionBudget = 5;
5506 return;
5507 }
5508
5509 // Enable runtime unrolling for in-order models
5510 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
5511 // checking for that case, we can ensure that the default behaviour is
5512 // unchanged
5513 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5514 !ST->getSchedModel().isOutOfOrder()) {
5515 UP.Runtime = true;
5516 UP.Partial = true;
5517 UP.UnrollRemainder = true;
5519
5520 UP.UnrollAndJam = true;
5522 }
5523
5524 // Forcing unrolling of small loops can be very useful because of the branch
5525 // taken cost of the backedge.
5527 UP.Force = true;
5528}
5529
5534
5536 Type *ExpectedType,
5537 bool CanCreate) const {
5538 switch (Inst->getIntrinsicID()) {
5539 default:
5540 return nullptr;
5541 case Intrinsic::aarch64_neon_st2:
5542 case Intrinsic::aarch64_neon_st3:
5543 case Intrinsic::aarch64_neon_st4: {
5544 // Create a struct type that mirrors the stored values; the last stN
5545 // argument is the pointer, the preceding ones are the values written.
5545 StructType *ST = dyn_cast<StructType>(ExpectedType);
5546 if (!CanCreate || !ST)
5547 return nullptr;
5548 unsigned NumElts = Inst->arg_size() - 1;
5549 if (ST->getNumElements() != NumElts)
5550 return nullptr;
5551 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5552 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5553 return nullptr;
5554 }
       // Re-pack the stored operands into the expected struct value.
5555 Value *Res = PoisonValue::get(ExpectedType);
5556 IRBuilder<> Builder(Inst);
5557 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5558 Value *L = Inst->getArgOperand(i);
5559 Res = Builder.CreateInsertValue(Res, L, i);
5560 }
5561 return Res;
5562 }
5563 case Intrinsic::aarch64_neon_ld2:
5564 case Intrinsic::aarch64_neon_ld3:
5565 case Intrinsic::aarch64_neon_ld4:
     // For ldN the intrinsic's own result is reusable if the type matches.
5566 if (Inst->getType() == ExpectedType)
5567 return Inst;
5568 return nullptr;
5569 }
5570}
5571
5573 MemIntrinsicInfo &Info) const {
     // First switch: record direction (read vs. write) and the pointer operand.
     // For ldN the pointer is the first argument; for stN it is the last.
5574 switch (Inst->getIntrinsicID()) {
5575 default:
5576 break;
5577 case Intrinsic::aarch64_neon_ld2:
5578 case Intrinsic::aarch64_neon_ld3:
5579 case Intrinsic::aarch64_neon_ld4:
5580 Info.ReadMem = true;
5581 Info.WriteMem = false;
5582 Info.PtrVal = Inst->getArgOperand(0);
5583 break;
5584 case Intrinsic::aarch64_neon_st2:
5585 case Intrinsic::aarch64_neon_st3:
5586 case Intrinsic::aarch64_neon_st4:
5587 Info.ReadMem = false;
5588 Info.WriteMem = true;
5589 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5590 break;
5591 }
5592
     // Second switch: map ldN/stN pairs onto a shared MatchingId so that an
     // ldK can be matched against the corresponding stK. Anything else is not
     // a recognized target memory intrinsic.
5593 switch (Inst->getIntrinsicID()) {
5594 default:
5595 return false;
5596 case Intrinsic::aarch64_neon_ld2:
5597 case Intrinsic::aarch64_neon_st2:
5598 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5599 break;
5600 case Intrinsic::aarch64_neon_ld3:
5601 case Intrinsic::aarch64_neon_st3:
5602 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5603 break;
5604 case Intrinsic::aarch64_neon_ld4:
5605 case Intrinsic::aarch64_neon_st4:
5606 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5607 break;
5608 }
5609 return true;
5610}
5611
5612/// See if \p I should be considered for address type promotion. We check if \p
5613/// I is a sext with right type and used in memory accesses. If it used in a
5614/// "complex" getelementptr, we allow it to be promoted without finding other
5615/// sext instructions that sign extended the same initial value. A getelementptr
5616/// is considered as "complex" if it has more than 2 operands.
5618 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5619 bool Considerable = false;
5620 AllowPromotionWithoutCommonHeader = false;
     // Only sext-to-i64 instructions qualify; everything else is rejected up
     // front.
5621 if (!isa<SExtInst>(&I))
5622 return false;
5623 Type *ConsideredSExtType =
5624 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5625 if (I.getType() != ConsideredSExtType)
5626 return false;
5627 // See if the sext is the one with the right type and used in at least one
5628 // GetElementPtrInst.
5629 for (const User *U : I.users()) {
5630 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5631 Considerable = true;
5632 // A getelementptr is considered as "complex" if it has more than 2
5633 // operands. We will promote a SExt used in such complex GEP as we
5634 // expect some computation to be merged if they are done on 64 bits.
5635 if (GEPInst->getNumOperands() > 2) {
5636 AllowPromotionWithoutCommonHeader = true;
5637 break;
5638 }
5639 }
5640 }
5641 return Considerable;
5642}
5643
5645 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
     // Fixed-width reductions are always legal; the checks below apply to
     // scalable VFs only.
5646 if (!VF.isScalable())
5647 return true;
5648
5649 Type *Ty = RdxDesc.getRecurrenceType();
5650 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5651 return false;
5652
     // Whitelist of recurrence kinds we can lower for scalable vectors.
5653 switch (RdxDesc.getRecurrenceKind()) {
5654 case RecurKind::Sub:
5656 case RecurKind::Add:
5657 case RecurKind::FAdd:
5658 case RecurKind::And:
5659 case RecurKind::Or:
5660 case RecurKind::Xor:
5661 case RecurKind::SMin:
5662 case RecurKind::SMax:
5663 case RecurKind::UMin:
5664 case RecurKind::UMax:
5665 case RecurKind::FMin:
5666 case RecurKind::FMax:
5667 case RecurKind::FMulAdd:
5668 case RecurKind::AnyOf:
5670 return true;
5671 default:
5672 return false;
5673 }
5674}
5675
5678 FastMathFlags FMF,
5680 // The code-generator is currently not able to handle scalable vectors
5681 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5682 // it. This change will be removed when code-generation for these types is
5683 // sufficiently reliable.
5684 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5685 if (VTy->getElementCount() == ElementCount::getScalable(1))
5687
5688 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5689
     // Without full fp16, f16 min/max reductions are not natively costed here.
5690 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5691 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5692
     // Each extra legalization split needs one pairwise min/max combine of the
     // two legal halves, hence (LT.first - 1) intrinsic costs.
5693 InstructionCost LegalizationCost = 0;
5694 if (LT.first > 1) {
5695 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5696 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5697 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5698 }
5699
5700 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5701}
5702
5704 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5705 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
     // Splitting during legalization costs one extra arithmetic op per split.
5706 InstructionCost LegalizationCost = 0;
5707 if (LT.first > 1) {
5708 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5709 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5710 LegalizationCost *= LT.first - 1;
5711 }
5712
5713 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5714 assert(ISD && "Invalid opcode");
5715 // Add the final reduction cost for the legal horizontal reduction
5716 switch (ISD) {
5717 case ISD::ADD:
5718 case ISD::AND:
5719 case ISD::OR:
5720 case ISD::XOR:
5721 case ISD::FADD:
5722 return LegalizationCost + 2;
5723 default:
5725 }
5726}
5727
5730 std::optional<FastMathFlags> FMF,
5732 // The code-generator is currently not able to handle scalable vectors
5733 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5734 // it. This change will be removed when code-generation for these types is
5735 // sufficiently reliable.
5736 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5737 if (VTy->getElementCount() == ElementCount::getScalable(1))
5739
5741 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5742 InstructionCost BaseCost =
5743 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5744 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5745 // end up vectorizing for more computationally intensive loops.
5746 return BaseCost + FixedVTy->getNumElements();
5747 }
5748
5749 if (Opcode != Instruction::FAdd)
5751
     // Ordered scalable FAdd: modeled as one scalar fadd per (max) element.
5752 auto *VTy = cast<ScalableVectorType>(ValTy);
5754 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5755 Cost *= getMaxNumElements(VTy->getElementCount());
5756 return Cost;
5757 }
5758
5759 if (isa<ScalableVectorType>(ValTy))
5760 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5761
5762 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5763 MVT MTy = LT.second;
5764 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5765 assert(ISD && "Invalid opcode");
5766
5767 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5768 // instructions as twice a normal vector add, plus 1 for each legalization
5769 // step (LT.first). This is the only arithmetic vector reduction operation for
5770 // which we have an instruction.
5771 // OR, XOR and AND costs should match the codegen from:
5772 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5773 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5774 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5775 static const CostTblEntry CostTblNoPairwise[]{
5776 {ISD::ADD, MVT::v8i8, 2},
5777 {ISD::ADD, MVT::v16i8, 2},
5778 {ISD::ADD, MVT::v4i16, 2},
5779 {ISD::ADD, MVT::v8i16, 2},
5780 {ISD::ADD, MVT::v2i32, 2},
5781 {ISD::ADD, MVT::v4i32, 2},
5782 {ISD::ADD, MVT::v2i64, 2},
5783 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5784 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5785 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5786 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5787 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5788 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5789 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5790 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5791 {ISD::XOR, MVT::v16i8, 7},
5792 {ISD::XOR, MVT::v4i16, 4},
5793 {ISD::XOR, MVT::v8i16, 6},
5794 {ISD::XOR, MVT::v2i32, 3},
5795 {ISD::XOR, MVT::v4i32, 5},
5796 {ISD::XOR, MVT::v2i64, 3},
5797 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5798 {ISD::AND, MVT::v16i8, 7},
5799 {ISD::AND, MVT::v4i16, 4},
5800 {ISD::AND, MVT::v8i16, 6},
5801 {ISD::AND, MVT::v2i32, 3},
5802 {ISD::AND, MVT::v4i32, 5},
5803 {ISD::AND, MVT::v2i64, 3},
5804 };
     // Dispatch on the ISD opcode; unmatched cases fall through to the base
     // cost model at the bottom.
5805 switch (ISD) {
5806 default:
5807 break;
5808 case ISD::FADD:
5809 if (Type *EltTy = ValTy->getScalarType();
5810 // FIXME: For half types without fullfp16 support, this could extend and
5811 // use a fp32 faddp reduction but current codegen unrolls.
5812 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5813 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5814 const unsigned NElts = MTy.getVectorNumElements();
5815 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5816 isPowerOf2_32(NElts))
5817 // Reduction corresponding to series of fadd instructions is lowered to
5818 // series of faddp instructions. faddp has latency/throughput that
5819 // matches fadd instruction and hence, every faddp instruction can be
5820 // considered to have a relative cost = 1 with
5821 // CostKind = TCK_RecipThroughput.
5822 // An faddp will pairwise add vector elements, so the size of input
5823 // vector reduces by half every time, requiring
5824 // #(faddp instructions) = log2_32(NElts).
5825 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5826 }
5827 break;
5828 case ISD::ADD:
5829 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5830 return (LT.first - 1) + Entry->Cost;
5831 break;
5832 case ISD::XOR:
5833 case ISD::AND:
5834 case ISD::OR:
5835 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5836 if (!Entry)
5837 break;
5838 auto *ValVTy = cast<FixedVectorType>(ValTy);
5839 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5840 isPowerOf2_32(ValVTy->getNumElements())) {
5841 InstructionCost ExtraCost = 0;
5842 if (LT.first != 1) {
5843 // Type needs to be split, so there is an extra cost of LT.first - 1
5844 // arithmetic ops.
5845 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5846 MTy.getVectorNumElements());
5847 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5848 ExtraCost *= LT.first - 1;
5849 }
5850 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5851 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5852 return Cost + ExtraCost;
5853 }
5854 break;
5855 }
5856 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5857}
5858
5860 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5861 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5862 EVT VecVT = TLI->getValueType(DL, VecTy);
5863 EVT ResVT = TLI->getValueType(DL, ResTy);
5864
     // Only extending *add* reductions on simple, at-least-64-bit vectors can
     // be lowered to the UADDLV/UADDLP family; everything else goes to BaseT.
5865 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5866 VecVT.getSizeInBits() >= 64) {
5867 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5868
5869 // The legal cases are:
5870 // UADDLV 8/16/32->32
5871 // UADDLP 32->64
5872 unsigned RevVTSize = ResVT.getSizeInBits();
5873 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5874 RevVTSize <= 32) ||
5875 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5876 RevVTSize <= 32) ||
5877 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5878 RevVTSize <= 64))
       // Two units for the reduction itself, plus two per extra split.
5879 return (LT.first - 1) * 2 + 2;
5880 }
5881
5882 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5883 CostKind);
5884}
5885
5887AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5888 Type *ResTy, VectorType *VecTy,
5890 EVT VecVT = TLI->getValueType(DL, VecTy);
5891 EVT ResVT = TLI->getValueType(DL, ResTy);
5892
     // Only add-reductions can use the dot-product instructions, and only when
     // the subtarget actually has them.
5893 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5894 RedOpcode == Instruction::Add) {
5895 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5896
5897 // The legal cases with dotprod are
5898 // UDOT 8->32
5899 // Which requires an additional uaddv to sum the i32 values.
5900 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5901 ResVT == MVT::i32)
5902 return LT.first + 2;
5903 }
5904
5905 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5906 CostKind);
5907}
5908
     // Per-type cost of a single SVE splice; all supported scalable types map
     // to one instruction.
5912 static const CostTblEntry ShuffleTbl[] = {
5913 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5914 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5915 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5916 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5917 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5918 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5919 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5920 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5921 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5922 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5923 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5924 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5925 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5926 };
5927
5928 // The code-generator is currently not able to handle scalable vectors
5929 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5930 // it. This change will be removed when code-generation for these types is
5931 // sufficiently reliable.
5934
5935 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5936 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
     // i1 vectors are promoted before the splice; cost on the promoted type.
5937 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5938 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5939 : LT.second;
5940 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5941 InstructionCost LegalizationCost = 0;
     // A negative index requires an icmp + select to build the splice mask.
5942 if (Index < 0) {
5943 LegalizationCost =
5944 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5946 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5948 }
5949
5950 // Predicated splices are promoted when lowering. See AArch64ISelLowering.cpp
5951 // Cost performed on a promoted type.
5952 if (LT.second.getScalarType() == MVT::i1) {
5953 LegalizationCost +=
5954 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5956 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5958 }
5959 const auto *Entry =
5960 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5961 assert(Entry && "Illegal Type for Splice");
5962 LegalizationCost += Entry->Cost;
5963 return LegalizationCost * LT.first;
5964}
5965
5967 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5969 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5970 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
5972
5974 return Invalid;
5975
5976 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
5977 Opcode != Instruction::FAdd) ||
5978 OpAExtend == TTI::PR_None)
5979 return Invalid;
5980
5981 // Floating-point partial reductions are invalid if `reassoc` and `contract`
5982 // are not allowed.
5983 if (AccumType->isFloatingPointTy()) {
5984 assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
5985 if (!FMF->allowReassoc() || !FMF->allowContract())
5986 return Invalid;
5987 } else {
5988 assert(!FMF &&
5989 "FastMathFlags only apply to floating-point partial reductions");
5990 }
5991
5992 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5993 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
5994 "Unexpected values for OpBExtend or InputTypeB");
5995
5996 // We only support multiply binary operations for now, and for muls we
5997 // require the types being extended to be the same.
5998 if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
5999 InputTypeA != InputTypeB))
6000 return Invalid;
6001
6002 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
6003 if (IsUSDot && !ST->hasMatMulInt8())
6004 // FIXME: Remove this early bailout in favour of expand cost.
6005 return Invalid;
6006
6007 unsigned Ratio =
6008 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
6009 if (VF.getKnownMinValue() <= Ratio)
6010 return Invalid;
6011
6012 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
6013 VectorType *AccumVectorType =
6014 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
6015 // We don't yet support all kinds of legalization.
6016 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
6017 EVT::getEVT(AccumVectorType));
6018 switch (TC.first) {
6019 default:
6020 return Invalid;
6024 // The legalised type (e.g. after splitting) must be legal too.
6025 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
6027 return Invalid;
6028 break;
6029 }
6030
6031 std::pair<InstructionCost, MVT> AccumLT =
6032 getTypeLegalizationCost(AccumVectorType);
6033 std::pair<InstructionCost, MVT> InputLT =
6034 getTypeLegalizationCost(InputVectorType);
6035
6036 // Returns true if the subtarget supports the operation for a given type.
6037 auto IsSupported = [&](bool SVEPred, bool NEONPred) -> bool {
6038 return (ST->isSVEorStreamingSVEAvailable() && SVEPred) ||
6039 (AccumLT.second.isFixedLengthVector() &&
6040 AccumLT.second.getSizeInBits() <= 128 && ST->isNeonAvailable() &&
6041 NEONPred);
6042 };
6043
6044 bool IsSub = Opcode == Instruction::Sub;
6045 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
6046
6047 if (AccumLT.second.getScalarType() == MVT::i32 &&
6048 InputLT.second.getScalarType() == MVT::i8 && !IsSub) {
6049 // i8 -> i32 is natively supported with udot/sdot for both NEON and SVE.
6050 if (!IsUSDot && IsSupported(true, ST->hasDotProd()))
6051 return Cost;
6052 // i8 -> i32 usdot requires +i8mm
6053 if (IsUSDot && IsSupported(ST->hasMatMulInt8(), ST->hasMatMulInt8()))
6054 return Cost;
6055 }
6056
6057 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot && !IsSub) {
6058 // i16 -> i64 is natively supported for udot/sdot
6059 if (AccumLT.second.getScalarType() == MVT::i64 &&
6060 InputLT.second.getScalarType() == MVT::i16)
6061 return Cost;
6062 // i16 -> i32 is natively supported with SVE2p1
6063 if (AccumLT.second.getScalarType() == MVT::i32 &&
6064 InputLT.second.getScalarType() == MVT::i16 &&
6065 (ST->hasSVE2p1() || ST->hasSME2()))
6066 return Cost;
6067 // i8 -> i64 is supported with an extra level of extends
6068 if (AccumLT.second.getScalarType() == MVT::i64 &&
6069 InputLT.second.getScalarType() == MVT::i8)
6070 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
6071 // because it requires two extra extends on the inputs. But if we'd change
6072 // that now, a regular reduction would be cheaper because the costs of
6073 // the extends in the IR are still counted. This can be fixed
6074 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
6075 return Cost;
6076 // i8 -> i16 is natively supported with SVE2p3
6077 if (AccumLT.second.getScalarType() == MVT::i16 &&
6078 InputLT.second.getScalarType() == MVT::i8 &&
6079 (ST->hasSVE2p3() || ST->hasSME2p3()))
6080 return Cost;
6081 }
6082
6083 // f16 -> f32 is natively supported for fdot using either
6084 // SVE or NEON instruction.
6085 if (Opcode == Instruction::FAdd && !IsSub &&
6086 IsSupported(ST->hasSME2() || ST->hasSVE2p1(), ST->hasF16F32DOT()) &&
6087 AccumLT.second.getScalarType() == MVT::f32 &&
6088 InputLT.second.getScalarType() == MVT::f16)
6089 return Cost;
6090
6091 // For a ratio of 2, we can use *mlal top/bottom instructions.
6092 if (Ratio == 2 && !IsSub) {
6093 MVT InVT = InputLT.second.getScalarType();
6094
6095 // SVE2 [us]mlalb/t and NEON [us]mlal(2)
6096 if (IsSupported(ST->hasSVE2(), true) &&
6097 llvm::is_contained({MVT::i8, MVT::i16, MVT::i32}, InVT.SimpleTy))
6098 return Cost * 2;
6099
6100 // SVE2 fmlalb/t and NEON fmlal(2)
6101 if (IsSupported(ST->hasSVE2(), ST->hasFP16FML()) && InVT == MVT::f16)
6102 return Cost * 2;
6103
6104 // SVE and NEON bfmlalb/t
6105 if (IsSupported(ST->hasBF16(), ST->hasBF16()) && InVT == MVT::bf16)
6106 return Cost * 2;
6107 }
6108
6110 Opcode, InputTypeA, InputTypeB, AccumType, VF, OpAExtend, OpBExtend,
6111 BinOp, CostKind, FMF);
6112
6113 // Slightly lower the cost of a sub reduction so that it can be considered
6114 // as candidate for 'cdot' operations. This is a somewhat arbitrary number,
6115 // because we don't yet model these operations directly.
6116 return ExpandCost.isValid() && IsSub ? ((8 * ExpandCost) / 10) : ExpandCost;
6117}
6118
6121 VectorType *SrcTy, ArrayRef<int> Mask,
6122 TTI::TargetCostKind CostKind, int Index,
6124 const Instruction *CxtI) const {
6125 assert((Mask.empty() || DstTy->isScalableTy() ||
6126 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
6127 "Expected the Mask to match the return size if given");
6128 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
6129 "Expected the same scalar types");
6130 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
6131
6132 // If we have a Mask, and the LT is being legalized somehow, split the Mask
6133 // into smaller vectors and sum the cost of each shuffle.
6134 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
6135 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
6136 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
6137 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
6138 // Check for LD3/LD4 instructions, which are represented in llvm IR as
6139 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
6140 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
6141 // cost than just the load.
6142 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
6145 return std::max<InstructionCost>(1, LT.first / 4);
6146
6147 // Check for ST3/ST4 instructions, which are represented in llvm IR as
6148 // store(interleaving-shuffle). The shuffle cost could potentially be free,
6149 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
6150 // cost than just the store.
6151 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
6153 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
6155 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
6156 return LT.first;
6157
6158 unsigned TpNumElts = Mask.size();
6159 unsigned LTNumElts = LT.second.getVectorNumElements();
6160 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
6161 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
6162 LT.second.getVectorElementCount());
6164 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
6165 PreviousCosts;
6166 for (unsigned N = 0; N < NumVecs; N++) {
6167 SmallVector<int> NMask;
6168 // Split the existing mask into chunks of size LTNumElts. Track the source
6169 // sub-vectors to ensure the result has at most 2 inputs.
6170 unsigned Source1 = -1U, Source2 = -1U;
6171 unsigned NumSources = 0;
6172 for (unsigned E = 0; E < LTNumElts; E++) {
6173 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
6175 if (MaskElt < 0) {
6177 continue;
6178 }
6179
6180 // Calculate which source from the input this comes from and whether it
6181 // is new to us.
6182 unsigned Source = MaskElt / LTNumElts;
6183 if (NumSources == 0) {
6184 Source1 = Source;
6185 NumSources = 1;
6186 } else if (NumSources == 1 && Source != Source1) {
6187 Source2 = Source;
6188 NumSources = 2;
6189 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6190 NumSources++;
6191 }
6192
6193 // Add to the new mask. For the NumSources>2 case these are not correct,
6194 // but are only used for the modular lane number.
6195 if (Source == Source1)
6196 NMask.push_back(MaskElt % LTNumElts);
6197 else if (Source == Source2)
6198 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
6199 else
6200 NMask.push_back(MaskElt % LTNumElts);
6201 }
6202 // Check if we have already generated this sub-shuffle, which means we
6203 // will have already generated the output. For example a <16 x i32> splat
6204 // will be the same sub-splat 4 times, which only needs to be generated
6205 // once and reused.
6206 auto Result =
6207 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6208 // Check if it was already in the map (already costed).
6209 if (!Result.second)
6210 continue;
6211 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6212 // getShuffleCost. If not then cost it using the worst case as the number
6213 // of element moves into a new vector.
6214 InstructionCost NCost =
6215 NumSources <= 2
6216 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6218 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6219 CxtI)
6220 : LTNumElts;
6221 Result.first->second = NCost;
6222 Cost += NCost;
6223 }
6224 return Cost;
6225 }
6226
6227 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6228 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6229 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6230 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6231 // This currently only handles low or high extracts to prevent SLP vectorizer
6232 // regressions.
6233 // Note that SVE's ext instruction is destructive, but it can be fused with
6234 // a movprfx to act like a constructive instruction.
6235 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6236 if (LT.second.getFixedSizeInBits() >= 128 &&
6237 cast<FixedVectorType>(SubTp)->getNumElements() ==
6238 LT.second.getVectorNumElements() / 2) {
6239 if (Index == 0)
6240 return 0;
6241 if (Index == (int)LT.second.getVectorNumElements() / 2)
6242 return 1;
6243 }
6245 }
6246 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6247 // the code to handle length-changing shuffles.
6248 if (Kind == TTI::SK_InsertSubvector) {
6249 LT = getTypeLegalizationCost(DstTy);
6250 SrcTy = DstTy;
6251 }
6252
6253 // Check for identity masks, which we can treat as free for both fixed and
6254 // scalable vector paths.
6255 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6256 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6257 all_of(enumerate(Mask), [](const auto &M) {
6258 return M.value() < 0 || M.value() == (int)M.index();
6259 }))
6260 return 0;
6261
6262 // Segmented shuffle matching.
6263 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6264 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6265 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6267
6269 unsigned Segments =
6271 unsigned SegmentElts = VTy->getNumElements() / Segments;
6272
6273 // dupq zd.t, zn.t[idx]
6274 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6275 ST->isSVEorStreamingSVEAvailable() &&
6276 isDUPQMask(Mask, Segments, SegmentElts))
6277 return LT.first;
6278
6279 // mov zd.q, vn
6280 if (ST->isSVEorStreamingSVEAvailable() &&
6281 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6282 return LT.first;
6283 }
6284
6285 // Check for broadcast loads, which are supported by the LD1R instruction.
6286 // In terms of code-size, the shuffle vector is free when a load + dup get
6287 // folded into a LD1R. That's what we check and return here. For performance
6288 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6289 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6290 // that we model the load + dup sequence slightly higher because LD1R is a
6291 // high latency instruction.
6292 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6293 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6294 if (IsLoad && LT.second.isVector() &&
6295 isLegalBroadcastLoad(SrcTy->getElementType(),
6296 LT.second.getVectorElementCount()))
6297 return 0;
6298 }
6299
6300 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6301 // from the perfect shuffle tables.
6302 if (Mask.size() == 4 &&
6303 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6304 (SrcTy->getScalarSizeInBits() == 16 ||
6305 SrcTy->getScalarSizeInBits() == 32) &&
6306 all_of(Mask, [](int E) { return E < 8; }))
6307 return getPerfectShuffleCost(Mask);
6308
6309 // Check for other shuffles that are not SK_ kinds but we have native
6310 // instructions for, for example ZIP and UZP.
6311 unsigned Unused;
6312 if (LT.second.isFixedLengthVector() &&
6313 LT.second.getVectorNumElements() == Mask.size() &&
6314 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
6315 // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
6316 // mean that we can end up with shuffles that satisfy isTRNMask, but end
6317 // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
6318 Kind == TTI::SK_InsertSubvector) &&
6319 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6320 isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6321 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6322 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6323 LT.second.getVectorNumElements(), 16) ||
6324 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6325 LT.second.getVectorNumElements(), 32) ||
6326 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6327 LT.second.getVectorNumElements(), 64) ||
6328 // Check for non-zero lane splats
6329 all_of(drop_begin(Mask),
6330 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6331 return 1;
6332
6333 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6334 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6335 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6336 static const CostTblEntry ShuffleTbl[] = {
6337 // Broadcast shuffle kinds can be performed with 'dup'.
6338 {TTI::SK_Broadcast, MVT::v8i8, 1},
6339 {TTI::SK_Broadcast, MVT::v16i8, 1},
6340 {TTI::SK_Broadcast, MVT::v4i16, 1},
6341 {TTI::SK_Broadcast, MVT::v8i16, 1},
6342 {TTI::SK_Broadcast, MVT::v2i32, 1},
6343 {TTI::SK_Broadcast, MVT::v4i32, 1},
6344 {TTI::SK_Broadcast, MVT::v2i64, 1},
6345 {TTI::SK_Broadcast, MVT::v4f16, 1},
6346 {TTI::SK_Broadcast, MVT::v8f16, 1},
6347 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6348 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6349 {TTI::SK_Broadcast, MVT::v2f32, 1},
6350 {TTI::SK_Broadcast, MVT::v4f32, 1},
6351 {TTI::SK_Broadcast, MVT::v2f64, 1},
6352 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6353 // 'zip1/zip2' instructions.
6354 {TTI::SK_Transpose, MVT::v8i8, 1},
6355 {TTI::SK_Transpose, MVT::v16i8, 1},
6356 {TTI::SK_Transpose, MVT::v4i16, 1},
6357 {TTI::SK_Transpose, MVT::v8i16, 1},
6358 {TTI::SK_Transpose, MVT::v2i32, 1},
6359 {TTI::SK_Transpose, MVT::v4i32, 1},
6360 {TTI::SK_Transpose, MVT::v2i64, 1},
6361 {TTI::SK_Transpose, MVT::v4f16, 1},
6362 {TTI::SK_Transpose, MVT::v8f16, 1},
6363 {TTI::SK_Transpose, MVT::v4bf16, 1},
6364 {TTI::SK_Transpose, MVT::v8bf16, 1},
6365 {TTI::SK_Transpose, MVT::v2f32, 1},
6366 {TTI::SK_Transpose, MVT::v4f32, 1},
6367 {TTI::SK_Transpose, MVT::v2f64, 1},
6368 // Select shuffle kinds.
6369 // TODO: handle vXi8/vXi16.
6370 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6371 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6372 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6373 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6374 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6375 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6376 // PermuteSingleSrc shuffle kinds.
6377 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6378 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6379 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6380 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6381 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6382 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6383 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6384 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6385 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6386 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6387 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6388 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6389 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6390 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6391 // Reverse can be lowered with `rev`.
6392 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6393 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6394 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6395 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6396 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6397 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6398 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6399 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6400 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6401 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6402 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6403 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6404 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6405 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6406 // Splice can all be lowered as `ext`.
6407 {TTI::SK_Splice, MVT::v2i32, 1},
6408 {TTI::SK_Splice, MVT::v4i32, 1},
6409 {TTI::SK_Splice, MVT::v2i64, 1},
6410 {TTI::SK_Splice, MVT::v2f32, 1},
6411 {TTI::SK_Splice, MVT::v4f32, 1},
6412 {TTI::SK_Splice, MVT::v2f64, 1},
6413 {TTI::SK_Splice, MVT::v8f16, 1},
6414 {TTI::SK_Splice, MVT::v8bf16, 1},
6415 {TTI::SK_Splice, MVT::v8i16, 1},
6416 {TTI::SK_Splice, MVT::v16i8, 1},
6417 {TTI::SK_Splice, MVT::v4f16, 1},
6418 {TTI::SK_Splice, MVT::v4bf16, 1},
6419 {TTI::SK_Splice, MVT::v4i16, 1},
6420 {TTI::SK_Splice, MVT::v8i8, 1},
6421 // Broadcast shuffle kinds for scalable vectors
6422 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6423 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6424 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6425 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6426 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6427 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6428 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6429 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6430 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6431 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6432 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6433 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6434 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6435 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6436 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6437 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6438 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6439 // Handle the cases for vector.reverse with scalable vectors
6440 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6441 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6442 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6443 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6444 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6445 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6446 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6447 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6448 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6449 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6450 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6451 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6452 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6453 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6454 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6455 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6456 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6457 };
6458 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6459 return LT.first * Entry->Cost;
6460 }
6461
6462 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6463 return getSpliceCost(SrcTy, Index, CostKind);
6464
6465 // Inserting a subvector can often be done with either a D, S or H register
6466 // move, so long as the inserted vector is "aligned".
6467 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6468 LT.second.getSizeInBits() <= 128 && SubTp) {
6469 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6470 if (SubLT.second.isVector()) {
6471 int NumElts = LT.second.getVectorNumElements();
6472 int NumSubElts = SubLT.second.getVectorNumElements();
6473 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6474 return SubLT.first;
6475 }
6476 }
6477
6478 // Restore optimal kind.
6479 if (IsExtractSubvector)
6481 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6482 Args, CxtI);
6483}
6484
6487 const DominatorTree &DT) {
6488 const auto &Strides = DenseMap<Value *, const SCEV *>();
6489 for (BasicBlock *BB : TheLoop->blocks()) {
6490 // Scan the instructions in the block and look for addresses that are
6491 // consecutive and decreasing.
6492 for (Instruction &I : *BB) {
6493 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6495 Type *AccessTy = getLoadStoreType(&I);
6496 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6497 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6498 .value_or(0) < 0)
6499 return true;
6500 }
6501 }
6502 }
6503 return false;
6504}
6505
6507 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6509 // For cases like post-LTO vectorization, when we eventually know the trip
6510 // count, epilogue with fixed-width vectorization can be deleted if the trip
6511 // count is less than the epilogue iterations. That's why we prefer
6512 // fixed-width vectorization in epilogue in case of equal costs.
6513 if (IsEpilogue)
6514 return true;
6515 return ST->useFixedOverScalableIfEqualCost();
6516}
6517
6519 return ST->getEpilogueVectorizationMinVF();
6520}
6521
6523 if (!ST->hasSVE())
6524 return false;
6525
6526 // We don't currently support vectorisation with interleaving for SVE - with
6527 // such loops we're better off not using tail-folding. This gives us a chance
6528 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6529 if (TFI->IAI->hasGroups())
6530 return false;
6531
6533 if (TFI->LVL->getReductionVars().size())
6535 if (TFI->LVL->getFixedOrderRecurrences().size())
6537
6538 // We call this to discover whether any load/store pointers in the loop have
6539 // negative strides. This will require extra work to reverse the loop
6540 // predicate, which may be expensive.
6543 *TFI->LVL->getDominatorTree()))
6547
6548 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6549 Required))
6550 return false;
6551
6552 // Don't tail-fold for tight loops where we would be better off interleaving
6553 // with an unpredicated loop.
6554 unsigned NumInsns = 0;
6555 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6556 NumInsns += BB->size();
6557 }
6558
6559 // We expect 4 of these to be a IV PHI, IV add, IV compare and branch.
6560 return NumInsns >= SVETailFoldInsnThreshold;
6561}
6562
6565 StackOffset BaseOffset, bool HasBaseReg,
6566 int64_t Scale, unsigned AddrSpace) const {
6567 // Scaling factors are not free at all.
6568 // Operands | Rt Latency
6569 // -------------------------------------------
6570 // Rt, [Xn, Xm] | 4
6571 // -------------------------------------------
6572 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6573 // Rt, [Xn, Wm, <extend> #imm] |
6575 AM.BaseGV = BaseGV;
6576 AM.BaseOffs = BaseOffset.getFixed();
6577 AM.HasBaseReg = HasBaseReg;
6578 AM.Scale = Scale;
6579 AM.ScalableOffset = BaseOffset.getScalable();
6580 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6581 // Scale represents reg2 * scale, thus account for 1 if
6582 // it is not equal to 0 or 1.
6583 return AM.Scale != 0 && AM.Scale != 1;
6585}
6586
6588 const Instruction *I) const {
6590 // For the binary operators (e.g. or) we need to be more careful than
6591 // selects, here we only transform them if they are already at a natural
6592 // break point in the code - the end of a block with an unconditional
6593 // terminator.
6594 if (I->getOpcode() == Instruction::Or &&
6595 isa<UncondBrInst>(I->getNextNode()))
6596 return true;
6597
6598 if (I->getOpcode() == Instruction::Add ||
6599 I->getOpcode() == Instruction::Sub)
6600 return true;
6601 }
6603}
6604
6607 const TargetTransformInfo::LSRCost &C2) const {
6608 // AArch64 specific here is adding the number of instructions to the
6609 // comparison (though not as the first consideration, as some targets do)
6610 // along with changing the priority of the base additions.
6611 // TODO: Maybe a more nuanced tradeoff between instruction count
6612 // and number of registers? To be investigated at a later date.
6613 if (EnableLSRCostOpt)
6614 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6615 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6616 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6617 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6618
6620}
6621
6622static bool isSplatShuffle(Value *V) {
6623 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6624 return all_equal(Shuf->getShuffleMask());
6625 return false;
6626}
6627
6628/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6629/// or upper half of the vector elements.
6630static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6631 bool AllowSplat = false) {
6632 // Scalable types can't be extract shuffle vectors.
6633 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6634 return false;
6635
6636 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6637 auto *FullTy = FullV->getType();
6638 auto *HalfTy = HalfV->getType();
6639 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6640 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6641 };
6642
6643 auto extractHalf = [](Value *FullV, Value *HalfV) {
6644 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6645 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6646 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6647 };
6648
6649 ArrayRef<int> M1, M2;
6650 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6651 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6652 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6653 return false;
6654
6655 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6656 // it is not checked as an extract below.
6657 if (AllowSplat && isSplatShuffle(Op1))
6658 S1Op1 = nullptr;
6659 if (AllowSplat && isSplatShuffle(Op2))
6660 S2Op1 = nullptr;
6661
6662 // Check that the operands are half as wide as the result and we extract
6663 // half of the elements of the input vectors.
6664 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6665 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6666 return false;
6667
6668 // Check the mask extracts either the lower or upper half of vector
6669 // elements.
6670 int M1Start = 0;
6671 int M2Start = 0;
6672 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6673 if ((S1Op1 &&
6674 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6675 (S2Op1 &&
6676 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6677 return false;
6678
6679 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6680 (M2Start != 0 && M2Start != (NumElements / 2)))
6681 return false;
6682 if (S1Op1 && S2Op1 && M1Start != M2Start)
6683 return false;
6684
6685 return true;
6686}
6687
6688/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6689/// of the vector elements.
6690static bool areExtractExts(Value *Ext1, Value *Ext2) {
6691 auto areExtDoubled = [](Instruction *Ext) {
6692 return Ext->getType()->getScalarSizeInBits() ==
6693 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6694 };
6695
6696 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6697 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6698 !areExtDoubled(cast<Instruction>(Ext1)) ||
6699 !areExtDoubled(cast<Instruction>(Ext2)))
6700 return false;
6701
6702 return true;
6703}
6704
6705/// Check if Op could be used with vmull_high_p64 intrinsic.
6707 Value *VectorOperand = nullptr;
6708 ConstantInt *ElementIndex = nullptr;
6709 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6710 m_ConstantInt(ElementIndex))) &&
6711 ElementIndex->getValue() == 1 &&
6712 isa<FixedVectorType>(VectorOperand->getType()) &&
6713 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6714}
6715
6716/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6717static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6719}
6720
6722 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6723 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6724 if (!GEP || GEP->getNumOperands() != 2)
6725 return false;
6726
6727 Value *Base = GEP->getOperand(0);
6728 Value *Offsets = GEP->getOperand(1);
6729
6730 // We only care about scalar_base+vector_offsets.
6731 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6732 return false;
6733
6734 // Sink extends that would allow us to use 32-bit offset vectors.
6735 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6736 auto *OffsetsInst = cast<Instruction>(Offsets);
6737 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6738 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6739 Ops.push_back(&GEP->getOperandUse(1));
6740 }
6741
6742 // Sink the GEP.
6743 return true;
6744}
6745
6746/// We want to sink following cases:
6747/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6748/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6750 if (match(Op, m_VScale()))
6751 return true;
6752 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6754 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6755 return true;
6756 }
6757 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6759 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6760 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6761 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6762 return true;
6763 }
6764 return false;
6765}
6766
6767static bool isFNeg(Value *Op) { return match(Op, m_FNeg(m_Value())); }
6768
6769/// Check if sinking \p I's operands to I's basic block is profitable, because
6770/// the operands can be folded into a target instruction, e.g.
6771/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6775 switch (II->getIntrinsicID()) {
6776 case Intrinsic::aarch64_neon_smull:
6777 case Intrinsic::aarch64_neon_umull:
6778 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6779 /*AllowSplat=*/true)) {
6780 Ops.push_back(&II->getOperandUse(0));
6781 Ops.push_back(&II->getOperandUse(1));
6782 return true;
6783 }
6784 [[fallthrough]];
6785
6786 case Intrinsic::fma:
6787 case Intrinsic::fmuladd:
6788 if (isa<VectorType>(I->getType()) &&
6789 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6790 !ST->hasFullFP16())
6791 return false;
6792
6793 if (isFNeg(II->getOperand(0)))
6794 Ops.push_back(&II->getOperandUse(0));
6795 if (isFNeg(II->getOperand(1)))
6796 Ops.push_back(&II->getOperandUse(1));
6797
6798 [[fallthrough]];
6799 case Intrinsic::aarch64_neon_sqdmull:
6800 case Intrinsic::aarch64_neon_sqdmulh:
6801 case Intrinsic::aarch64_neon_sqrdmulh:
6802 // Sink splats for index lane variants
6803 if (isSplatShuffle(II->getOperand(0)))
6804 Ops.push_back(&II->getOperandUse(0));
6805 if (isSplatShuffle(II->getOperand(1)))
6806 Ops.push_back(&II->getOperandUse(1));
6807 return !Ops.empty();
6808 case Intrinsic::aarch64_neon_fmlal:
6809 case Intrinsic::aarch64_neon_fmlal2:
6810 case Intrinsic::aarch64_neon_fmlsl:
6811 case Intrinsic::aarch64_neon_fmlsl2:
6812 // Sink splats for index lane variants
6813 if (isSplatShuffle(II->getOperand(1)))
6814 Ops.push_back(&II->getOperandUse(1));
6815 if (isSplatShuffle(II->getOperand(2)))
6816 Ops.push_back(&II->getOperandUse(2));
6817 return !Ops.empty();
6818 case Intrinsic::aarch64_sve_ptest_first:
6819 case Intrinsic::aarch64_sve_ptest_last:
6820 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6821 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6822 Ops.push_back(&II->getOperandUse(0));
6823 return !Ops.empty();
6824 case Intrinsic::aarch64_sme_write_horiz:
6825 case Intrinsic::aarch64_sme_write_vert:
6826 case Intrinsic::aarch64_sme_writeq_horiz:
6827 case Intrinsic::aarch64_sme_writeq_vert: {
6828 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6829 if (!Idx || Idx->getOpcode() != Instruction::Add)
6830 return false;
6831 Ops.push_back(&II->getOperandUse(1));
6832 return true;
6833 }
6834 case Intrinsic::aarch64_sme_read_horiz:
6835 case Intrinsic::aarch64_sme_read_vert:
6836 case Intrinsic::aarch64_sme_readq_horiz:
6837 case Intrinsic::aarch64_sme_readq_vert:
6838 case Intrinsic::aarch64_sme_ld1b_vert:
6839 case Intrinsic::aarch64_sme_ld1h_vert:
6840 case Intrinsic::aarch64_sme_ld1w_vert:
6841 case Intrinsic::aarch64_sme_ld1d_vert:
6842 case Intrinsic::aarch64_sme_ld1q_vert:
6843 case Intrinsic::aarch64_sme_st1b_vert:
6844 case Intrinsic::aarch64_sme_st1h_vert:
6845 case Intrinsic::aarch64_sme_st1w_vert:
6846 case Intrinsic::aarch64_sme_st1d_vert:
6847 case Intrinsic::aarch64_sme_st1q_vert:
6848 case Intrinsic::aarch64_sme_ld1b_horiz:
6849 case Intrinsic::aarch64_sme_ld1h_horiz:
6850 case Intrinsic::aarch64_sme_ld1w_horiz:
6851 case Intrinsic::aarch64_sme_ld1d_horiz:
6852 case Intrinsic::aarch64_sme_ld1q_horiz:
6853 case Intrinsic::aarch64_sme_st1b_horiz:
6854 case Intrinsic::aarch64_sme_st1h_horiz:
6855 case Intrinsic::aarch64_sme_st1w_horiz:
6856 case Intrinsic::aarch64_sme_st1d_horiz:
6857 case Intrinsic::aarch64_sme_st1q_horiz: {
6858 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6859 if (!Idx || Idx->getOpcode() != Instruction::Add)
6860 return false;
6861 Ops.push_back(&II->getOperandUse(3));
6862 return true;
6863 }
6864 case Intrinsic::aarch64_neon_pmull:
6865 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6866 return false;
6867 Ops.push_back(&II->getOperandUse(0));
6868 Ops.push_back(&II->getOperandUse(1));
6869 return true;
6870 case Intrinsic::aarch64_neon_pmull64:
6871 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6872 II->getArgOperand(1)))
6873 return false;
6874 Ops.push_back(&II->getArgOperandUse(0));
6875 Ops.push_back(&II->getArgOperandUse(1));
6876 return true;
6877 case Intrinsic::masked_gather:
6878 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6879 return false;
6880 Ops.push_back(&II->getArgOperandUse(0));
6881 return true;
6882 case Intrinsic::masked_scatter:
6883 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6884 return false;
6885 Ops.push_back(&II->getArgOperandUse(1));
6886 return true;
6887 default:
6888 return false;
6889 }
6890 }
6891
6892 auto ShouldSinkCondition = [](Value *Cond,
6893 SmallVectorImpl<Use *> &Ops) -> bool {
6895 return false;
6897 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6898 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6899 return false;
6900 if (isa<CmpInst>(II->getOperand(0)))
6901 Ops.push_back(&II->getOperandUse(0));
6902 return true;
6903 };
6904
6905 switch (I->getOpcode()) {
6906 case Instruction::GetElementPtr:
6907 case Instruction::Add:
6908 case Instruction::Sub:
6909 // Sink vscales closer to uses for better isel
6910 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6911 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6912 Ops.push_back(&I->getOperandUse(Op));
6913 return true;
6914 }
6915 }
6916 break;
6917 case Instruction::Select: {
6918 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6919 return false;
6920
6921 Ops.push_back(&I->getOperandUse(0));
6922 return true;
6923 }
6924 case Instruction::UncondBr:
6925 return false;
6926 case Instruction::CondBr: {
6927 if (!ShouldSinkCondition(cast<CondBrInst>(I)->getCondition(), Ops))
6928 return false;
6929
6930 Ops.push_back(&I->getOperandUse(0));
6931 return true;
6932 }
6933 case Instruction::FMul:
6934 // fmul with contract flag can be combined with fadd into fma.
6935 // Sinking fneg into this block enables fmls pattern.
6936 if (cast<FPMathOperator>(I)->hasAllowContract()) {
6937 if (isFNeg(I->getOperand(0)))
6938 Ops.push_back(&I->getOperandUse(0));
6939 if (isFNeg(I->getOperand(1)))
6940 Ops.push_back(&I->getOperandUse(1));
6941 }
6942 break;
6943
6944 // Type | BIC | ORN | EON
6945 // ----------------+-----------+-----------+-----------
6946 // scalar | Base | Base | Base
6947 // scalar w/shift | - | - | -
6948 // fixed vector | NEON/Base | NEON/Base | BSL2N/Base
6949 // scalable vector | SVE | - | BSL2N
6950 case Instruction::Xor:
6951 // EON only for scalars (possibly expanded fixed vectors)
6952 // and vectors using the SVE2/SME BSL2N instruction.
6953 if (I->getType()->isVectorTy() && ST->isNeonAvailable()) {
6954 bool HasBSL2N =
6955 ST->isSVEorStreamingSVEAvailable() && (ST->hasSVE2() || ST->hasSME());
6956 if (!HasBSL2N)
6957 break;
6958 }
6959 [[fallthrough]];
6960 case Instruction::And:
6961 case Instruction::Or:
6962 // Even though we could use the SVE2/SME BSL2N instruction,
6963 // it might pessimize with an extra MOV depending on register allocation.
6964 if (I->getOpcode() == Instruction::Or &&
6965 isa<ScalableVectorType>(I->getType()))
6966 break;
6967 // Shift can be fold into scalar AND/ORR/EOR,
6968 // but not the non-negated operand of BIC/ORN/EON.
6969 if (!(I->getType()->isVectorTy() && ST->hasNEON()) &&
6971 break;
6972 for (auto &Op : I->operands()) {
6973 // (and/or/xor X, (not Y)) -> (bic/orn/eon X, Y)
6974 if (match(Op.get(), m_Not(m_Value()))) {
6975 Ops.push_back(&Op);
6976 return true;
6977 }
6978 // (and/or/xor X, (splat (not Y))) -> (bic/orn/eon X, (splat Y))
6979 if (match(Op.get(),
6981 m_Value(), m_ZeroMask()))) {
6982 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
6983 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
6984 Ops.push_back(&Not);
6985 Ops.push_back(&InsertElt);
6986 Ops.push_back(&Op);
6987 return true;
6988 }
6989 }
6990 break;
6991 default:
6992 break;
6993 }
6994
6995 if (!I->getType()->isVectorTy())
6996 return !Ops.empty();
6997
6998 switch (I->getOpcode()) {
6999 case Instruction::Sub:
7000 case Instruction::Add: {
7001 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
7002 return false;
7003
7004 // If the exts' operands extract either the lower or upper elements, we
7005 // can sink them too.
7006 auto Ext1 = cast<Instruction>(I->getOperand(0));
7007 auto Ext2 = cast<Instruction>(I->getOperand(1));
7008 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
7009 Ops.push_back(&Ext1->getOperandUse(0));
7010 Ops.push_back(&Ext2->getOperandUse(0));
7011 }
7012
7013 Ops.push_back(&I->getOperandUse(0));
7014 Ops.push_back(&I->getOperandUse(1));
7015
7016 return true;
7017 }
7018 case Instruction::Or: {
7019 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
7020 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
7021 if (ST->hasNEON()) {
7022 Instruction *OtherAnd, *IA, *IB;
7023 Value *MaskValue;
7024 // MainAnd refers to And instruction that has 'Not' as one of its operands
7025 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
7026 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
7027 m_Instruction(IA)))))) {
7028 if (match(OtherAnd,
7029 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
7030 Instruction *MainAnd = I->getOperand(0) == OtherAnd
7031 ? cast<Instruction>(I->getOperand(1))
7032 : cast<Instruction>(I->getOperand(0));
7033
7034 // Both Ands should be in same basic block as Or
7035 if (I->getParent() != MainAnd->getParent() ||
7036 I->getParent() != OtherAnd->getParent())
7037 return false;
7038
7039 // Non-mask operands of both Ands should also be in same basic block
7040 if (I->getParent() != IA->getParent() ||
7041 I->getParent() != IB->getParent())
7042 return false;
7043
7044 Ops.push_back(
7045 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
7046 Ops.push_back(&I->getOperandUse(0));
7047 Ops.push_back(&I->getOperandUse(1));
7048
7049 return true;
7050 }
7051 }
7052 }
7053
7054 return false;
7055 }
7056 case Instruction::Mul: {
7057 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
7058 auto *Ty = cast<VectorType>(V->getType());
7059 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7060 if (Ty->isScalableTy())
7061 return false;
7062
7063 // Indexed variants of Mul exist for i16 and i32 element types only.
7064 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
7065 };
7066
7067 int NumZExts = 0, NumSExts = 0;
7068 for (auto &Op : I->operands()) {
7069 // Make sure we are not already sinking this operand
7070 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7071 continue;
7072
7073 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
7074 auto *Ext = cast<Instruction>(Op);
7075 auto *ExtOp = Ext->getOperand(0);
7076 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
7077 Ops.push_back(&Ext->getOperandUse(0));
7078 Ops.push_back(&Op);
7079
7080 if (isa<SExtInst>(Ext)) {
7081 NumSExts++;
7082 } else {
7083 NumZExts++;
7084 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
7085 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
7086 I->getType()->getScalarSizeInBits())
7087 NumSExts++;
7088 }
7089
7090 continue;
7091 }
7092
7094 if (!Shuffle)
7095 continue;
7096
7097 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
7098 // operand and the s/zext can help create indexed s/umull. This is
7099 // especially useful to prevent i64 mul being scalarized.
7100 if (isSplatShuffle(Shuffle) &&
7101 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
7102 Ops.push_back(&Shuffle->getOperandUse(0));
7103 Ops.push_back(&Op);
7104 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
7105 NumSExts++;
7106 else
7107 NumZExts++;
7108 continue;
7109 }
7110
7111 Value *ShuffleOperand = Shuffle->getOperand(0);
7112 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
7113 if (!Insert)
7114 continue;
7115
7116 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
7117 if (!OperandInstr)
7118 continue;
7119
7120 ConstantInt *ElementConstant =
7121 dyn_cast<ConstantInt>(Insert->getOperand(2));
7122 // Check that the insertelement is inserting into element 0
7123 if (!ElementConstant || !ElementConstant->isZero())
7124 continue;
7125
7126 unsigned Opcode = OperandInstr->getOpcode();
7127 if (Opcode == Instruction::SExt)
7128 NumSExts++;
7129 else if (Opcode == Instruction::ZExt)
7130 NumZExts++;
7131 else {
7132 // If we find that the top bits are known 0, then we can sink and allow
7133 // the backend to generate a umull.
7134 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
7135 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
7136 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
7137 continue;
7138 NumZExts++;
7139 }
7140
7141 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
7142 // the And, just to hoist it again back to the load.
7143 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
7144 Ops.push_back(&Insert->getOperandUse(1));
7145 Ops.push_back(&Shuffle->getOperandUse(0));
7146 Ops.push_back(&Op);
7147 }
7148
7149 // It is profitable to sink if we found two of the same type of extends.
7150 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
7151 return true;
7152
7153 // Otherwise, see if we should sink splats for indexed variants.
7154 if (!ShouldSinkSplatForIndexedVariant(I))
7155 return false;
7156
7157 Ops.clear();
7158 if (isSplatShuffle(I->getOperand(0)))
7159 Ops.push_back(&I->getOperandUse(0));
7160 if (isSplatShuffle(I->getOperand(1)))
7161 Ops.push_back(&I->getOperandUse(1));
7162
7163 return !Ops.empty();
7164 }
7165 case Instruction::FMul: {
7166 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7167 if (I->getType()->isScalableTy())
7168 return !Ops.empty();
7169
7170 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
7171 !ST->hasFullFP16())
7172 return !Ops.empty();
7173
7174 // Sink splats for index lane variants
7175 if (isSplatShuffle(I->getOperand(0)))
7176 Ops.push_back(&I->getOperandUse(0));
7177 if (isSplatShuffle(I->getOperand(1)))
7178 Ops.push_back(&I->getOperandUse(1));
7179 return !Ops.empty();
7180 }
7181 default:
7182 return false;
7183 }
7184 return false;
7185}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible instructions.
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use of the OOO engine's wide instruction window.
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI, SmallVectorImpl< StringRef > &Features)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool isFNeg(Value *Op)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file provides a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool preferTailFoldingOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
APInt getPriorityMask(const Function &F) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
unsigned countLeadingOnes() const
Definition APInt.h:1647
void negate()
Negate this APInt in place.
Definition APInt.h:1491
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1083
unsigned logBase2() const
Definition APInt.h:1784
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
bool isUnsigned() const
Definition InstrTypes.h:936
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
LLVM_ABI Constant * getSplatValue(bool AllowPoison=false) const
If all elements of the vector constant have the same value, return that value.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:784
bool empty() const
Definition DenseMap.h:109
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noSignedZeros() const
Definition FMF.h:70
bool noInfs() const
Definition FMF.h:69
bool approxFunc() const
Definition FMF.h:73
bool allowContract() const
Definition FMF.h:72
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2602
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1142
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2590
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:599
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:619
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:604
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1985
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:534
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2299
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2514
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1748
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2217
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1895
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2624
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1908
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:614
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2290
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2829
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:895
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:730
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:290
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
brc_match< Cond_t, match_bind< BasicBlock >, match_bind< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
LLVM_ABI Libcall getPOW(EVT RetVT)
getPOW - Return the POW_* value for the given types, or UNKNOWN_LIBCALL if there is none.
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:292
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:367
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
bool isFixedLengthVector() const
Definition ValueTypes.h:189
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:182
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...