//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
    "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(15), cl::Hidden);

static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(5), cl::Hidden,
    cl::desc(
        "Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(true), cl::Hidden);

static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(true), cl::Hidden);

// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
                    cl::desc("The cost of a histcnt instruction"));

static cl::opt<unsigned> DMBLookaheadThreshold(
    "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
    cl::desc("The number of instructions to search for a redundant dmb"));

static cl::opt<unsigned> AArch64ForceUnrollThreshold(
    "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
    cl::desc("Threshold for forced unrolling of small loops in AArch64"));

namespace {
class TailFoldingOption {
  // These bitfields will only ever be set to something non-zero in operator=,
  // when setting the -sve-tail-folding option. This option should always be of
  // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
  // InitialBits is one of (disabled|all|simple). EnableBits represents
  // additional flags we're enabling, and DisableBits for those flags we're
  // disabling. The default flag is tracked in the variable NeedsDefault, since
  // at the time of setting the option we may not know what the default value
  // for the CPU is.
  TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
  TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
  TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;

  // This value needs to be initialised to true in case the user does not
  // explicitly set the -sve-tail-folding option.
  bool NeedsDefault = true;

  void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }

  void setNeedsDefault(bool V) { NeedsDefault = V; }

  void setEnableBit(TailFoldingOpts Bit) {
    EnableBits |= Bit;
    DisableBits &= ~Bit;
  }

  void setDisableBit(TailFoldingOpts Bit) {
    EnableBits &= ~Bit;
    DisableBits |= Bit;
  }

  TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
    TailFoldingOpts Bits = TailFoldingOpts::Disabled;

    assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
           "Initial bits should only include one of "
           "(disabled|all|simple|default)");
    Bits = NeedsDefault ? DefaultBits : InitialBits;
    Bits |= EnableBits;
    Bits &= ~DisableBits;

    return Bits;
  }
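  // For example: given -sve-tail-folding=default+noreverse on a CPU whose
  // default is "all", getBits returns All with the Reverse bit cleared
  // (illustrative).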

  void reportError(std::string Opt) {
    errs() << "invalid argument '" << Opt
           << "' to -sve-tail-folding=; the option should be of the form\n"
              "  (disabled|all|default|simple)[+(reductions|recurrences"
              "|reverse|noreductions|norecurrences|noreverse)]\n";
    report_fatal_error("Unrecognised tail-folding option");
  }

public:

  void operator=(const std::string &Val) {
    // If the user explicitly sets -sve-tail-folding= then treat as an error.
    if (Val.empty()) {
      reportError("");
      return;
    }

    // Since the user is explicitly setting the option we don't automatically
    // need the default unless they require it.
    setNeedsDefault(false);

    SmallVector<StringRef, 4> TailFoldTypes;
    StringRef(Val).split(TailFoldTypes, '+', -1, false);

    unsigned StartIdx = 1;
    if (TailFoldTypes[0] == "disabled")
      setInitialBits(TailFoldingOpts::Disabled);
    else if (TailFoldTypes[0] == "all")
      setInitialBits(TailFoldingOpts::All);
    else if (TailFoldTypes[0] == "default")
      setNeedsDefault(true);
    else if (TailFoldTypes[0] == "simple")
      setInitialBits(TailFoldingOpts::Simple);
    else {
      StartIdx = 0;
      setInitialBits(TailFoldingOpts::Disabled);
    }

    for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
      if (TailFoldTypes[I] == "reductions")
        setEnableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "recurrences")
        setEnableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "reverse")
        setEnableBit(TailFoldingOpts::Reverse);
      else if (TailFoldTypes[I] == "noreductions")
        setDisableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "norecurrences")
        setDisableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "noreverse")
        setDisableBit(TailFoldingOpts::Reverse);
      else
        reportError(Val);
    }
  }
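  // For example: "-sve-tail-folding=all+noreductions" sets InitialBits to All
  // and DisableBits to Reductions, while a string that starts directly with a
  // flag such as "reverse" is parsed against an explicit Disabled base
  // (illustrative).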

  bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
    return (getBits(DefaultBits) & Required) == Required;
  }
};
} // namespace

TailFoldingOption TailFoldingOptionLoc;

cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled      (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault       (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall           (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple        (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions    Use tail-folding for loops containing reductions"
        "\nnoreductions  Inverse of above"
        "\nrecurrences   Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse       Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse     Inverse of above"),
    cl::location(TailFoldingOptionLoc));

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

static bool isSMEABIRoutineCall(const CallInst &CI,
                                const AArch64TargetLowering &TLI) {
  const auto *F = CI.getCalledFunction();
  return F &&
}

/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F,
                                       const AArch64TargetLowering &TLI) {
  for (const BasicBlock &BB : *F) {
    for (const Instruction &I : BB) {
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
      if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
           isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
        return true;
    }
  }
  return false;
}

static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI,
                                SmallVectorImpl<StringRef> &Features) {
  StringRef AttributeStr =
      TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
  StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
  FeatureStr.split(Features, ",");
}

uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
  SmallVector<StringRef, 8> Features;
  extractAttrFeatures(F, this, Features);
  return AArch64::getCpuSupportsMask(Features);
}

uint64_t AArch64TTIImpl::getFMVPriority(const Function &F) const {
  SmallVector<StringRef, 8> Features;
  extractAttrFeatures(F, this, Features);
  return AArch64::getFMVPriority(Features);
}

bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
  return F.hasFnAttribute("fmv-features");
}

const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
    AArch64::FeatureExecuteOnly,
};

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  SMECallAttrs CallAttrs(*Caller, *Callee);

  // Never inline a function explicitly marked as being streaming,
  // into a non-streaming function. Assume it was marked as streaming
  // for a reason.
  if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
      CallAttrs.callee().hasStreamingInterfaceOrBody())
    return false;

  // When inlining, we should consider the body of the function, not the
  // interface.
  if (CallAttrs.callee().hasStreamingBody()) {
    CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
    CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
  }

  if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
    return false;

  if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
      CallAttrs.requiresPreservingZT0() ||
      CallAttrs.requiresPreservingAllZAState()) {
    if (hasPossibleIncompatibleOps(Callee, *getTLI()))
      return false;
  }

  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();
  // Adjust the feature bitsets by inverting some of the bits. This is needed
  // for target features that represent restrictions rather than capabilities,
  // for example a "+execute-only" callee can be inlined into a caller without
  // "+execute-only", but not vice versa.
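  // For example: with the inversion, a callee built with +execute-only can be
  // inlined into a caller without it (the callee's inverted bit is clear),
  // whereas the opposite direction fails the subset check below (illustrative).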
  FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
  FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;

  return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
}

bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller,
                                           const Function *Callee,
                                           ArrayRef<Type *> Types) const {
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // We need to ensure that argument promotion does not attempt to promote
  // pointers to fixed-length vector types larger than 128 bits like
  // <8 x float> (and pointers to aggregate types which have such fixed-length
  // vector type members) into the values of the pointees. Such vector types
  // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
  // backend cannot lower such value arguments. The 128-bit fixed-length SVE
  // types can be safely treated as 128-bit NEON types and they cannot be
  // distinguished in IR.
  if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
        auto FVTy = dyn_cast<FixedVectorType>(Ty);
        return FVTy &&
               FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
      }))
    return false;

  return true;
}
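// For example: a pointer-to-<8 x float> argument (a 256-bit fixed-length SVE
// value) makes the functions ABI-incompatible for promotion here, whereas
// pointer-to-<4 x float> is an ordinary 128-bit NEON type and is accepted
// (illustrative).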

unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
  // This function calculates a penalty for executing Call in F.
  //
  // There are two ways this function can be called:
  // (1) F:
  //       call from F -> G (the call here is Call)
  //
  // For (1), Call.getCaller() == F, so it will always return a high cost if
  // a streaming-mode change is required (thus promoting the need to inline the
  // function)
  //
  // (2) F:
  //       call from F -> G (the call here is not Call)
  //     G:
  //       call from G -> H (the call here is Call)
  //
  // For (2), if after inlining the body of G into F the call to H requires a
  // streaming-mode change, and the call to G from F would also require a
  // streaming-mode change, then there is benefit to do the streaming-mode
  // change only once and avoid inlining of G into F.
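  // For example: with the default -call-penalty-sm-change=5, case (1) reports
  // 5 * DefaultCallPenalty for the F -> G call, biasing the inliner towards
  // removing the mode change (illustrative).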

  SMEAttrs FAttrs(*F);
  SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());

  if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
    if (F == Call.getCaller()) // (1)
      return CallPenaltyChangeSM * DefaultCallPenalty;
    if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
      return InlineCallPenaltyChangeSM * DefaultCallPenalty;
  }

  return DefaultCallPenalty;
}

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);

  if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
    return true;

  return K == TargetTransformInfo::RGK_ScalableVector &&
         ST->isSVEorStreamingSVEAvailable() &&
         !ST->disableMaximizeScalableBandwidth();
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}
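// For example: materializing 0x1234567890ABCDEF needs a MOVZ plus three MOVKs,
// so expandMOVImm produces four instructions and the cost is 4 (illustrative).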

/// Calculate the cost of materializing the given constant.
InstructionCost
AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                              TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}
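// For example: an i128 constant whose two 64-bit chunks are both 0x1 costs 0
// per chunk (each is a valid logical immediate), so the result is clamped to
// the minimum cost of 1 (illustrative).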

InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
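// For example: the immediate in "add x0, x1, #42" materializes in one move, so
// it is reported as TCC_Free and left alone, whereas an Add of a 64-bit
// constant needing four moves returns cost 4 and becomes a hoisting candidate
// (illustrative).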

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

static bool isUnpackedVectorVT(EVT VecVT) {
  return VecVT.isScalableVector() &&
         VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}
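// For example: nxv2f32 is unpacked (64 known-min bits, half an SVE 128-bit
// block), while nxv4f32 fills a whole block and is packed (illustrative).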

static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
                                        const IntrinsicCostAttributes &ICA) {
  // We need to know at least the number of elements in the vector of buckets
  // and the size of each element to update.
  if (ICA.getArgTypes().size() < 2)
    return InstructionCost::getInvalid();

  // Only interested in costing for the hardware instruction from SVE2.
  if (!ST->hasSVE2())
    return InstructionCost::getInvalid();

  Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
  Type *EltTy = ICA.getArgTypes()[1];        // Type of bucket elements
  unsigned TotalHistCnts = 1;

  unsigned EltSize = EltTy->getScalarSizeInBits();
  // Only allow (up to 64b) integers or pointers
  if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
    return InstructionCost::getInvalid();

  // FIXME: We should be able to generate histcnt for fixed-length vectors
  // using ptrue with a specific VL.
  if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
    unsigned EC = VTy->getElementCount().getKnownMinValue();
    if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
      return InstructionCost::getInvalid();

    // HistCnt only supports 32b and 64b element types
    unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;

    if (EC == 2 || (LegalEltSize == 32 && EC == 4))
      return InstructionCost(BaseHistCntCost);

    unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
    TotalHistCnts = EC / NaturalVectorWidth;

    return InstructionCost(BaseHistCntCost * TotalHistCnts);
  }

  return InstructionCost::getInvalid();
}
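// For example, with the default -aarch64-base-histcnt-cost=8: updating i32
// buckets through <vscale x 4 x ptr> needs one HISTCNT (cost 8), while
// <vscale x 8 x ptr> needs TotalHistCnts = 2 for a cost of 16 (illustrative).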

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) const {
  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  auto *RetTy = ICA.getReturnType();
  if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
    if (VTy->getElementCount() == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();

  switch (ICA.getID()) {
  case Intrinsic::experimental_vector_histogram_add: {
    InstructionCost HistCost = getHistogramCost(ST, ICA);
    // If the cost isn't valid, we may still be able to scalarize
    if (HistCost.isValid())
      return HistCost;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,    MVT::v16i8,   MVT::v4i16,
                                        MVT::v8i16,   MVT::v2i32,   MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
                                        MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, equal_to(LT.second)))
      return LT.first;
    break;
  }
  case Intrinsic::scmp:
  case Intrinsic::ucmp: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::scmp, MVT::i32, 3},   // cmp+cset+csinv
        {Intrinsic::scmp, MVT::i64, 3},   // cmp+cset+csinv
        {Intrinsic::scmp, MVT::v8i8, 3},  // cmgt+cmgt+sub
        {Intrinsic::scmp, MVT::v16i8, 3}, // cmgt+cmgt+sub
        {Intrinsic::scmp, MVT::v4i16, 3}, // cmgt+cmgt+sub
        {Intrinsic::scmp, MVT::v8i16, 3}, // cmgt+cmgt+sub
        {Intrinsic::scmp, MVT::v2i32, 3}, // cmgt+cmgt+sub
        {Intrinsic::scmp, MVT::v4i32, 3}, // cmgt+cmgt+sub
        {Intrinsic::scmp, MVT::v1i64, 3}, // cmgt+cmgt+sub
        {Intrinsic::scmp, MVT::v2i64, 3}, // cmgt+cmgt+sub
    };
    const auto LT = getTypeLegalizationCost(RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, Intrinsic::scmp, LT.second);
    if (Entry)
      return Entry->Cost * LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, equal_to(LT.second)))
      return LT.first * Instrs;

    TypeSize TS = RetTy->getPrimitiveSizeInBits();
    uint64_t VectorSize = TS.getKnownMinValue();

    if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
      return LT.first * Instrs;

    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,    MVT::v16i8,   MVT::v4i16,
                                     MVT::v8i16,   MVT::v2i32,   MVT::v4i32,
                                     MVT::v2i64,   MVT::nxv16i8, MVT::nxv8i16,
                                     MVT::nxv4i32, MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, equal_to(LT.second)))
      return LT.first;
    break;
  }
  case Intrinsic::bswap: {
    static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, equal_to(LT.second)) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
      return LT.first;
    break;
  }
  case Intrinsic::fma:
  case Intrinsic::fmuladd: {
    // Given an fma or fmuladd, cost it the same as an fmul instruction, since
    // the costs are usually the same. TODO: Add fp16 and bf16 expansion costs.
    Type *EltTy = RetTy->getScalarType();
    if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
        (EltTy->isHalfTy() && ST->hasFullFP16()))
      return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
    break;
  }
  case Intrinsic::stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
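  // For example: a <vscale x 32 x i8> step vector legalizes to two nxv16i8
  // parts, costing one index instruction plus one vector add (illustrative).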
  case Intrinsic::vector_extract:
  case Intrinsic::vector_insert: {
    // If both the vector and subvector types are legal types and the index
    // is 0, then this should be a no-op or simple operation; return a
    // relatively low cost.

    // If arguments aren't actually supplied, then we cannot determine the
    // value of the index. We also want to skip predicate types.
    if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
      break;

    LLVMContext &C = RetTy->getContext();
    EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
    EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
                             : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
    // Skip this if either the vector or subvector types are unpacked
    // SVE types; they may get lowered to stack stores and loads.
    if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
      break;

    TargetLoweringBase::LegalizeKind SubVecLK =
        getTLI()->getTypeConversion(C, SubVecVT);
    TargetLoweringBase::LegalizeKind VecLK =
        getTLI()->getTypeConversion(C, VecVT);
    const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
    const ConstantInt *CIdx = cast<ConstantInt>(Idx);
    if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
        VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
      return TTI::TCC_Free;
    break;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 are promoted
      // to, plus 1 so that we match the actual lowering cost.
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    if (!ST->hasNEON()) {
      // 32-bit or 64-bit ctpop without NEON is 12 instructions.
      return getTypeLegalizationCost(RetTy).first * 12;
    }
    static const CostTblEntry CtpopCostTbl[] = {
        {ISD::CTPOP, MVT::v2i64,   4},
        {ISD::CTPOP, MVT::v4i32,   3},
        {ISD::CTPOP, MVT::v8i16,   2},
        {ISD::CTPOP, MVT::v16i8,   1},
        {ISD::CTPOP, MVT::i64,     4},
        {ISD::CTPOP, MVT::v2i32,   3},
        {ISD::CTPOP, MVT::v4i16,   2},
        {ISD::CTPOP, MVT::v8i8,    1},
        {ISD::CTPOP, MVT::i32,     5},
        // SVE types (For targets that override NEON for fixed length vectors)
        {ISD::CTPOP, MVT::nxv2i64, 1},
        {ISD::CTPOP, MVT::nxv4i32, 1},
        {ISD::CTPOP, MVT::nxv8i16, 1},
        {ISD::CTPOP, MVT::nxv16i8, 1},
    };
    auto LT = getTypeLegalizationCost(RetTy);
    MVT MTy = LT.second;

    // When SVE is available CNT will be used for fixed and scalable vectors.
    if (ST->isSVEorStreamingSVEAvailable() && MTy.isFixedLengthVector())
      MTy = MVT::getScalableVectorVT(MTy.getVectorElementType(),
                                     128 / MTy.getScalarSizeInBits());

    if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
      // Extra cost of +1 when illegal vector types are legalized by promoting
      // the integer type.
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ? 1
                          : 0;
      return LT.first * Entry->Cost + ExtraCost;
    }
    break;
  }
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
    static const CostTblEntry WithOverflowCostTbl[] = {
        {Intrinsic::sadd_with_overflow, MVT::i8, 3},
        {Intrinsic::uadd_with_overflow, MVT::i8, 3},
        {Intrinsic::sadd_with_overflow, MVT::i16, 3},
        {Intrinsic::uadd_with_overflow, MVT::i16, 3},
        {Intrinsic::sadd_with_overflow, MVT::i32, 1},
        {Intrinsic::uadd_with_overflow, MVT::i32, 1},
        {Intrinsic::sadd_with_overflow, MVT::i64, 1},
        {Intrinsic::uadd_with_overflow, MVT::i64, 1},
        {Intrinsic::ssub_with_overflow, MVT::i8, 3},
        {Intrinsic::usub_with_overflow, MVT::i8, 3},
        {Intrinsic::ssub_with_overflow, MVT::i16, 3},
        {Intrinsic::usub_with_overflow, MVT::i16, 3},
        {Intrinsic::ssub_with_overflow, MVT::i32, 1},
        {Intrinsic::usub_with_overflow, MVT::i32, 1},
        {Intrinsic::ssub_with_overflow, MVT::i64, 1},
        {Intrinsic::usub_with_overflow, MVT::i64, 1},
        {Intrinsic::smul_with_overflow, MVT::i8, 5},
        {Intrinsic::umul_with_overflow, MVT::i8, 4},
        {Intrinsic::smul_with_overflow, MVT::i16, 5},
        {Intrinsic::umul_with_overflow, MVT::i16, 4},
        {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
        {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
        {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
        {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
    };
    EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
    if (MTy.isSimple())
      if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
                                              MTy.getSimpleVT()))
        return Entry->Cost;
    break;
  }
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat: {
    if (ICA.getArgTypes().empty())
      break;
    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
    auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
    EVT MTy = TLI->getValueType(DL, RetTy);
    // Check for the legal types, which are where the size of the input and the
    // output are the same, or we are using cvt f64->i32 or f32->i64.
    if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
         LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
         LT.second == MVT::v2f64)) {
      if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
           (LT.second == MVT::f64 && MTy == MVT::i32) ||
           (LT.second == MVT::f32 && MTy == MVT::i64)))
        return LT.first;
      // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
      if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
          MTy.getScalarSizeInBits() == 64)
        return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
    }
    // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
    // f32.
    if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
      return LT.first + getIntrinsicInstrCost(
                            {ICA.getID(),
                             RetTy,
                             {ICA.getArgTypes()[0]->getWithNewType(
                                 Type::getFloatTy(RetTy->getContext()))}},
                            CostKind);
    if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
        (LT.second == MVT::f16 && MTy == MVT::i64) ||
        ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
         (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
      return LT.first;
    // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
    if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
        MTy.getScalarSizeInBits() == 32)
      return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
    // Extending vector types v8f16->v8i64. These currently scalarize but the
    // codegen could be better.
    if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
        MTy.getScalarSizeInBits() == 64)
      return MTy.getVectorNumElements() * 3;

    // If we can we use a legal convert followed by a min+max
    if ((LT.second.getScalarType() == MVT::f32 ||
         LT.second.getScalarType() == MVT::f64 ||
         LT.second.getScalarType() == MVT::f16) &&
        LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
      Type *LegalTy =
          Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
      if (LT.second.isVector())
        LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
      InstructionCost Cost = 1;
      IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
                                              : Intrinsic::umin,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs1, CostKind);
      IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
                                              : Intrinsic::umax,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
      return LT.first * Cost +
             ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
                                                                           : 1);
    }
    // Otherwise we need to follow the default expansion that clamps the value
    // using a float min/max with a fcmp+sel for nan handling when signed.
    Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
    RetTy = RetTy->getScalarType();
    if (LT.second.isVector()) {
      FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
      RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
    }
    IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
    InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
    IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
    Cost += getIntrinsicInstrCost(Attrs2, CostKind);
    Cost +=
        getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
                         RetTy, FPTy, TTI::CastContextHint::None, CostKind);
    if (IsSigned) {
      Type *CondTy = RetTy->getWithNewBitWidth(1);
      Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
                                 CmpInst::FCMP_UNO, CostKind);
      Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                 CmpInst::FCMP_UNO, CostKind);
    }
    return LT.first * Cost;
  }
  case Intrinsic::fshl:
  case Intrinsic::fshr: {
    if (ICA.getArgs().empty())
      break;

    const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);

    // ROTR / ROTL is a funnel shift with equal first and second operand. For
    // ROTR on integer registers (i32/i64) this can be done in a single ror
    // instruction. A fshl with a non-constant shift uses a neg + ror.
    if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
        (RetTy->getPrimitiveSizeInBits() == 32 ||
         RetTy->getPrimitiveSizeInBits() == 64)) {
      InstructionCost NegCost =
          (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
      return 1 + NegCost;
    }

    // TODO: Add handling for fshl where third argument is not a constant.
    if (!OpInfoZ.isConstant())
      break;

    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    if (OpInfoZ.isUniform()) {
      static const CostTblEntry FshlTbl[] = {
          {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
          {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
          {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
          {Intrinsic::fshl, MVT::v8i8, 2},  {Intrinsic::fshl, MVT::v4i16, 2}};
      // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
      // to avoid having to duplicate the costs.
      const auto *Entry =
          CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
      if (Entry)
        return LegalisationCost.first * Entry->Cost;
    }

    auto TyL = getTypeLegalizationCost(RetTy);
    if (!RetTy->isIntegerTy())
      break;

    // Estimate cost manually, as types like i8 and i16 will get promoted to
    // i32 and CostTableLookup will ignore the extra conversion cost.
    bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
                       RetTy->getScalarSizeInBits() < 64) ||
                      (RetTy->getScalarSizeInBits() % 64 != 0);
    unsigned ExtraCost = HigherCost ? 1 : 0;
    if (RetTy->getScalarSizeInBits() == 32 ||
        RetTy->getScalarSizeInBits() == 64)
      ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
                     // extr instruction.
    else if (HigherCost)
      ExtraCost = 1;
    else
      break;
    return TyL.first + ExtraCost;
  }
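  // For example: fshl on i64 with a constant shift lowers to one extr (cost
  // TyL.first), while the earlier rotate path costs one ror plus a neg when
  // the rotate amount is non-constant (illustrative).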
  case Intrinsic::get_active_lane_mask: {
    auto RetTy = cast<VectorType>(ICA.getReturnType());
    EVT RetVT = getTLI()->getValueType(DL, RetTy);
    EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
      break;

    if (RetTy->isScalableTy()) {
      if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
          TargetLowering::TypeSplitVector)
        break;

      auto LT = getTypeLegalizationCost(RetTy);
      InstructionCost Cost = LT.first;
      // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
      // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
      //   nxv32i1 = get_active_lane_mask(base, idx) ->
      //   {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
      if (ST->hasSVE2p1() || ST->hasSME2()) {
        Cost /= 2;
        if (Cost == 1)
          return Cost;
      }

      // If more than one whilelo intrinsic is required, include the extra cost
      // required by the saturating add & select required to increment the
      // start value after the first intrinsic call.
      Type *OpTy = ICA.getArgTypes()[0];
      IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
      InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
      Type *CondTy = OpTy->getWithNewBitWidth(1);
      SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      return Cost + (SplitCost * (Cost - 1));
    } else if (!getTLI()->isTypeLegal(RetVT)) {
      // We don't have enough context at this point to determine if the mask
      // is going to be kept live after the block, which will force the vXi1
      // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
      // For now, we just assume the vectorizer created this intrinsic and
      // the result will be the input for a PHI. In this case the cost will
      // be extremely high for fixed-width vectors.
      // NOTE: getScalarizationOverhead returns a cost that's far too
      // pessimistic for the actual generated codegen. In reality there are
      // two instructions generated per lane.
      return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
    }
    break;
  }
  case Intrinsic::experimental_vector_match: {
    auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
    EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    unsigned SearchSize = NeedleTy->getNumElements();
    if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
      // Base cost for MATCH instructions. At least on the Neoverse V2 and
      // Neoverse V3, these are cheap operations with the same latency as a
      // vector ADD. In most cases, however, we also need to do an extra DUP.
      // For fixed-length vectors we currently need an extra five to six
      // instructions besides the MATCH.
      InstructionCost Cost = 4;
      if (isa<FixedVectorType>(RetTy))
        Cost += 10;
      return Cost;
    }
    break;
  }
  case Intrinsic::cttz: {
    auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
    if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
      return LT.first * 2;
    if (LT.second == MVT::v4i16 || LT.second == MVT::v8i16 ||
        LT.second == MVT::v2i32 || LT.second == MVT::v4i32)
      return LT.first * 3;
    break;
  }
  case Intrinsic::experimental_cttz_elts: {
    EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
      // This will consist of a SVE brkb and a cntp instruction. These
      // typically have the same latency and half the throughput as a vector
      // add instruction.
      return 4;
    }
    break;
  }
  case Intrinsic::loop_dependence_raw_mask:
  case Intrinsic::loop_dependence_war_mask: {
    // The whilewr/rw instructions require SVE2 or SME.
    if (ST->hasSVE2() || ST->hasSME()) {
      EVT VecVT = getTLI()->getValueType(DL, RetTy);
      unsigned EltSizeInBytes =
          cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
      if (!is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) ||
          VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes))
        break;
      // For fixed-vector types we need to AND the mask with a ptrue vl<N>.
      return isa<FixedVectorType>(RetTy) ? 2 : 1;
    }
    break;
  }
  case Intrinsic::experimental_vector_extract_last_active:
    if (ST->isSVEorStreamingSVEAvailable()) {
      auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
      // This should turn into chained clastb instructions.
      return LegalCost;
    }
    break;
  case Intrinsic::pow: {
    // For scalar calls we know the target has the libcall, and for fixed-width
    // vectors we know for the worst case it can be scalarised.
    EVT VT = getTLI()->getValueType(DL, RetTy);
    RTLIB::Libcall LC = RTLIB::getPOW(VT);
    bool HasLibcall = getTLI()->getLibcallImpl(LC) != RTLIB::Unsupported;
    bool CanLowerWithLibcalls = !isa<ScalableVectorType>(RetTy) || HasLibcall;

    // If we know that the call can be lowered with libcalls then it's safe to
    // reduce the costs in some cases. This is important for scalable vectors,
    // since we cannot scalarize the call in the absence of a vector math
    // library.
    if (CanLowerWithLibcalls && ICA.getInst() && !ICA.getArgs().empty()) {
      // If we know the fast math flags and the exponent is a constant then the
      // cost may be less for some exponents like 0.25 and 0.75.
      const Constant *ExpC = dyn_cast<Constant>(ICA.getArgs()[1]);
      if (ExpC && isa<VectorType>(ExpC->getType()))
        ExpC = ExpC->getSplatValue();
      if (auto *ExpF = dyn_cast_or_null<ConstantFP>(ExpC)) {
        // The argument must be a FP constant.
        bool Is025 = ExpF->getValueAPF().isExactlyValue(0.25);
        bool Is075 = ExpF->getValueAPF().isExactlyValue(0.75);
        FastMathFlags FMF = ICA.getInst()->getFastMathFlags();
        if ((Is025 || Is075) && FMF.noInfs() && FMF.approxFunc() &&
            (!Is025 || FMF.noSignedZeros())) {
          IntrinsicCostAttributes Attrs(Intrinsic::sqrt, RetTy, {RetTy}, FMF);
          InstructionCost Sqrt = getIntrinsicInstrCost(Attrs, CostKind);
          if (Is025)
            return 2 * Sqrt;
          InstructionCost FMul =
              getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
          return (Sqrt * 2) + FMul;
        }
        // TODO: For 1/3 exponents we expect the cbrt call to be slightly
        // cheaper than pow.
      }
    }

    if (HasLibcall)
      return getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
    break;
  }
  case Intrinsic::sqrt:
  case Intrinsic::fabs:
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::nearbyint:
  case Intrinsic::round:
  case Intrinsic::rint:
  case Intrinsic::roundeven:
  case Intrinsic::trunc:
  case Intrinsic::minnum:
  case Intrinsic::maxnum:
  case Intrinsic::minimum:
  case Intrinsic::maximum: {
    if (isa<ScalableVectorType>(RetTy) && ST->isSVEorStreamingSVEAvailable()) {
      auto LT = getTypeLegalizationCost(RetTy);
      return LT.first;
    }
    break;
  }
  default:
    break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

/// This function removes redundant reinterpret casts in the presence of
/// control flow.
static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = II.getType();

  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
  assert(PN && "Expected Phi Node!");

  // Don't create a new Phi unless we can remove the old one.
  if (!PN->hasOneUse())
    return std::nullopt;

  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
    if (!Reinterpret ||
        Reinterpret->getIntrinsicID() !=
            Intrinsic::aarch64_sve_convert_to_svbool ||
        RequiredType != Reinterpret->getArgOperand(0)->getType())
      return std::nullopt;
  }

  // Create the new Phi
  IC.Builder.SetInsertPoint(PN);
  PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
  Worklist.push_back(PN);

  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
    Worklist.push_back(Reinterpret);
  }

  // Cleanup Phi Node and reinterprets
  return IC.replaceInstUsesWith(II, NPN);
}
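// For example (illustrative IR): given
//   %phi = phi <vscale x 16 x i1> [ %a.sb, %bb0 ], [ %b.sb, %bb1 ]
//   %res = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(%phi)
// where %a.sb and %b.sb are convert.to.svbool of <vscale x 4 x i1> values,
// the phi is rebuilt over the original <vscale x 4 x i1> values and both
// conversions become dead.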

// A collection of properties common to SVE intrinsics that allow for combines
// to be written without needing to know the specific intrinsic.
struct SVEIntrinsicInfo {
  //
  // Helper routines for common intrinsic definitions.
  //

  // e.g. llvm.aarch64.sve.add pg, op1, op2
  //        with IID ==> llvm.aarch64.sve.add_u
  static SVEIntrinsicInfo
  defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(0)
        .setOperandIdxInactiveLanesTakenFrom(1)
        .setMatchingUndefIntrinsic(IID);
  }

  // e.g. llvm.aarch64.sve.neg inactive, pg, op
  static SVEIntrinsicInfo defaultMergingUnaryOp() {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(1)
        .setOperandIdxInactiveLanesTakenFrom(0)
        .setOperandIdxWithNoActiveLanes(0);
  }

  // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
  static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(1)
        .setOperandIdxInactiveLanesTakenFrom(0);
  }

  // e.g. llvm.aarch64.sve.add_u pg, op1, op2
  static SVEIntrinsicInfo defaultUndefOp() {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(0)
        .setInactiveLanesAreNotDefined();
  }

  // e.g. llvm.aarch64.sve.prf pg, ptr        (GPIndex = 0)
  //      llvm.aarch64.sve.st1 data, pg, ptr  (GPIndex = 1)
  static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(GPIndex)
        .setInactiveLanesAreUnused();
  }

  // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
  //      llvm.aarch64.sve.ld1 pg, ptr
  static SVEIntrinsicInfo defaultZeroingOp() {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(0)
        .setInactiveLanesAreUnused()
        .setResultIsZeroInitialized();
  }

  // All properties relate to predication and thus having a general predicate
  // is the minimum requirement to say there is intrinsic info to act on.
  explicit operator bool() const { return hasGoverningPredicate(); }

  //
  // Properties relating to the governing predicate.
  //

  bool hasGoverningPredicate() const {
    return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
  }

  unsigned getGoverningPredicateOperandIdx() const {
    assert(hasGoverningPredicate() && "Property not set!");
    return GoverningPredicateIdx;
  }

  SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
    assert(!hasGoverningPredicate() && "Cannot set property twice!");
    GoverningPredicateIdx = Index;
    return *this;
  }

  //
  // Properties relating to operations the intrinsic could be transformed into.
  // NOTE: This does not mean such a transformation is always possible, but the
  // knowledge makes it possible to reuse existing optimisations without needing
  // to embed specific handling for each intrinsic. For example, instruction
  // simplification can be used to optimise an intrinsic's active lanes.
  //

  bool hasMatchingUndefIntrinsic() const {
    return UndefIntrinsic != Intrinsic::not_intrinsic;
  }

  Intrinsic::ID getMatchingUndefIntrinsic() const {
    assert(hasMatchingUndefIntrinsic() && "Property not set!");
    return UndefIntrinsic;
  }

  SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
    assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
    UndefIntrinsic = IID;
    return *this;
  }

  bool hasMatchingIROpode() const { return IROpcode != 0; }

  unsigned getMatchingIROpode() const {
    assert(hasMatchingIROpode() && "Property not set!");
    return IROpcode;
  }

  SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
    assert(!hasMatchingIROpode() && "Cannot set property twice!");
    IROpcode = Opcode;
    return *this;
  }

  //
  // Properties relating to the result of inactive lanes.
  //

  bool inactiveLanesTakenFromOperand() const {
    return ResultLanes == InactiveLanesTakenFromOperand;
  }

  unsigned getOperandIdxInactiveLanesTakenFrom() const {
    assert(inactiveLanesTakenFromOperand() && "Property not set!");
    return OperandIdxForInactiveLanes;
  }

  SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
    assert(ResultLanes == Uninitialized && "Cannot set property twice!");
    ResultLanes = InactiveLanesTakenFromOperand;
    OperandIdxForInactiveLanes = Index;
    return *this;
  }

  bool inactiveLanesAreNotDefined() const {
    return ResultLanes == InactiveLanesAreNotDefined;
  }

  SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
    assert(ResultLanes == Uninitialized && "Cannot set property twice!");
    ResultLanes = InactiveLanesAreNotDefined;
    return *this;
  }

  bool inactiveLanesAreUnused() const {
    return ResultLanes == InactiveLanesAreUnused;
  }

  SVEIntrinsicInfo &setInactiveLanesAreUnused() {
    assert(ResultLanes == Uninitialized && "Cannot set property twice!");
    ResultLanes = InactiveLanesAreUnused;
    return *this;
  }

  // NOTE: Whilst not limited to only inactive lanes, the common use case is:
  //   inactiveLanesAreZeroed =
  //       resultIsZeroInitialized() && inactiveLanesAreUnused()
  bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }

  SVEIntrinsicInfo &setResultIsZeroInitialized() {
    ResultIsZeroInitialized = true;
    return *this;
  }

  //
  // The first operand of unary merging operations is typically only used to
  // set the result for inactive lanes. Knowing this allows us to deadcode the
  // operand when we can prove there are no inactive lanes.
  //

  bool hasOperandWithNoActiveLanes() const {
    return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
  }

  unsigned getOperandIdxWithNoActiveLanes() const {
    assert(hasOperandWithNoActiveLanes() && "Property not set!");
    return OperandIdxWithNoActiveLanes;
  }

  SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
    assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
    OperandIdxWithNoActiveLanes = Index;
    return *this;
  }

private:
  unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();

  Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
  unsigned IROpcode = 0;

  enum PredicationStyle {
    Uninitialized,
    InactiveLanesTakenFromOperand,
    InactiveLanesAreNotDefined,
    InactiveLanesAreUnused
  } ResultLanes = Uninitialized;

  bool ResultIsZeroInitialized = false;
  unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
  unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
};
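// For example: llvm.aarch64.sve.fadd maps below to
// defaultMergingOp(Intrinsic::aarch64_sve_fadd_u) with FAdd as its matching IR
// opcode: governing predicate at index 0 and inactive lanes taken from
// operand 1 (illustrative summary of constructSVEIntrinsicInfo).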

static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
  // Some SVE intrinsics do not use scalable vector types, but since they are
  // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
  if (!isa<ScalableVectorType>(II.getType()) &&
      all_of(II.args(), [&](const Value *V) {
        return !isa<ScalableVectorType>(V->getType());
      }))
    return SVEIntrinsicInfo();

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
  case Intrinsic::aarch64_sve_fcvt_f16f32:
  case Intrinsic::aarch64_sve_fcvt_f16f64:
  case Intrinsic::aarch64_sve_fcvt_f32f16:
  case Intrinsic::aarch64_sve_fcvt_f32f64:
  case Intrinsic::aarch64_sve_fcvt_f64f16:
  case Intrinsic::aarch64_sve_fcvt_f64f32:
  case Intrinsic::aarch64_sve_fcvtlt_f32f16:
  case Intrinsic::aarch64_sve_fcvtlt_f64f32:
  case Intrinsic::aarch64_sve_fcvtx_f32f64:
  case Intrinsic::aarch64_sve_fcvtzs:
  case Intrinsic::aarch64_sve_fcvtzs_i32f16:
  case Intrinsic::aarch64_sve_fcvtzs_i32f64:
  case Intrinsic::aarch64_sve_fcvtzs_i64f16:
  case Intrinsic::aarch64_sve_fcvtzs_i64f32:
  case Intrinsic::aarch64_sve_fcvtzu:
  case Intrinsic::aarch64_sve_fcvtzu_i32f16:
  case Intrinsic::aarch64_sve_fcvtzu_i32f64:
  case Intrinsic::aarch64_sve_fcvtzu_i64f16:
  case Intrinsic::aarch64_sve_fcvtzu_i64f32:
  case Intrinsic::aarch64_sve_revb:
  case Intrinsic::aarch64_sve_revh:
  case Intrinsic::aarch64_sve_revw:
  case Intrinsic::aarch64_sve_revd:
  case Intrinsic::aarch64_sve_scvtf:
  case Intrinsic::aarch64_sve_scvtf_f16i32:
  case Intrinsic::aarch64_sve_scvtf_f16i64:
  case Intrinsic::aarch64_sve_scvtf_f32i64:
  case Intrinsic::aarch64_sve_scvtf_f64i32:
  case Intrinsic::aarch64_sve_ucvtf:
  case Intrinsic::aarch64_sve_ucvtf_f16i32:
  case Intrinsic::aarch64_sve_ucvtf_f16i64:
  case Intrinsic::aarch64_sve_ucvtf_f32i64:
  case Intrinsic::aarch64_sve_ucvtf_f64i32:
    return SVEIntrinsicInfo::defaultMergingUnaryOp();

  case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
  case Intrinsic::aarch64_sve_fcvtnt_f16f32:
  case Intrinsic::aarch64_sve_fcvtnt_f32f64:
  case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
    return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp();

1497 case Intrinsic::aarch64_sve_fabd:
1498 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1499 case Intrinsic::aarch64_sve_fadd:
1500 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1501 .setMatchingIROpcode(Instruction::FAdd);
1502 case Intrinsic::aarch64_sve_fdiv:
1503 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1504 .setMatchingIROpcode(Instruction::FDiv);
1505 case Intrinsic::aarch64_sve_fmax:
1506 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1507 case Intrinsic::aarch64_sve_fmaxnm:
1508 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1509 case Intrinsic::aarch64_sve_fmin:
1510 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1511 case Intrinsic::aarch64_sve_fminnm:
1512 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1513 case Intrinsic::aarch64_sve_fmla:
1514 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1515 case Intrinsic::aarch64_sve_fmls:
1516 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1517 case Intrinsic::aarch64_sve_fmul:
1518 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1519 .setMatchingIROpcode(Instruction::FMul);
1520 case Intrinsic::aarch64_sve_fmulx:
1521 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1522 case Intrinsic::aarch64_sve_fnmla:
1523 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1524 case Intrinsic::aarch64_sve_fnmls:
1525 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1526 case Intrinsic::aarch64_sve_fsub:
1527 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1528 .setMatchingIROpcode(Instruction::FSub);
1529 case Intrinsic::aarch64_sve_add:
1530 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1531 .setMatchingIROpcode(Instruction::Add);
1532 case Intrinsic::aarch64_sve_mla:
1533 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1534 case Intrinsic::aarch64_sve_mls:
1535 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1536 case Intrinsic::aarch64_sve_mul:
1537 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1538 .setMatchingIROpcode(Instruction::Mul);
1539 case Intrinsic::aarch64_sve_sabd:
1540 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1541 case Intrinsic::aarch64_sve_sdiv:
1542 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1543 .setMatchingIROpcode(Instruction::SDiv);
1544 case Intrinsic::aarch64_sve_smax:
1545 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1546 case Intrinsic::aarch64_sve_smin:
1547 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1548 case Intrinsic::aarch64_sve_smulh:
1549 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1550 case Intrinsic::aarch64_sve_sub:
1551 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1552 .setMatchingIROpcode(Instruction::Sub);
1553 case Intrinsic::aarch64_sve_uabd:
1554 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1555 case Intrinsic::aarch64_sve_udiv:
1556 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1557 .setMatchingIROpcode(Instruction::UDiv);
1558 case Intrinsic::aarch64_sve_umax:
1559 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1560 case Intrinsic::aarch64_sve_umin:
1561 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1562 case Intrinsic::aarch64_sve_umulh:
1563 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1564 case Intrinsic::aarch64_sve_asr:
1565 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1566 .setMatchingIROpcode(Instruction::AShr);
1567 case Intrinsic::aarch64_sve_lsl:
1568 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1569 .setMatchingIROpcode(Instruction::Shl);
1570 case Intrinsic::aarch64_sve_lsr:
1571 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1572 .setMatchingIROpcode(Instruction::LShr);
1573 case Intrinsic::aarch64_sve_and:
1574 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1575 .setMatchingIROpcode(Instruction::And);
1576 case Intrinsic::aarch64_sve_bic:
1577 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1578 case Intrinsic::aarch64_sve_eor:
1579 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1580 .setMatchingIROpcode(Instruction::Xor);
1581 case Intrinsic::aarch64_sve_orr:
1582 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1583 .setMatchingIROpcode(Instruction::Or);
1584 case Intrinsic::aarch64_sve_shsub:
1585 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_shsub_u);
1586 case Intrinsic::aarch64_sve_shsubr:
1588 case Intrinsic::aarch64_sve_sqrshl:
1589 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1590 case Intrinsic::aarch64_sve_sqshl:
1591 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1592 case Intrinsic::aarch64_sve_sqsub:
1593 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1594 case Intrinsic::aarch64_sve_srshl:
1595 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1596 case Intrinsic::aarch64_sve_uhsub:
1597 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsub_u);
1598 case Intrinsic::aarch64_sve_uhsubr:
1599 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsubr_u);
1600 case Intrinsic::aarch64_sve_uqrshl:
1601 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1602 case Intrinsic::aarch64_sve_uqshl:
1603 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1604 case Intrinsic::aarch64_sve_uqsub:
1605 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1606 case Intrinsic::aarch64_sve_urshl:
1607 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1608
1609 case Intrinsic::aarch64_sve_add_u:
1610 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1611 Instruction::Add);
1612 case Intrinsic::aarch64_sve_and_u:
1613 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1614 Instruction::And);
1615 case Intrinsic::aarch64_sve_asr_u:
1616 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1617 Instruction::AShr);
1618 case Intrinsic::aarch64_sve_eor_u:
1619 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1620 Instruction::Xor);
1621 case Intrinsic::aarch64_sve_fadd_u:
1622 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1623 Instruction::FAdd);
1624 case Intrinsic::aarch64_sve_fdiv_u:
1625 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1626 Instruction::FDiv);
1627 case Intrinsic::aarch64_sve_fmul_u:
1628 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1629 Instruction::FMul);
1630 case Intrinsic::aarch64_sve_fsub_u:
1631 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1632 Instruction::FSub);
1633 case Intrinsic::aarch64_sve_lsl_u:
1634 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1635 Instruction::Shl);
1636 case Intrinsic::aarch64_sve_lsr_u:
1637 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1638 Instruction::LShr);
1639 case Intrinsic::aarch64_sve_mul_u:
1640 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1641 Instruction::Mul);
1642 case Intrinsic::aarch64_sve_orr_u:
1643 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1644 Instruction::Or);
1645 case Intrinsic::aarch64_sve_sdiv_u:
1646 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1647 Instruction::SDiv);
1648 case Intrinsic::aarch64_sve_sub_u:
1649 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1650 Instruction::Sub);
1651 case Intrinsic::aarch64_sve_udiv_u:
1652 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1653 Instruction::UDiv);
1654
1655 case Intrinsic::aarch64_sve_addqv:
1656 case Intrinsic::aarch64_sve_and_z:
1657 case Intrinsic::aarch64_sve_bic_z:
1658 case Intrinsic::aarch64_sve_brka_z:
1659 case Intrinsic::aarch64_sve_brkb_z:
1660 case Intrinsic::aarch64_sve_brkn_z:
1661 case Intrinsic::aarch64_sve_brkpa_z:
1662 case Intrinsic::aarch64_sve_brkpb_z:
1663 case Intrinsic::aarch64_sve_cntp:
1664 case Intrinsic::aarch64_sve_compact:
1665 case Intrinsic::aarch64_sve_eor_z:
1666 case Intrinsic::aarch64_sve_eorv:
1667 case Intrinsic::aarch64_sve_eorqv:
1668 case Intrinsic::aarch64_sve_nand_z:
1669 case Intrinsic::aarch64_sve_nor_z:
1670 case Intrinsic::aarch64_sve_orn_z:
1671 case Intrinsic::aarch64_sve_orr_z:
1672 case Intrinsic::aarch64_sve_orv:
1673 case Intrinsic::aarch64_sve_orqv:
1674 case Intrinsic::aarch64_sve_pnext:
1675 case Intrinsic::aarch64_sve_rdffr_z:
1676 case Intrinsic::aarch64_sve_saddv:
1677 case Intrinsic::aarch64_sve_uaddv:
1678 case Intrinsic::aarch64_sve_umaxv:
1679 case Intrinsic::aarch64_sve_umaxqv:
1680 case Intrinsic::aarch64_sve_cmpeq:
1681 case Intrinsic::aarch64_sve_cmpeq_wide:
1682 case Intrinsic::aarch64_sve_cmpge:
1683 case Intrinsic::aarch64_sve_cmpge_wide:
1684 case Intrinsic::aarch64_sve_cmpgt:
1685 case Intrinsic::aarch64_sve_cmpgt_wide:
1686 case Intrinsic::aarch64_sve_cmphi:
1687 case Intrinsic::aarch64_sve_cmphi_wide:
1688 case Intrinsic::aarch64_sve_cmphs:
1689 case Intrinsic::aarch64_sve_cmphs_wide:
1690 case Intrinsic::aarch64_sve_cmple_wide:
1691 case Intrinsic::aarch64_sve_cmplo_wide:
1692 case Intrinsic::aarch64_sve_cmpls_wide:
1693 case Intrinsic::aarch64_sve_cmplt_wide:
1694 case Intrinsic::aarch64_sve_cmpne:
1695 case Intrinsic::aarch64_sve_cmpne_wide:
1696 case Intrinsic::aarch64_sve_facge:
1697 case Intrinsic::aarch64_sve_facgt:
1698 case Intrinsic::aarch64_sve_fcmpeq:
1699 case Intrinsic::aarch64_sve_fcmpge:
1700 case Intrinsic::aarch64_sve_fcmpgt:
1701 case Intrinsic::aarch64_sve_fcmpne:
1702 case Intrinsic::aarch64_sve_fcmpuo:
1703 case Intrinsic::aarch64_sve_ld1:
1704 case Intrinsic::aarch64_sve_ld1_gather:
1705 case Intrinsic::aarch64_sve_ld1_gather_index:
1706 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1707 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1708 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1709 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1710 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1711 case Intrinsic::aarch64_sve_ld1q_gather_index:
1712 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1713 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1714 case Intrinsic::aarch64_sve_ld1ro:
1715 case Intrinsic::aarch64_sve_ld1rq:
1716 case Intrinsic::aarch64_sve_ld1udq:
1717 case Intrinsic::aarch64_sve_ld1uwq:
1718 case Intrinsic::aarch64_sve_ld2_sret:
1719 case Intrinsic::aarch64_sve_ld2q_sret:
1720 case Intrinsic::aarch64_sve_ld3_sret:
1721 case Intrinsic::aarch64_sve_ld3q_sret:
1722 case Intrinsic::aarch64_sve_ld4_sret:
1723 case Intrinsic::aarch64_sve_ld4q_sret:
1724 case Intrinsic::aarch64_sve_ldff1:
1725 case Intrinsic::aarch64_sve_ldff1_gather:
1726 case Intrinsic::aarch64_sve_ldff1_gather_index:
1727 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1728 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1729 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1730 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1731 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1732 case Intrinsic::aarch64_sve_ldnf1:
1733 case Intrinsic::aarch64_sve_ldnt1:
1734 case Intrinsic::aarch64_sve_ldnt1_gather:
1735 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1736 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1737 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1738 return SVEIntrinsicInfo::defaultZeroingOp();
1739
1740 case Intrinsic::aarch64_sve_prf:
1741 case Intrinsic::aarch64_sve_prfb_gather_index:
1742 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1743 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1744 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1745 case Intrinsic::aarch64_sve_prfd_gather_index:
1746 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1747 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1748 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1749 case Intrinsic::aarch64_sve_prfh_gather_index:
1750 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1751 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1752 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1753 case Intrinsic::aarch64_sve_prfw_gather_index:
1754 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1755 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1756 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1757 return SVEIntrinsicInfo::defaultVoidOp(0);
1758
1759 case Intrinsic::aarch64_sve_st1_scatter:
1760 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1761 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1762 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1763 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1764 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1765 case Intrinsic::aarch64_sve_st1dq:
1766 case Intrinsic::aarch64_sve_st1q_scatter_index:
1767 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1768 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1769 case Intrinsic::aarch64_sve_st1wq:
1770 case Intrinsic::aarch64_sve_stnt1:
1771 case Intrinsic::aarch64_sve_stnt1_scatter:
1772 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1773 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1774 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1775 return SVEIntrinsicInfo::defaultVoidOp(1);
1776 case Intrinsic::aarch64_sve_st2:
1777 case Intrinsic::aarch64_sve_st2q:
1778 return SVEIntrinsicInfo::defaultVoidOp(2);
1779 case Intrinsic::aarch64_sve_st3:
1780 case Intrinsic::aarch64_sve_st3q:
1781 return SVEIntrinsicInfo::defaultVoidOp(3);
1782 case Intrinsic::aarch64_sve_st4:
1783 case Intrinsic::aarch64_sve_st4q:
1784 return SVEIntrinsicInfo::defaultVoidOp(4);
1785 }
1786
1787 return SVEIntrinsicInfo();
1788}
1789
1790static bool isAllActivePredicate(Value *Pred) {
1791 Value *UncastedPred;
1792
1793 // Look through predicate casts that only remove lanes.
1794 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1795 m_Value(UncastedPred)))) {
1796 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1797 Pred = UncastedPred;
1798
1799 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1800 m_Value(UncastedPred))))
1801 // If the predicate has the same or fewer lanes than the uncasted
1802 // predicate then we know the casting has no effect.
1803 if (OrigPredTy->getMinNumElements() <=
1804 cast<ScalableVectorType>(UncastedPred->getType())
1805 ->getMinNumElements())
1806 Pred = UncastedPred;
1807 }
1808
1809 auto *C = dyn_cast<Constant>(Pred);
1810 return C && C->isAllOnesValue();
1811}
1812
1813// Simplify `V` by only considering the operations that affect active lanes.
1814// This function should only return existing Values or newly created Constants.
1815static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1816 auto *Dup = dyn_cast<IntrinsicInst>(V);
1817 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1818 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1819 return ConstantVector::getSplat(
1820 cast<VectorType>(V->getType())->getElementCount(),
1821 cast<Constant>(Dup->getOperand(2)));
1822
1823 return V;
1824}
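// For example (an illustrative sketch; %pg and %x are placeholder values):
// given %v = sve.dup(undef, %pg, i32 7), a consumer governed by the same %pg
// can treat %v as splat(i32 7), since only the lanes %pg keeps active are
// ever observed.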
1825
1826 static std::optional<Instruction *>
1827 simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
1828 const SVEIntrinsicInfo &IInfo) {
1829 const unsigned Opc = IInfo.getMatchingIROpode();
1830 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1831
1832 Value *Pg = II.getOperand(0);
1833 Value *Op1 = II.getOperand(1);
1834 Value *Op2 = II.getOperand(2);
1835 const DataLayout &DL = II.getDataLayout();
1836
1837 // Canonicalise constants to the RHS.
1838 if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() &&
1839 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1840 IC.replaceOperand(II, 1, Op2);
1841 IC.replaceOperand(II, 2, Op1);
1842 return &II;
1843 }
1844
1845 // Only active lanes matter when simplifying the operation.
1846 Op1 = stripInactiveLanes(Op1, Pg);
1847 Op2 = stripInactiveLanes(Op2, Pg);
1848
1849 Value *SimpleII;
1850 if (auto FII = dyn_cast<FPMathOperator>(&II))
1851 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1852 else
1853 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1854
1855 // An SVE intrinsic's result is always defined. However, this is not the case
1856 // for its equivalent IR instruction (e.g. when shifting by an amount more
1857 // than the data's bitwidth). Simplifications to an undefined result must be
1858 // ignored to preserve the intrinsic's expected behaviour.
1859 if (!SimpleII || isa<UndefValue>(SimpleII))
1860 return std::nullopt;
1861
1862 if (IInfo.inactiveLanesAreNotDefined())
1863 return IC.replaceInstUsesWith(II, SimpleII);
1864
1865 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1866
1867 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1868 if (SimpleII == Inactive)
1869 return IC.replaceInstUsesWith(II, SimpleII);
1870
1871 // Inactive lanes must be preserved.
1872 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1873 return IC.replaceInstUsesWith(II, SimpleII);
1874}
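// A worked example of the fold above (types elided): for
// %r = sve.mul(%pg, %x, splat(1)), simplifyBinOp returns %x. A merging mul
// also takes its inactive lanes from operand 1 (%x), so SimpleII matches
// Inactive and the whole call folds to %x without needing a select.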
1875
1876// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1877// to operations with less strict inactive lane requirements.
1878 static std::optional<Instruction *>
1879 simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1880 const SVEIntrinsicInfo &IInfo) {
1881 if (!IInfo.hasGoverningPredicate())
1882 return std::nullopt;
1883
1884 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1885
1886 // If there are no active lanes.
1887 if (match(OpPredicate, m_ZeroInt())) {
1888 if (IInfo.inactiveLanesTakenFromOperand())
1889 return IC.replaceInstUsesWith(
1890 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1891
1892 if (IInfo.inactiveLanesAreUnused()) {
1893 if (IInfo.resultIsZeroInitialized())
1894 IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1895
1896 return IC.eraseInstFromFunction(II);
1897 }
1898 }
1899
1900 // If there are no inactive lanes.
1901 if (isAllActivePredicate(OpPredicate)) {
1902 if (IInfo.hasOperandWithNoActiveLanes()) {
1903 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1904 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1905 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1906 }
1907
1908 if (IInfo.hasMatchingUndefIntrinsic()) {
1909 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1910 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1911 II.setCalledFunction(NewDecl);
1912 return &II;
1913 }
1914 }
1915
1916 // Operation specific simplifications.
1917 if (IInfo.hasMatchingIROpode() &&
1918 Instruction::isBinaryOp(IInfo.getMatchingIROpode()))
1919 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1920
1921 return std::nullopt;
1922}
1923
1924 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1925// => (binop (pred) (from_svbool _) (from_svbool _))
1926//
1927// The above transformation eliminates a `to_svbool` in the predicate
1928// operand of bitwise operation `binop` by narrowing the vector width of
1929// the operation. For example, it would convert a `<vscale x 16 x i1>
1930// and` into a `<vscale x 4 x i1> and`. This is profitable because
1931// to_svbool must zero the new lanes during widening, whereas
1932// from_svbool is free.
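// E.g., as a sketch with type mangling omitted:
//   %wide = and_z(to_svbool(%pg), %a, %b) ; <vscale x 16 x i1>
//   %res = from_svbool(%wide)             ; <vscale x 4 x i1>
// becomes
//   %res = and_z(%pg, from_svbool(%a), from_svbool(%b))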
1933 static std::optional<Instruction *>
1934 tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
1935 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1936 if (!BinOp)
1937 return std::nullopt;
1938
1939 auto IntrinsicID = BinOp->getIntrinsicID();
1940 switch (IntrinsicID) {
1941 case Intrinsic::aarch64_sve_and_z:
1942 case Intrinsic::aarch64_sve_bic_z:
1943 case Intrinsic::aarch64_sve_eor_z:
1944 case Intrinsic::aarch64_sve_nand_z:
1945 case Intrinsic::aarch64_sve_nor_z:
1946 case Intrinsic::aarch64_sve_orn_z:
1947 case Intrinsic::aarch64_sve_orr_z:
1948 break;
1949 default:
1950 return std::nullopt;
1951 }
1952
1953 auto BinOpPred = BinOp->getOperand(0);
1954 auto BinOpOp1 = BinOp->getOperand(1);
1955 auto BinOpOp2 = BinOp->getOperand(2);
1956
1957 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1958 if (!PredIntr ||
1959 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1960 return std::nullopt;
1961
1962 auto PredOp = PredIntr->getOperand(0);
1963 auto PredOpTy = cast<VectorType>(PredOp->getType());
1964 if (PredOpTy != II.getType())
1965 return std::nullopt;
1966
1967 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1968 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1969 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1970 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1971 if (BinOpOp1 == BinOpOp2)
1972 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1973 else
1974 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1975 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1976
1977 auto NarrowedBinOp =
1978 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1979 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1980}
1981
1982 static std::optional<Instruction *>
1983 instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
1984 // If the reinterpret instruction operand is a PHI Node
1985 if (isa<PHINode>(II.getArgOperand(0)))
1986 return processPhiNode(IC, II);
1987
1988 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1989 return BinOpCombine;
1990
1991 // Ignore converts to/from svcount_t.
1992 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1993 isa<TargetExtType>(II.getType()))
1994 return std::nullopt;
1995
1996 SmallVector<Instruction *, 32> CandidatesForRemoval;
1997 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1998
1999 const auto *IVTy = cast<VectorType>(II.getType());
2000
2001 // Walk the chain of conversions.
2002 while (Cursor) {
2003 // If the type of the cursor has fewer lanes than the final result, zeroing
2004 // must take place, which breaks the equivalence chain.
2005 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
2006 if (CursorVTy->getElementCount().getKnownMinValue() <
2007 IVTy->getElementCount().getKnownMinValue())
2008 break;
2009
2010 // If the cursor has the same type as I, it is a viable replacement.
2011 if (Cursor->getType() == IVTy)
2012 EarliestReplacement = Cursor;
2013
2014 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
2015
2016 // If this is not an SVE conversion intrinsic, this is the end of the chain.
2017 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
2018 Intrinsic::aarch64_sve_convert_to_svbool ||
2019 IntrinsicCursor->getIntrinsicID() ==
2020 Intrinsic::aarch64_sve_convert_from_svbool))
2021 break;
2022
2023 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
2024 Cursor = IntrinsicCursor->getOperand(0);
2025 }
2026
2027 // If no viable replacement in the conversion chain was found, there is
2028 // nothing to do.
2029 if (!EarliestReplacement)
2030 return std::nullopt;
2031
2032 return IC.replaceInstUsesWith(II, EarliestReplacement);
2033}
2034
2035static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
2036 IntrinsicInst &II) {
2037 // svsel(ptrue, x, y) => x
2038 auto *OpPredicate = II.getOperand(0);
2039 if (isAllActivePredicate(OpPredicate))
2040 return IC.replaceInstUsesWith(II, II.getOperand(1));
2041
2042 auto Select =
2043 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
2044 return IC.replaceInstUsesWith(II, Select);
2045}
2046
2047static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
2048 IntrinsicInst &II) {
2049 Value *Pg = II.getOperand(1);
2050
2051 // sve.dup(V, all_active, X) ==> splat(X)
2052 if (isAllActivePredicate(Pg)) {
2053 auto *RetTy = cast<ScalableVectorType>(II.getType());
2054 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2055 II.getArgOperand(2));
2056 return IC.replaceInstUsesWith(II, Splat);
2057 }
2058
2059 if (!match(Pg, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
2060 m_SpecificInt(AArch64SVEPredPattern::vl1))))
2061 return std::nullopt;
2062
2063 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
2064 Value *Insert = IC.Builder.CreateInsertElement(
2065 II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
2066 return IC.replaceInstUsesWith(II, Insert);
2067}
2068
2069static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
2070 IntrinsicInst &II) {
2071 // Replace DupX with a regular IR splat.
2072 auto *RetTy = cast<ScalableVectorType>(II.getType());
2073 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2074 II.getArgOperand(0));
2075 Splat->takeName(&II);
2076 return IC.replaceInstUsesWith(II, Splat);
2077}
2078
2079static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
2080 IntrinsicInst &II) {
2081 LLVMContext &Ctx = II.getContext();
2082
2083 if (!isAllActivePredicate(II.getArgOperand(0)))
2084 return std::nullopt;
2085
2086 // Check that we have a compare of zero..
2087 auto *SplatValue =
2088 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
2089 if (!SplatValue || !SplatValue->isZero())
2090 return std::nullopt;
2091
2092 // ..against a dupq
2093 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
2094 if (!DupQLane ||
2095 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
2096 return std::nullopt;
2097
2098 // Where the dupq is a lane 0 replicate of a vector insert
2099 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
2100 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
2101 return std::nullopt;
2102
2103 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
2104 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
2105 return std::nullopt;
2106
2107 // Where the vector insert is a fixed constant vector insert into undef at
2108 // index zero
2109 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
2110 return std::nullopt;
2111
2112 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
2113 return std::nullopt;
2114
2115 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
2116 if (!ConstVec)
2117 return std::nullopt;
2118
2119 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
2120 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
2121 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
2122 return std::nullopt;
2123
2124 unsigned NumElts = VecTy->getNumElements();
2125 unsigned PredicateBits = 0;
2126
2127 // Expand intrinsic operands to a 16-bit byte level predicate
2128 for (unsigned I = 0; I < NumElts; ++I) {
2129 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
2130 if (!Arg)
2131 return std::nullopt;
2132 if (!Arg->isZero())
2133 PredicateBits |= 1 << (I * (16 / NumElts));
2134 }
2135
2136 // If all bits are zero bail early with an empty predicate
2137 if (PredicateBits == 0) {
2138 auto *PFalse = Constant::getNullValue(II.getType());
2139 PFalse->takeName(&II);
2140 return IC.replaceInstUsesWith(II, PFalse);
2141 }
2142
2143 // Calculate largest predicate type used (where byte predicate is largest)
2144 unsigned Mask = 8;
2145 for (unsigned I = 0; I < 16; ++I)
2146 if ((PredicateBits & (1 << I)) != 0)
2147 Mask |= (I % 8);
2148
2149 unsigned PredSize = Mask & -Mask;
2150 auto *PredType = ScalableVectorType::get(
2151 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
2152
2153 // Ensure all relevant bits are set
2154 for (unsigned I = 0; I < 16; I += PredSize)
2155 if ((PredicateBits & (1 << I)) == 0)
2156 return std::nullopt;
2157
2158 auto *PTruePat =
2159 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2160 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2161 {PredType}, {PTruePat});
2162 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
2163 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
2164 auto *ConvertFromSVBool =
2165 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
2166 {II.getType()}, {ConvertToSVBool});
2167
2168 ConvertFromSVBool->takeName(&II);
2169 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2170}
2171
2172static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2173 IntrinsicInst &II) {
2174 Value *Pg = II.getArgOperand(0);
2175 Value *Vec = II.getArgOperand(1);
2176 auto IntrinsicID = II.getIntrinsicID();
2177 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2178
2179 // lastX(splat(X)) --> X
2180 if (auto *SplatVal = getSplatValue(Vec))
2181 return IC.replaceInstUsesWith(II, SplatVal);
2182
2183 // If x and/or y is a splat value then:
2184 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2185 Value *LHS, *RHS;
2186 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2187 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2188 auto *OldBinOp = cast<BinaryOperator>(Vec);
2189 auto OpC = OldBinOp->getOpcode();
2190 auto *NewLHS =
2191 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2192 auto *NewRHS =
2193 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2194 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
2195 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2196 return IC.replaceInstUsesWith(II, NewBinOp);
2197 }
2198 }
2199
2200 auto *C = dyn_cast<Constant>(Pg);
2201 if (IsAfter && C && C->isNullValue()) {
2202 // The intrinsic is extracting lane 0 so use an extract instead.
2203 auto *IdxTy = Type::getInt64Ty(II.getContext());
2204 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2205 Extract->insertBefore(II.getIterator());
2206 Extract->takeName(&II);
2207 return IC.replaceInstUsesWith(II, Extract);
2208 }
2209
2210 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2211 if (!IntrPG)
2212 return std::nullopt;
2213
2214 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2215 return std::nullopt;
2216
2217 const auto PTruePattern =
2218 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2219
2220 // Can the intrinsic's predicate be converted to a known constant index?
2221 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2222 if (!MinNumElts)
2223 return std::nullopt;
2224
2225 unsigned Idx = MinNumElts - 1;
2226 // Increment the index if extracting the element after the last active
2227 // predicate element.
2228 if (IsAfter)
2229 ++Idx;
2230
2231 // Ignore extracts whose index is larger than the known minimum vector
2232 // length. NOTE: This is an artificial constraint where we prefer to
2233 // maintain what the user asked for until an alternative is proven faster.
2234 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2235 if (Idx >= PgVTy->getMinNumElements())
2236 return std::nullopt;
2237
2238 // The intrinsic is extracting a fixed lane so use an extract instead.
2239 auto *IdxTy = Type::getInt64Ty(II.getContext());
2240 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2241 Extract->insertBefore(II.getIterator());
2242 Extract->takeName(&II);
2243 return IC.replaceInstUsesWith(II, Extract);
2244}
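// For instance, lastb(ptrue(vl4), %v) always reads the element at index 3
// and is emitted as extractelement %v, i64 3; lasta targets index 4, which
// must still lie within the known minimum vector length.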
2245
2246static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2247 IntrinsicInst &II) {
2248 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2249 // integer variant across a variety of micro-architectures. Replace scalar
2250 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2251 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2252 // depending on the micro-architecture, but has been observed as generally
2253 // being faster, particularly when the CLAST[AB] op is a loop-carried
2254 // dependency.
2255 Value *Pg = II.getArgOperand(0);
2256 Value *Fallback = II.getArgOperand(1);
2257 Value *Vec = II.getArgOperand(2);
2258 Type *Ty = II.getType();
2259
2260 if (!Ty->isIntegerTy())
2261 return std::nullopt;
2262
2263 Type *FPTy;
2264 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2265 default:
2266 return std::nullopt;
2267 case 16:
2268 FPTy = IC.Builder.getHalfTy();
2269 break;
2270 case 32:
2271 FPTy = IC.Builder.getFloatTy();
2272 break;
2273 case 64:
2274 FPTy = IC.Builder.getDoubleTy();
2275 break;
2276 }
2277
2278 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2279 auto *FPVTy = VectorType::get(
2280 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2281 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2282 auto *FPII = IC.Builder.CreateIntrinsic(
2283 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2284 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2285 return IC.replaceInstUsesWith(II, FPIItoInt);
2286}
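// E.g. a 32-bit integer clastb is rewritten as: bitcast the scalar fallback
// to float and the vector to a float vector, run the floating-point clastb,
// then bitcast the result back to i32.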
2287
2288static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2289 IntrinsicInst &II) {
2290 LLVMContext &Ctx = II.getContext();
2291 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2292 // can work with RDFFR_PP for ptest elimination.
2293 auto *AllPat =
2294 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2295 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2296 {II.getType()}, {AllPat});
2297 auto *RDFFR =
2298 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2299 RDFFR->takeName(&II);
2300 return IC.replaceInstUsesWith(II, RDFFR);
2301}
2302
2303 static std::optional<Instruction *>
2304 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2305 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2306
2307 if (Pattern == AArch64SVEPredPattern::all) {
2308 Value *Cnt = IC.Builder.CreateElementCount(
2309 II.getType(), ElementCount::getScalable(NumElts));
2310 Cnt->takeName(&II);
2311 return IC.replaceInstUsesWith(II, Cnt);
2312 }
2313
2314 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2315
2316 return MinNumElts && NumElts >= MinNumElts
2317 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2318 II, ConstantInt::get(II.getType(), MinNumElts)))
2319 : std::nullopt;
2320}
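// E.g. cntw(all) becomes a vscale-based element count (vscale x 4), and
// cntw(vl4) folds to the constant 4 because a single 128-bit granule already
// provides four 32-bit lanes.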
2321
2322 static std::optional<Instruction *>
2323 instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II,
2324 const AArch64Subtarget *ST) {
2325 if (!ST->isStreaming())
2326 return std::nullopt;
2327
2328 // In streaming mode, aarch64_sme_cntsd is equivalent to aarch64_sve_cntd
2329 // with SVEPredPattern::all
2330 Value *Cnt =
2331 IC.Builder.CreateElementCount(II.getType(), ElementCount::getScalable(2));
2332 Cnt->takeName(&II);
2333 return IC.replaceInstUsesWith(II, Cnt);
2334}
2335
2336static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2337 IntrinsicInst &II) {
2338 Value *PgVal = II.getArgOperand(0);
2339 Value *OpVal = II.getArgOperand(1);
2340
2341 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2342 // Later optimizations prefer this form.
2343 if (PgVal == OpVal &&
2344 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2345 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2346 Value *Ops[] = {PgVal, OpVal};
2347 Type *Tys[] = {PgVal->getType()};
2348
2349 auto *PTest =
2350 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2351 PTest->takeName(&II);
2352
2353 return IC.replaceInstUsesWith(II, PTest);
2354 }
2355
2356 auto *Pg = dyn_cast<IntrinsicInst>(PgVal);
2357 auto *Op = dyn_cast<IntrinsicInst>(OpVal);
2358
2359 if (!Pg || !Op)
2360 return std::nullopt;
2361
2362 Intrinsic::ID OpIID = Op->getIntrinsicID();
2363
2364 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2365 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2366 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2367 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2368 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2369
2370 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2371
2372 PTest->takeName(&II);
2373 return IC.replaceInstUsesWith(II, PTest);
2374 }
2375
2376 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
2377 // Later optimizations may rewrite sequence to use the flag-setting variant
2378 // of instruction X to remove PTEST.
2379 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2380 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2381 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2382 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2383 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2384 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2385 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2386 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2387 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2388 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2389 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2390 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2391 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2392 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2393 Type *Tys[] = {Pg->getType()};
2394
2395 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2396 PTest->takeName(&II);
2397
2398 return IC.replaceInstUsesWith(II, PTest);
2399 }
2400
2401 return std::nullopt;
2402}
2403
2404template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2405 static std::optional<Instruction *>
2406 instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
2407 bool MergeIntoAddendOp) {
2408 Value *P = II.getOperand(0);
2409 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2410 if (MergeIntoAddendOp) {
2411 AddendOp = II.getOperand(1);
2412 Mul = II.getOperand(2);
2413 } else {
2414 AddendOp = II.getOperand(2);
2415 Mul = II.getOperand(1);
2416 }
2417
2418 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
2419 m_Value(MulOp1))))
2420 return std::nullopt;
2421
2422 if (!Mul->hasOneUse())
2423 return std::nullopt;
2424
2425 Instruction *FMFSource = nullptr;
2426 if (II.getType()->isFPOrFPVectorTy()) {
2427 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2428 // Stop the combine when the flags on the inputs differ in case dropping
2429 // flags would lead to us missing out on more beneficial optimizations.
2430 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2431 return std::nullopt;
2432 if (!FAddFlags.allowContract())
2433 return std::nullopt;
2434 FMFSource = &II;
2435 }
2436
2437 CallInst *Res;
2438 if (MergeIntoAddendOp)
2439 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2440 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2441 else
2442 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2443 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2444
2445 return IC.replaceInstUsesWith(II, Res);
2446}
2447
2448 static std::optional<Instruction *>
2449 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2450 Value *Pred = II.getOperand(0);
2451 Value *PtrOp = II.getOperand(1);
2452 Type *VecTy = II.getType();
2453
2454 if (isAllActivePredicate(Pred)) {
2455 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2456 Load->copyMetadata(II);
2457 return IC.replaceInstUsesWith(II, Load);
2458 }
2459
2460 CallInst *MaskedLoad =
2461 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2462 Pred, ConstantAggregateZero::get(VecTy));
2463 MaskedLoad->copyMetadata(II);
2464 return IC.replaceInstUsesWith(II, MaskedLoad);
2465}
2466
2467 static std::optional<Instruction *>
2468 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2469 Value *VecOp = II.getOperand(0);
2470 Value *Pred = II.getOperand(1);
2471 Value *PtrOp = II.getOperand(2);
2472
2473 if (isAllActivePredicate(Pred)) {
2474 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2475 Store->copyMetadata(II);
2476 return IC.eraseInstFromFunction(II);
2477 }
2478
2479 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2480 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2481 MaskedStore->copyMetadata(II);
2482 return IC.eraseInstFromFunction(II);
2483}
2484
2485 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2486 switch (Intrinsic) {
2487 case Intrinsic::aarch64_sve_fmul_u:
2488 return Instruction::BinaryOps::FMul;
2489 case Intrinsic::aarch64_sve_fadd_u:
2490 return Instruction::BinaryOps::FAdd;
2491 case Intrinsic::aarch64_sve_fsub_u:
2492 return Instruction::BinaryOps::FSub;
2493 default:
2494 return Instruction::BinaryOpsEnd;
2495 }
2496}
2497
2498 static std::optional<Instruction *>
2499 instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
2500 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2501 if (II.isStrictFP())
2502 return std::nullopt;
2503
2504 auto *OpPredicate = II.getOperand(0);
2505 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2506 if (BinOpCode == Instruction::BinaryOpsEnd ||
2507 !isAllActivePredicate(OpPredicate))
2508 return std::nullopt;
2509 auto BinOp = IC.Builder.CreateBinOpFMF(
2510 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2511 return IC.replaceInstUsesWith(II, BinOp);
2512}
2513
2514static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2515 IntrinsicInst &II) {
2516 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2517 Intrinsic::aarch64_sve_mla>(
2518 IC, II, true))
2519 return MLA;
2520 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2521 Intrinsic::aarch64_sve_mad>(
2522 IC, II, false))
2523 return MAD;
2524 return std::nullopt;
2525}
2526
2527 static std::optional<Instruction *>
2528 instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
2529 if (auto FMLA =
2530 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2531 Intrinsic::aarch64_sve_fmla>(IC, II,
2532 true))
2533 return FMLA;
2534 if (auto FMAD =
2535 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2536 Intrinsic::aarch64_sve_fmad>(IC, II,
2537 false))
2538 return FMAD;
2539 if (auto FMLA =
2540 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2541 Intrinsic::aarch64_sve_fmla>(IC, II,
2542 true))
2543 return FMLA;
2544 return std::nullopt;
2545}
2546
2547 static std::optional<Instruction *>
2548 instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
2549 if (auto FMLA =
2550 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2551 Intrinsic::aarch64_sve_fmla>(IC, II,
2552 true))
2553 return FMLA;
2554 if (auto FMAD =
2555 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2556 Intrinsic::aarch64_sve_fmad>(IC, II,
2557 false))
2558 return FMAD;
2559 if (auto FMLA_U =
2560 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2561 Intrinsic::aarch64_sve_fmla_u>(
2562 IC, II, true))
2563 return FMLA_U;
2564 return instCombineSVEVectorBinOp(IC, II);
2565}
2566
2567 static std::optional<Instruction *>
2568 instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
2569 if (auto FMLS =
2570 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2571 Intrinsic::aarch64_sve_fmls>(IC, II,
2572 true))
2573 return FMLS;
2574 if (auto FMSB =
2575 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2576 Intrinsic::aarch64_sve_fnmsb>(
2577 IC, II, false))
2578 return FMSB;
2579 if (auto FMLS =
2580 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2581 Intrinsic::aarch64_sve_fmls>(IC, II,
2582 true))
2583 return FMLS;
2584 return std::nullopt;
2585}
2586
2587 static std::optional<Instruction *>
2588 instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
2589 if (auto FMLS =
2590 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2591 Intrinsic::aarch64_sve_fmls>(IC, II,
2592 true))
2593 return FMLS;
2594 if (auto FMSB =
2595 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2596 Intrinsic::aarch64_sve_fnmsb>(
2597 IC, II, false))
2598 return FMSB;
2599 if (auto FMLS_U =
2600 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2601 Intrinsic::aarch64_sve_fmls_u>(
2602 IC, II, true))
2603 return FMLS_U;
2604 return instCombineSVEVectorBinOp(IC, II);
2605}
2606
2607static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2608 IntrinsicInst &II) {
2609 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2610 Intrinsic::aarch64_sve_mls>(
2611 IC, II, true))
2612 return MLS;
2613 return std::nullopt;
2614}
2615
2616static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2617 IntrinsicInst &II) {
2618 Value *UnpackArg = II.getArgOperand(0);
2619 auto *RetTy = cast<ScalableVectorType>(II.getType());
2620 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2621 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2622
2623 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2624 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2625 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2626 ScalarArg =
2627 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2628 Value *NewVal =
2629 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2630 NewVal->takeName(&II);
2631 return IC.replaceInstUsesWith(II, NewVal);
2632 }
2633
2634 return std::nullopt;
2635}
2636static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2637 IntrinsicInst &II) {
2638 auto *OpVal = II.getOperand(0);
2639 auto *OpIndices = II.getOperand(1);
2640 VectorType *VTy = cast<VectorType>(II.getType());
2641
2642 // Check whether OpIndices is a constant splat value < minimal element count
2643 // of result.
2644 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2645 if (!SplatValue ||
2646 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2647 return std::nullopt;
2648
2649 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2650 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2651 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2652 auto *VectorSplat =
2653 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2654
2655 VectorSplat->takeName(&II);
2656 return IC.replaceInstUsesWith(II, VectorSplat);
2657}
2658
2659static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2660 IntrinsicInst &II) {
2661 Value *A, *B;
2662 Type *RetTy = II.getType();
2663 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2664 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2665
2666 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2667 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2668 if ((match(II.getArgOperand(0),
2669 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
2670 match(II.getArgOperand(1),
2671 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
2672 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2673 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2674 auto *TyA = cast<ScalableVectorType>(A->getType());
2675 if (TyA == B->getType() &&
2676 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
2677 auto *SubVec = IC.Builder.CreateInsertVector(
2678 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2679 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2680 TyA->getMinNumElements());
2681 ConcatVec->takeName(&II);
2682 return IC.replaceInstUsesWith(II, ConcatVec);
2683 }
2684 }
2685
2686 return std::nullopt;
2687}
2688
2689static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2690 IntrinsicInst &II) {
2691 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2692 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2693 Value *A, *B;
2694 if (match(II.getArgOperand(0),
2695 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
2696 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2697 m_Specific(A), m_Specific(B))))
2698 return IC.replaceInstUsesWith(
2699 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2700
2701 return std::nullopt;
2702}
2703
2704 static std::optional<Instruction *>
2705 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
2706 Value *Mask = II.getOperand(0);
2707 Value *BasePtr = II.getOperand(1);
2708 Value *Index = II.getOperand(2);
2709 Type *Ty = II.getType();
2710 Value *PassThru = ConstantAggregateZero::get(Ty);
2711
2712 // Contiguous gather => masked load.
2713 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2714 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2715 Value *IndexBase;
2716 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2717 m_Value(IndexBase), m_SpecificInt(1)))) {
2718 Align Alignment =
2719 BasePtr->getPointerAlignment(II.getDataLayout());
2720
2721 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2722 BasePtr, IndexBase);
2723 CallInst *MaskedLoad =
2724 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2725 MaskedLoad->takeName(&II);
2726 return IC.replaceInstUsesWith(II, MaskedLoad);
2727 }
2728
2729 return std::nullopt;
2730}
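// E.g. when the index vector is sve.index(%base, 1), the lanes address
// consecutive elements, so the gather is really a contiguous load from
// BasePtr + %base and becomes an llvm.masked.load with a zero passthru.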
2731
2732 static std::optional<Instruction *>
2733 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
2734 Value *Val = II.getOperand(0);
2735 Value *Mask = II.getOperand(1);
2736 Value *BasePtr = II.getOperand(2);
2737 Value *Index = II.getOperand(3);
2738 Type *Ty = Val->getType();
2739
2740 // Contiguous scatter => masked store.
2741 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2742 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2743 Value *IndexBase;
2744 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2745 m_Value(IndexBase), m_SpecificInt(1)))) {
2746 Align Alignment =
2747 BasePtr->getPointerAlignment(II.getDataLayout());
2748
2749 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2750 BasePtr, IndexBase);
2751 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2752
2753 return IC.eraseInstFromFunction(II);
2754 }
2755
2756 return std::nullopt;
2757}
2758
2759static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2760 IntrinsicInst &II) {
2761 Type *Int32Ty = IC.Builder.getInt32Ty();
2762 Value *Pred = II.getOperand(0);
2763 Value *Vec = II.getOperand(1);
2764 Value *DivVec = II.getOperand(2);
2765
2766 Value *SplatValue = getSplatValue(DivVec);
2767 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2768 if (!SplatConstantInt)
2769 return std::nullopt;
2770
2771 APInt Divisor = SplatConstantInt->getValue();
2772 const int64_t DivisorValue = Divisor.getSExtValue();
2773 if (DivisorValue == -1)
2774 return std::nullopt;
2775 if (DivisorValue == 1)
2776 IC.replaceInstUsesWith(II, Vec);
2777
2778 if (Divisor.isPowerOf2()) {
2779 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2780 auto ASRD = IC.Builder.CreateIntrinsic(
2781 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2782 return IC.replaceInstUsesWith(II, ASRD);
2783 }
2784 if (Divisor.isNegatedPowerOf2()) {
2785 Divisor.negate();
2786 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2787 auto ASRD = IC.Builder.CreateIntrinsic(
2788 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2789 auto NEG = IC.Builder.CreateIntrinsic(
2790 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2791 return IC.replaceInstUsesWith(II, NEG);
2792 }
2793
2794 return std::nullopt;
2795}
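// E.g. sdiv(%pg, %x, splat(4)) becomes asrd(%pg, %x, 2), and a divisor of
// -4 yields the same asrd followed by a predicated negate of the result.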
2796
2797bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2798 size_t VecSize = Vec.size();
2799 if (VecSize == 1)
2800 return true;
2801 if (!isPowerOf2_64(VecSize))
2802 return false;
2803 size_t HalfVecSize = VecSize / 2;
2804
2805 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2806 RHS != Vec.end(); LHS++, RHS++) {
2807 if (*LHS != nullptr && *RHS != nullptr) {
2808 if (*LHS == *RHS)
2809 continue;
2810 else
2811 return false;
2812 }
2813 if (!AllowPoison)
2814 return false;
2815 if (*LHS == nullptr && *RHS != nullptr)
2816 *LHS = *RHS;
2817 }
2818
2819 Vec.resize(HalfVecSize);
2820 SimplifyValuePattern(Vec, AllowPoison);
2821 return true;
2822}
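// E.g. {a, b, a, b} halves to {a, b}. With AllowPoison, an unset (poison)
// element takes the value of its pair, so {a, <poison>, a, b} also
// simplifies to {a, b}.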
2823
2824// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2825// to dupqlane(f64(C)) where C is A concatenated with B
2826static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2827 IntrinsicInst &II) {
2828 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2829 if (!match(II.getOperand(0),
2830 m_Intrinsic<Intrinsic::vector_insert>(
2831 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2832 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2833 return std::nullopt;
2834 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2835
2836 // Insert the scalars into a container ordered by InsertElement index
2837 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2838 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2839 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2840 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2841 CurrentInsertElt = InsertElt->getOperand(0);
2842 }
2843
2844 bool AllowPoison =
2845 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2846 if (!SimplifyValuePattern(Elts, AllowPoison))
2847 return std::nullopt;
2848
2849 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2850 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2851 for (size_t I = 0; I < Elts.size(); I++) {
2852 if (Elts[I] == nullptr)
2853 continue;
2854 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2855 IC.Builder.getInt64(I));
2856 }
2857 if (InsertEltChain == nullptr)
2858 return std::nullopt;
2859
2860 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2861 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2862 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2863 // be narrowed back to the original type.
2864 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2865 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2866 IIScalableTy->getMinNumElements() /
2867 PatternWidth;
2868
2869 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2870 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2871 auto *WideShuffleMaskTy =
2872 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2873
2874 auto InsertSubvector = IC.Builder.CreateInsertVector(
2875 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2876 uint64_t(0));
2877 auto WideBitcast =
2878 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2879 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2880 auto WideShuffle = IC.Builder.CreateShuffleVector(
2881 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2882 auto NarrowBitcast =
2883 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2884
2885 return IC.replaceInstUsesWith(II, NarrowBitcast);
2886}
2887
2888static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2889 IntrinsicInst &II) {
2890 Value *A = II.getArgOperand(0);
2891 Value *B = II.getArgOperand(1);
2892 if (A == B)
2893 return IC.replaceInstUsesWith(II, A);
2894
2895 return std::nullopt;
2896}
2897
2898static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2899 IntrinsicInst &II) {
2900 Value *Pred = II.getOperand(0);
2901 Value *Vec = II.getOperand(1);
2902 Value *Shift = II.getOperand(2);
2903
2904 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2905 Value *AbsPred, *MergedValue;
2906 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2907 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2908 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2909 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2910
2911 return std::nullopt;
2912
2913 // Transform is valid if any of the following are true:
2914 // * The ABS merge value is an undef or non-negative
2915 // * The ABS predicate is all active
2916 // * The ABS predicate and the SRSHL predicates are the same
2917 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2918 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2919 return std::nullopt;
2920
2921 // Only valid when the shift amount is non-negative, otherwise the rounding
2922 // behaviour of SRSHL cannot be ignored.
2923 if (!match(Shift, m_NonNegative()))
2924 return std::nullopt;
2925
2926 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2927 {II.getType()}, {Pred, Vec, Shift});
2928
2929 return IC.replaceInstUsesWith(II, LSL);
2930}
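// E.g. given %v = abs(undef, %pg, %x) and a known non-negative splat shift,
// srshl(%pg, %v, splat(2)) is rewritten as the plain left shift
// lsl(%pg, %v, splat(2)).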
2931
2932static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2933 IntrinsicInst &II) {
2934 Value *Vec = II.getOperand(0);
2935
2936 if (getSplatValue(Vec) == II.getOperand(1))
2937 return IC.replaceInstUsesWith(II, Vec);
2938
2939 return std::nullopt;
2940}
2941
2942static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2943 IntrinsicInst &II) {
2944 // If this barrier is post-dominated by an identical one, we can remove it.
2945 auto *NI = II.getNextNode();
2946 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2947 auto CanSkipOver = [](Instruction *I) {
2948 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2949 };
2950 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2951 auto *NIBB = NI->getParent();
2952 NI = NI->getNextNode();
2953 if (!NI) {
2954 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2955 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2956 else
2957 break;
2958 }
2959 }
2960 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2961 if (NextII && II.isIdenticalTo(NextII))
2962 return IC.eraseInstFromFunction(II);
2963
2964 return std::nullopt;
2965}
2966
2967static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2968 IntrinsicInst &II) {
2969 return IC.replaceInstUsesWith(
2970 II,
2971 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2972 {II.getType(), II.getOperand(0)->getType()},
2973 {II.getOperand(0), II.getOperand(1)}));
2974}
2975
2976static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2977 IntrinsicInst &II) {
2978 if (match(II.getOperand(0), m_ConstantInt<AArch64SVEPredPattern::all>()))
2979 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2980 return std::nullopt;
2981}
2982
2983 static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2984 IntrinsicInst &II,
2985 unsigned NumBits) {
2986 Value *Passthru = II.getOperand(0);
2987 Value *Pg = II.getOperand(1);
2988 Value *Op = II.getOperand(2);
2989
2990 // Convert UXT[BHW] to AND.
2991 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2992 auto *Ty = cast<VectorType>(II.getType());
2993 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2994 auto *Mask = ConstantInt::get(Ty, MaskValue);
2995 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2996 {Pg, Op, Mask});
2997 return IC.replaceInstUsesWith(II, And);
2998 }
2999
3000 return std::nullopt;
3001}
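// E.g. on <vscale x 4 x i32>, uxtb(undef, %pg, %x) becomes
// and_u(%pg, %x, splat(i32 0xff)): zero-extending the low byte of each lane
// is just a mask of its low 8 bits.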
3002
3003 static std::optional<Instruction *>
3004 instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II) {
3005 SMEAttrs FnSMEAttrs(*II.getFunction());
3006 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
3007 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
3008 return IC.replaceInstUsesWith(
3009 II, ConstantInt::getBool(II.getType(), IsStreaming));
3010 return std::nullopt;
3011}
3012
3013 std::optional<Instruction *>
3014 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
3015 IntrinsicInst &II) const {
3016 const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II);
3017 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
3018 return I;
3019
3020 Intrinsic::ID IID = II.getIntrinsicID();
3021 switch (IID) {
3022 default:
3023 break;
3024 case Intrinsic::aarch64_dmb:
3025 return instCombineDMB(IC, II);
3026 case Intrinsic::aarch64_neon_fmaxnm:
3027 case Intrinsic::aarch64_neon_fminnm:
3028 return instCombineMaxMinNM(IC, II);
3029 case Intrinsic::aarch64_sve_convert_from_svbool:
3030 return instCombineConvertFromSVBool(IC, II);
3031 case Intrinsic::aarch64_sve_dup:
3032 return instCombineSVEDup(IC, II);
3033 case Intrinsic::aarch64_sve_dup_x:
3034 return instCombineSVEDupX(IC, II);
3035 case Intrinsic::aarch64_sve_cmpne:
3036 case Intrinsic::aarch64_sve_cmpne_wide:
3037 return instCombineSVECmpNE(IC, II);
3038 case Intrinsic::aarch64_sve_rdffr:
3039 return instCombineRDFFR(IC, II);
3040 case Intrinsic::aarch64_sve_lasta:
3041 case Intrinsic::aarch64_sve_lastb:
3042 return instCombineSVELast(IC, II);
3043 case Intrinsic::aarch64_sve_clasta_n:
3044 case Intrinsic::aarch64_sve_clastb_n:
3045 return instCombineSVECondLast(IC, II);
3046 case Intrinsic::aarch64_sve_cntd:
3047 return instCombineSVECntElts(IC, II, 2);
3048 case Intrinsic::aarch64_sve_cntw:
3049 return instCombineSVECntElts(IC, II, 4);
3050 case Intrinsic::aarch64_sve_cnth:
3051 return instCombineSVECntElts(IC, II, 8);
3052 case Intrinsic::aarch64_sve_cntb:
3053 return instCombineSVECntElts(IC, II, 16);
3054 case Intrinsic::aarch64_sme_cntsd:
3055 return instCombineSMECntsd(IC, II, ST);
3056 case Intrinsic::aarch64_sve_ptest_any:
3057 case Intrinsic::aarch64_sve_ptest_first:
3058 case Intrinsic::aarch64_sve_ptest_last:
3059 return instCombineSVEPTest(IC, II);
3060 case Intrinsic::aarch64_sve_fadd:
3061 return instCombineSVEVectorFAdd(IC, II);
3062 case Intrinsic::aarch64_sve_fadd_u:
3063 return instCombineSVEVectorFAddU(IC, II);
3064 case Intrinsic::aarch64_sve_fmul_u:
3065 return instCombineSVEVectorBinOp(IC, II);
3066 case Intrinsic::aarch64_sve_fsub:
3067 return instCombineSVEVectorFSub(IC, II);
3068 case Intrinsic::aarch64_sve_fsub_u:
3069 return instCombineSVEVectorFSubU(IC, II);
3070 case Intrinsic::aarch64_sve_add:
3071 return instCombineSVEVectorAdd(IC, II);
3072 case Intrinsic::aarch64_sve_add_u:
3073 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3074 Intrinsic::aarch64_sve_mla_u>(
3075 IC, II, true);
3076 case Intrinsic::aarch64_sve_sub:
3077 return instCombineSVEVectorSub(IC, II);
3078 case Intrinsic::aarch64_sve_sub_u:
3079 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3080 Intrinsic::aarch64_sve_mls_u>(
3081 IC, II, true);
3082 case Intrinsic::aarch64_sve_tbl:
3083 return instCombineSVETBL(IC, II);
3084 case Intrinsic::aarch64_sve_uunpkhi:
3085 case Intrinsic::aarch64_sve_uunpklo:
3086 case Intrinsic::aarch64_sve_sunpkhi:
3087 case Intrinsic::aarch64_sve_sunpklo:
3088 return instCombineSVEUnpack(IC, II);
3089 case Intrinsic::aarch64_sve_uzp1:
3090 return instCombineSVEUzp1(IC, II);
3091 case Intrinsic::aarch64_sve_zip1:
3092 case Intrinsic::aarch64_sve_zip2:
3093 return instCombineSVEZip(IC, II);
3094 case Intrinsic::aarch64_sve_ld1_gather_index:
3095 return instCombineLD1GatherIndex(IC, II);
3096 case Intrinsic::aarch64_sve_st1_scatter_index:
3097 return instCombineST1ScatterIndex(IC, II);
3098 case Intrinsic::aarch64_sve_ld1:
3099 return instCombineSVELD1(IC, II, DL);
3100 case Intrinsic::aarch64_sve_st1:
3101 return instCombineSVEST1(IC, II, DL);
3102 case Intrinsic::aarch64_sve_sdiv:
3103 return instCombineSVESDIV(IC, II);
3104 case Intrinsic::aarch64_sve_sel:
3105 return instCombineSVESel(IC, II);
3106 case Intrinsic::aarch64_sve_srshl:
3107 return instCombineSVESrshl(IC, II);
3108 case Intrinsic::aarch64_sve_dupq_lane:
3109 return instCombineSVEDupqLane(IC, II);
3110 case Intrinsic::aarch64_sve_insr:
3111 return instCombineSVEInsr(IC, II);
3112 case Intrinsic::aarch64_sve_whilelo:
3113 return instCombineWhilelo(IC, II);
3114 case Intrinsic::aarch64_sve_ptrue:
3115 return instCombinePTrue(IC, II);
3116 case Intrinsic::aarch64_sve_uxtb:
3117 return instCombineSVEUxt(IC, II, 8);
3118 case Intrinsic::aarch64_sve_uxth:
3119 return instCombineSVEUxt(IC, II, 16);
3120 case Intrinsic::aarch64_sve_uxtw:
3121 return instCombineSVEUxt(IC, II, 32);
3122 case Intrinsic::aarch64_sme_in_streaming_mode:
3123 return instCombineInStreamingMode(IC, II);
3124 }
3125
3126 return std::nullopt;
3127}
3128
3129 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
3130 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3131 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3132 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3133 SimplifyAndSetOp) const {
3134 switch (II.getIntrinsicID()) {
3135 default:
3136 break;
3137 case Intrinsic::aarch64_neon_fcvtxn:
3138 case Intrinsic::aarch64_neon_rshrn:
3139 case Intrinsic::aarch64_neon_sqrshrn:
3140 case Intrinsic::aarch64_neon_sqrshrun:
3141 case Intrinsic::aarch64_neon_sqshrn:
3142 case Intrinsic::aarch64_neon_sqshrun:
3143 case Intrinsic::aarch64_neon_sqxtn:
3144 case Intrinsic::aarch64_neon_sqxtun:
3145 case Intrinsic::aarch64_neon_uqrshrn:
3146 case Intrinsic::aarch64_neon_uqshrn:
3147 case Intrinsic::aarch64_neon_uqxtn:
3148 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3149 break;
3150 }
3151
3152 return std::nullopt;
3153}
3154
3155 bool AArch64TTIImpl::enableScalableVectorization() const {
3156 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3157 EnableScalableAutovecInStreamingMode);
3158 }
3159
3160 TypeSize
3161 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
3162 switch (K) {
3163 case TargetTransformInfo::RGK_Scalar:
3164 return TypeSize::getFixed(64);
3165 case TargetTransformInfo::RGK_FixedWidthVector:
3166 if (ST->useSVEForFixedLengthVectors() &&
3167 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3168 return TypeSize::getFixed(
3169 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3170 else if (ST->isNeonAvailable())
3171 return TypeSize::getFixed(128);
3172 else
3173 return TypeSize::getFixed(0);
3174 case TargetTransformInfo::RGK_ScalableVector:
3175 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3176 EnableScalableAutovecInStreamingMode))
3177 return TypeSize::getScalable(128);
3178 else
3179 return TypeSize::getScalable(0);
3180 }
3181 llvm_unreachable("Unsupported register kind");
3182}
3183
3184bool AArch64TTIImpl::isSingleExtWideningInstruction(
3185 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3186 Type *SrcOverrideTy) const {
3187 // A helper that returns a vector type from the given type. The element
3188 // count of DstTy determines the vector width.
3189 auto toVectorTy = [&](Type *ArgTy) {
3190 return VectorType::get(ArgTy->getScalarType(),
3191 cast<VectorType>(DstTy)->getElementCount());
3192 };
3193
3194 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3195 // i32, i64]. SVE doesn't generally have the same set of instructions to
3196 // perform an extend with the add/sub/mul. There are SMULLB style
3197 // instructions, but they operate on top/bottom, requiring some sort of lane
3198 // interleaving to be used with zext/sext.
3199 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3200 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3201 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3202 return false;
3203
3204 Type *SrcTy = SrcOverrideTy;
3205 switch (Opcode) {
3206 case Instruction::Add: // UADDW(2), SADDW(2).
3207 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3208 // The second operand needs to be an extend
3209 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3210 if (!SrcTy)
3211 SrcTy =
3212 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3213 break;
3214 }
3215
3216 if (Opcode == Instruction::Sub)
3217 return false;
3218
3219 // UADDW(2), SADDW(2) can be commuted.
3220 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3221 if (!SrcTy)
3222 SrcTy =
3223 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3224 break;
3225 }
3226 return false;
3227 }
3228 default:
3229 return false;
3230 }
3231
3232 // Legalize the destination type and ensure it can be used in a widening
3233 // operation.
3234 auto DstTyL = getTypeLegalizationCost(DstTy);
3235 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3236 return false;
3237
3238 // Legalize the source type and ensure it can be used in a widening
3239 // operation.
3240 assert(SrcTy && "Expected some SrcTy");
3241 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3242 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3243 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3244 return false;
3245
3246 // Get the total number of vector elements in the legalized types.
3247 InstructionCost NumDstEls =
3248 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3249 InstructionCost NumSrcEls =
3250 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3251
3252 // Return true if the legalized types have the same number of vector elements
3253 // and the destination element type size is twice that of the source type.
3254 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3255}
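// Illustrative IR accepted by the check above: the sub consumes the extend
// directly, so it can be lowered to a single ssubw and the sext is free:
//   %e = sext <4 x i16> %b to <4 x i32>
//   %r = sub <4 x i32> %a, %e   ; -> ssubw v0.4s, v0.4s, v1.4h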
3256
3257Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3258 ArrayRef<const Value *> Args,
3259 Type *SrcOverrideTy) const {
3260 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3261 Opcode != Instruction::Mul)
3262 return nullptr;
3263
3264 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3265 // i32, i64]. SVE doesn't generally have the same set of instructions to
3266 // perform an extend with the add/sub/mul. There are SMULLB style
3267 // instructions, but they operate on top/bottom, requiring some sort of lane
3268 // interleaving to be used with zext/sext.
3269 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3270 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3271 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3272 return nullptr;
3273
3274 auto getScalarSizeWithOverride = [&](const Value *V) {
3275 if (SrcOverrideTy)
3276 return SrcOverrideTy->getScalarSizeInBits();
3277 return cast<Instruction>(V)
3278 ->getOperand(0)
3279 ->getType()
3280 ->getScalarSizeInBits();
3281 };
3282
3283 unsigned MaxEltSize = 0;
3284 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3285 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3286 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3287 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3288 MaxEltSize = std::max(EltSize0, EltSize1);
3289 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3290 isa<SExtInst, ZExtInst>(Args[1])) {
3291 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3292 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3293 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3294 // enough.
3295 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3296 return nullptr;
3297 MaxEltSize = DstEltSize / 2;
3298 } else if (Opcode == Instruction::Mul &&
3299 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3300 // If one of the operands is a Zext and the other has enough zero bits
3301 // to be treated as unsigned, we can still generate a umull, meaning the
3302 // zext is free.
3303 KnownBits Known =
3304 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3305 if (Args[0]->getType()->getScalarSizeInBits() -
3306 Known.Zero.countLeadingOnes() >
3307 DstTy->getScalarSizeInBits() / 2)
3308 return nullptr;
3309
3310 MaxEltSize =
3311 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3312 } else
3313 return nullptr;
3314
3315 if (MaxEltSize * 2 > DstEltSize)
3316 return nullptr;
3317
3318 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3319 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3320 return nullptr;
3321 return ExtTy;
3322}
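// Illustrative IR accepted by the check above: both operands are sign extends
// from half of the destination element width, so the mul can be lowered to
// smull and ExtTy (here <8 x i16>) is the narrowest type the operation needs:
//   %a = sext <8 x i8> %x to <8 x i16>
//   %b = sext <8 x i8> %y to <8 x i16>
//   %r = mul <8 x i16> %a, %b   ; -> smull v0.8h, v1.8b, v2.8b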
3323
3324// s/urhadd instructions implement the following pattern, making the
3325// extends free:
3326// %x = add ((zext i8 -> i16), 1)
3327// %y = (zext i8 -> i16)
3328// trunc i16 (lshr (add %x, %y), 1) -> i8
3329//
3330bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
3331 Type *Src) const {
3332 // The source should be a legal vector type.
3333 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3334 (Src->isScalableTy() && !ST->hasSVE2()))
3335 return false;
3336
3337 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3338 return false;
3339
3340 // Look for trunc/lshr/add before trying to match the pattern.
3341 const Instruction *Add = ExtUser;
3342 auto *AddUser =
3343 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3344 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3345 Add = AddUser;
3346
3347 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3348 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3349 return false;
3350
3351 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3352 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3353 Src->getScalarSizeInBits() !=
3354 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3355 return false;
3356
3357 // Try to match the whole pattern. Ext could be either the first or second
3358 // m_ZExtOrSExt matched.
3359 Instruction *Ex1, *Ex2;
3360 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3361 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3362 return false;
3363
3364 // Ensure both extends are of the same type
3365 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3366 Ex1->getOpcode() == Ex2->getOpcode())
3367 return true;
3368
3369 return false;
3370}
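// Concrete instance of the pattern above (illustrative): the whole sequence
// computes a rounding-up halving add and lowers to a single urhadd, so both
// zexts end up free:
//   %xe = zext <8 x i8> %x to <8 x i16>
//   %ye = zext <8 x i8> %y to <8 x i16>
//   %s  = add <8 x i16> %xe, splat (i16 1)
//   %t  = add <8 x i16> %s, %ye
//   %h  = lshr <8 x i16> %t, splat (i16 1)
//   %r  = trunc <8 x i16> %h to <8 x i8>   ; -> urhadd v0.8b, v1.8b, v2.8b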
3371
3372InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3373 Type *Src,
3374 TTI::CastContextHint CCH,
3375 TTI::TargetCostKind CostKind,
3376 const Instruction *I) const {
3377 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3378 assert(ISD && "Invalid opcode");
3379 // If the cast is observable, and it is used by a widening instruction (e.g.,
3380 // uaddl, saddw, etc.), it may be free.
3381 if (I && I->hasOneUser()) {
3382 auto *SingleUser = cast<Instruction>(*I->user_begin());
3383 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3384 if (Type *ExtTy = isBinExtWideningInstruction(
3385 SingleUser->getOpcode(), Dst, Operands,
3386 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3387 // The cost from Src->Src*2 needs to be added if required; the cost from
3388 // Src*2->ExtTy is free.
3389 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3390 Type *DoubleSrcTy =
3391 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3392 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3393 TTI::CastContextHint::None, CostKind);
3394 }
3395
3396 return 0;
3397 }
3398
3399 if (isSingleExtWideningInstruction(
3400 SingleUser->getOpcode(), Dst, Operands,
3401 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3402 // For adds, only count the second operand as free if both operands are
3403 // extends but not the same operation (i.e., both operands are not free
3404 // in add(sext, zext)).
3405 if (SingleUser->getOpcode() == Instruction::Add) {
3406 if (I == SingleUser->getOperand(1) ||
3407 (isa<CastInst>(SingleUser->getOperand(1)) &&
3408 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3409 return 0;
3410 } else {
3411 // Others are free so long as isSingleExtWideningInstruction
3412 // returned true.
3413 return 0;
3414 }
3415 }
3416
3417 // The cast will be free for the s/urhadd instructions
3418 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3419 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3420 return 0;
3421 }
3422
3423 EVT SrcTy = TLI->getValueType(DL, Src);
3424 EVT DstTy = TLI->getValueType(DL, Dst);
3425
3426 if (!SrcTy.isSimple() || !DstTy.isSimple())
3427 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3428
3429 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3430 // we use fcvtx under SVE2. Give them invalid costs.
3431 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3432 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3433 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3434 return InstructionCost::getInvalid();
3435
3436 static const TypeConversionCostTblEntry BF16Tbl[] = {
3437 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3438 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3439 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3440 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3441 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3442 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3443 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3444 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3445 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3446 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3447 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3448 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3449 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3450 };
3451
3452 if (ST->hasBF16())
3453 if (const auto *Entry = ConvertCostTableLookup(
3454 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3455 return Entry->Cost;
3456
3457 // We have to estimate the cost of a fixed-length operation on SVE
3458 // registers using the number of registers required to represent the
3459 // fixed-length type in SVE registers.
3460 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3461 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3462 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3463 ST->useSVEForFixedLengthVectors(WiderTy)) {
3464 std::pair<InstructionCost, MVT> LT =
3465 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3466 unsigned NumElements =
3467 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3468 return LT.first *
3469 getCastInstrCost(
3470 Opcode,
3471 ScalableVectorType::get(Dst->getScalarType(), NumElements),
3472 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3473 CostKind, I);
3474 }
3475
3476 // Symbolic constants for the SVE sitofp/uitofp entries in the table below.
3477 // The cost of unpacking twice is artificially increased for now in order
3478 // to avoid regressions against NEON, which will use tbl instructions directly
3479 // instead of multiple layers of [s|u]unpk[lo|hi].
3480 // We use the unpacks in cases where the destination type is illegal and
3481 // requires splitting of the input, even if the input type itself is legal.
3482 const unsigned int SVE_EXT_COST = 1;
3483 const unsigned int SVE_FCVT_COST = 1;
3484 const unsigned int SVE_UNPACK_ONCE = 4;
3485 const unsigned int SVE_UNPACK_TWICE = 16;
3486
3487 static const TypeConversionCostTblEntry ConversionTbl[] = {
3488 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3489 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3490 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3491 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3492 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3493 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3494 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3495 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3496 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3497 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3498 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3499 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3500 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3501 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3502 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3503 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3504 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3505 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3506 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3507 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3508
3509 // Truncations on nxvmiN
3510 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3511 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3512 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3513 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3514 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3515 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3516 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3517 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3518 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3519 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3520 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3521 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3522 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3523 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3524 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3525 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3526 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3527 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3528 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3529 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3530 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3531 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3532 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3533 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3534 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3535 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3536 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3537 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3538 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3539 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3540 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3541 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3542 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3543
3544 // The number of shll instructions for the extension.
3545 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3546 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3547 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3548 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3549 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3550 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3551 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3552 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3553 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3554 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3555 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3556 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3557 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3558 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3559 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3560 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3561
3562 // FP Ext and trunc
3563 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3564 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3565 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3566 // FP16
3567 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3568 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3569 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3570 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3571 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3572 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3573 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3574 // BF16 (uses shift)
3575 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3576 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3577 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3578 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3579 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3580 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3581 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3582 // FP Ext and trunc
3583 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3584 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3585 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3586 // FP16
3587 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3588 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3589 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3590 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3591 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3592 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3593 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3594 // BF16 (more complex, with +bf16 is handled above)
3595 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3596 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3597 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3598 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3599 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3600 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3601 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3602 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3603
3604 // LowerVectorINT_TO_FP:
3605 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3606 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3607 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3608 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3609 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3610 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3611
3612 // SVE: to nxv2f16
3613 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3614 SVE_EXT_COST + SVE_FCVT_COST},
3615 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3616 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3617 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3618 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3619 SVE_EXT_COST + SVE_FCVT_COST},
3620 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3621 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3622 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3623
3624 // SVE: to nxv4f16
3625 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3626 SVE_EXT_COST + SVE_FCVT_COST},
3627 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3628 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3629 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3630 SVE_EXT_COST + SVE_FCVT_COST},
3631 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3632 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3633
3634 // SVE: to nxv8f16
3635 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3636 SVE_EXT_COST + SVE_FCVT_COST},
3637 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3638 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3639 SVE_EXT_COST + SVE_FCVT_COST},
3640 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3641
3642 // SVE: to nxv16f16
3643 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3644 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3645 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3646 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3647
3648 // Complex: to v2f32
3649 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3650 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3651 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3652 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3653
3654 // SVE: to nxv2f32
3655 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3656 SVE_EXT_COST + SVE_FCVT_COST},
3657 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3658 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3659 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3660 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3661 SVE_EXT_COST + SVE_FCVT_COST},
3662 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3663 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3664 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3665
3666 // Complex: to v4f32
3667 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3668 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3669 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3670 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3671
3672 // SVE: to nxv4f32
3673 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3674 SVE_EXT_COST + SVE_FCVT_COST},
3675 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3676 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3677 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3678 SVE_EXT_COST + SVE_FCVT_COST},
3679 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3680 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3681
3682 // Complex: to v8f32
3683 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3684 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3685 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3686 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3687
3688 // SVE: to nxv8f32
3689 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3690 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3691 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3692 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3693 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3694 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3695 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3696 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3697
3698 // SVE: to nxv16f32
3699 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3700 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3701 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3702 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3703
3704 // Complex: to v16f32
3705 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3706 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3707
3708 // Complex: to v2f64
3709 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3710 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3711 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3712 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3713 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3714 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3715
3716 // SVE: to nxv2f64
3717 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3718 SVE_EXT_COST + SVE_FCVT_COST},
3719 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3720 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3721 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3722 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3723 SVE_EXT_COST + SVE_FCVT_COST},
3724 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3725 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3726 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3727
3728 // Complex: to v4f64
3729 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3730 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3731
3732 // SVE: to nxv4f64
3733 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3734 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3735 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3736 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3737 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3738 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3739 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3740 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3741 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3742 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3743 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3744 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3745
3746 // SVE: to nxv8f64
3747 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3748 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3749 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3750 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3751 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3752 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3753 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3754 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3755
3756 // LowerVectorFP_TO_INT
3757 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3758 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3759 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3760 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3761 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3762 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3763
3764 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3765 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3766 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3767 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3768 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3769 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3770 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3771
3772 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3773 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3774 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3775 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3776 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3777
3778 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3779 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3780 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3781 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3782 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3783 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3784 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3785
3786 // Complex, from nxv2f32.
3787 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3788 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3789 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3790 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3791 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3792 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3793 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3794 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3795
3796 // Complex, from nxv2f64.
3797 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3798 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3799 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3800 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3801 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3802 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3803 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3804 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3805 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3806 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3807
3808 // Complex, from nxv4f32.
3809 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3810 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3811 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3812 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3813 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3814 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3815 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3816 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3817 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3818 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3819
3820 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3821 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3822 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3823 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3824 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3825
3826 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3827 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3828 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3829 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3830 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3831 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3832 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3833
3834 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3835 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3836 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3837 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3838 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3839
3840 // Complex, from nxv8f16.
3841 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3842 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3843 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3844 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3845 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3846 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3847 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3848 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3849 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3850 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3851
3852 // Complex, from nxv4f16.
3853 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3854 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3855 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3856 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3857 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3858 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3859 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3860 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3861
3862 // Complex, from nxv2f16.
3863 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3864 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3865 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3866 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3867 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3868 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3869 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3870 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3871
3872 // Truncate from nxvmf32 to nxvmf16.
3873 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3874 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3875 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3876
3877 // Truncate from nxvmf32 to nxvmbf16.
3878 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3879 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3880 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3881
3882 // Truncate from nxvmf64 to nxvmf16.
3883 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3884 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3885 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3886
3887 // Truncate from nxvmf64 to nxvmbf16.
3888 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3889 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3890 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3891
3892 // Truncate from nxvmf64 to nxvmf32.
3893 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3894 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3895 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3896
3897 // Extend from nxvmf16 to nxvmf32.
3898 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3899 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3900 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3901
3902 // Extend from nxvmbf16 to nxvmf32.
3903 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3904 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3905 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3906
3907 // Extend from nxvmf16 to nxvmf64.
3908 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3909 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3910 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3911
3912 // Extend from nxvmbf16 to nxvmf64.
3913 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3914 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3915 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3916
3917 // Extend from nxvmf32 to nxvmf64.
3918 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3919 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3920 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3921
3922 // Bitcasts from float to integer
3923 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3924 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3925 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3926
3927 // Bitcasts from integer to float
3928 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3929 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3930 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3931
3932 // Add cost for extending to illegal (too wide) scalable vectors.
3933 // zero/sign extend are implemented by multiple unpack operations,
3934 // where each operation has a cost of 1.
3935 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3936 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3937 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3938 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3939 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3940 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3941
3942 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3943 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3944 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3945 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3946 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3947 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3948 };
3949
3950 if (const auto *Entry = ConvertCostTableLookup(
3951 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3952 return Entry->Cost;
3953
3954 static const TypeConversionCostTblEntry FP16Tbl[] = {
3955 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3956 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3957 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3958 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3959 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3960 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3961 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3962 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3963 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3964 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3965 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3966 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3967 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3968 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3969 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3970 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3971 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3972 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3973 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3974 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3975 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3976 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3977 };
3978
3979 if (ST->hasFullFP16())
3980 if (const auto *Entry = ConvertCostTableLookup(
3981 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3982 return Entry->Cost;
3983
3984 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3985 // double-rounding issues.
3986 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3987 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3988 isa<FixedVectorType>(Dst) && isa<FixedVectorType>(Src))
3989 return cast<FixedVectorType>(Dst)->getNumElements() *
3990 getCastInstrCost(Opcode, Dst->getScalarType(),
3991 Src->getScalarType(), CCH, CostKind) +
3992 getScalarizationOverhead(cast<FixedVectorType>(Src), false,
3993 true, CostKind) +
3994 getScalarizationOverhead(cast<FixedVectorType>(Dst), true,
3995 false, CostKind);
3996
3997 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3998 CCH == TTI::CastContextHint::Masked &&
3999 ST->isSVEorStreamingSVEAvailable() &&
4000 TLI->getTypeAction(Src->getContext(), SrcTy) ==
4001 TargetLowering::TypePromoteInteger &&
4002 TLI->getTypeAction(Dst->getContext(), DstTy) ==
4003 TargetLowering::TypeSplitVector) {
4004 // The standard behaviour in the backend for these cases is to split the
4005 // extend up into two parts:
4006 // 1. Perform an extending load or masked load up to the legal type.
4007 // 2. Extend the loaded data to the final type.
4008 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
4009 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
4010 InstructionCost Part1 = getCastInstrCost(
4011 Opcode, LegalTy, Src, CCH, CostKind, I);
4012 InstructionCost Part2 = getCastInstrCost(
4013 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
4014 return Part1 + Part2;
4015 }
4016
4017 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
4018 // but we also want to include the TTI::CastContextHint::Masked case too.
4019 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
4020 CCH == TTI::CastContextHint::Masked &&
4021 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
4022 CCH = TTI::CastContextHint::Normal;
4023
4024 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
4025}
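// Usage sketch (illustrative): for
//   %e = zext <4 x i16> %x to <4 x i32>
//   %a = add <4 x i32> %y, %e
// querying getCastInstrCost(ZExt, <4 x i32>, <4 x i16>, ..., I = %e) can
// return 0 because the extend folds into a uaddw, whereas the same zext
// feeding a non-widening user is costed as a separate lengthening (ushll).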
4026
4027InstructionCost
4028AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
4029 VectorType *VecTy, unsigned Index,
4030 TTI::TargetCostKind CostKind) const {
4031
4032 // Make sure we were given a valid extend opcode.
4033 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
4034 "Invalid opcode");
4035
4036 // We are extending an element we extract from a vector, so the source type
4037 // of the extend is the element type of the vector.
4038 auto *Src = VecTy->getElementType();
4039
4040 // Sign- and zero-extends are for integer types only.
4041 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
4042
4043 // Get the cost for the extract. We compute the cost (if any) for the extend
4044 // below.
4045 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
4046 CostKind, Index, nullptr, nullptr);
4047
4048 // Legalize the types.
4049 auto VecLT = getTypeLegalizationCost(VecTy);
4050 auto DstVT = TLI->getValueType(DL, Dst);
4051 auto SrcVT = TLI->getValueType(DL, Src);
4052
4053 // If the resulting type is still a vector and the destination type is legal,
4054 // we may get the extension for free. If not, get the default cost for the
4055 // extend.
4056 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
4057 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4058 CostKind);
4059
4060 // The destination type should be larger than the element type. If not, get
4061 // the default cost for the extend.
4062 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
4063 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4064 CostKind);
4065
4066 switch (Opcode) {
4067 default:
4068 llvm_unreachable("Opcode should be either SExt or ZExt");
4069
4070 // For sign-extends, we only need a smov, which performs the extension
4071 // automatically.
4072 case Instruction::SExt:
4073 return Cost;
4074
4075 // For zero-extends, the extend is performed automatically by a umov unless
4076 // the destination type is i64 and the element type is i8 or i16.
4077 case Instruction::ZExt:
4078 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
4079 return Cost;
4080 }
4081
4082 // If we are unable to perform the extend for free, get the default cost.
4083 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4084 CostKind);
4085}
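// Example (illustrative): sext(extractelement <8 x i16> %v, i32 1) to i32
// costs only the extract, since an smov of the lane sign-extends for free;
// per the checks above, a zext of an i8/i16 element to i64 is the one case
// that still pays the default extend cost on top of the umov.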
4086
4087InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
4088 TTI::TargetCostKind CostKind,
4089 const Instruction *I) const {
4090 if (CostKind != TTI::TCK_RecipThroughput)
4091 return Opcode == Instruction::PHI ? 0 : 1;
4092 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
4093 // Branches are assumed to be predicted.
4094 return 0;
4095}
4096
4097InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
4098 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4099 const Instruction *I, Value *Scalar,
4100 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4101 TTI::VectorInstrContext VIC) const {
4102 assert(Val->isVectorTy() && "This must be a vector type");
4103
4104 if (Index != -1U) {
4105 // Legalize the type.
4106 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4107
4108 // This type is legalized to a scalar type.
4109 if (!LT.second.isVector())
4110 return 0;
4111
4112 // The type may be split. For fixed-width vectors we can normalize the
4113 // index to the new type.
4114 if (LT.second.isFixedLengthVector()) {
4115 unsigned Width = LT.second.getVectorNumElements();
4116 Index = Index % Width;
4117 }
4118
4119 // The element at index zero is already inside the vector.
4120 // - For an insert-element or extract-element instruction that
4121 // operates on integers, an explicit FPR -> GPR move is needed,
4122 // so it has non-zero cost.
4123 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
4124 return 0;
4125
4126 // This is recognising an LD1 "single-element structure to one lane of one
4127 // register" instruction. I.e., if this is an `insertelement` instruction
4128 // whose second operand is a load, then we will generate an LD1, which
4129 // is an expensive instruction on some uArchs.
4130 if (VIC == TTI::VectorInstrContext::Load) {
4131 if (ST->hasFastLD1Single())
4132 return 0;
4133 return CostKind == TTI::TCK_CodeSize
4134 ? 0
4135 : ST->getVectorInsertExtractBaseCost();
4136 }
4137
4138 // i1 inserts and extracts will include an extra cset or cmp of the vector
4139 // value. Increase the cost by 1 to account for this.
4140 if (Val->getScalarSizeInBits() == 1)
4141 return CostKind == TTI::TCK_CodeSize
4142 ? 2
4143 : ST->getVectorInsertExtractBaseCost() + 1;
4144
4145 // FIXME:
4146 // If the extract-element and insert-element instructions could be
4147 // simplified away (e.g., could be combined into users by looking at use-def
4148 // context), they have no cost. This is not done in the first place for
4149 // compile-time considerations.
4150 }
4151
4152 // In the case of NEON, if there exists an extractelement from lane != 0 such that
4153 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4154 // 2. extractelement result feeds into fmul.
4155 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4156 // equivalent to 0.
4157 // then the extractelement can be merged with fmul in the backend and it
4158 // incurs no cost.
4159 // e.g.
4160 // define double @foo(<2 x double> %a) {
4161 // %1 = extractelement <2 x double> %a, i32 0
4162 // %2 = extractelement <2 x double> %a, i32 1
4163 // %res = fmul double %1, %2
4164 // ret double %res
4165 // }
4166 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4167 auto ExtractCanFuseWithFmul = [&]() {
4168 // We bail out if the extract is from lane 0.
4169 if (Index == 0)
4170 return false;
4171
4172 // Check if the scalar element type of the vector operand of ExtractElement
4173 // instruction is one of the allowed types.
4174 auto IsAllowedScalarTy = [&](const Type *T) {
4175 return T->isFloatTy() || T->isDoubleTy() ||
4176 (T->isHalfTy() && ST->hasFullFP16());
4177 };
4178
4179 // Check if the extractelement user is scalar fmul.
4180 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4181 // Check if the user is scalar fmul.
4182 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4183 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4184 !BO->getType()->isVectorTy();
4185 };
4186
4187 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4188 // certain scalar type and a certain vector register width.
4189 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4190 auto RegWidth =
4191 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
4192 .getFixedValue();
4193 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4194 };
4195
4196 // Check if the type constraints on input vector type and result scalar type
4197 // of extractelement instruction are satisfied.
4198 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4199 return false;
4200
4201 if (Scalar) {
4202 DenseMap<User *, unsigned> UserToExtractIdx;
4203 for (auto *U : Scalar->users()) {
4204 if (!IsUserFMulScalarTy(U))
4205 return false;
4206 // Recording entry for the user is important. Index value is not
4207 // important.
4208 UserToExtractIdx[U];
4209 }
4210 if (UserToExtractIdx.empty())
4211 return false;
4212 for (auto &[S, U, L] : ScalarUserAndIdx) {
4213 for (auto *U : S->users()) {
4214 if (UserToExtractIdx.contains(U)) {
4215 auto *FMul = cast<BinaryOperator>(U);
4216 auto *Op0 = FMul->getOperand(0);
4217 auto *Op1 = FMul->getOperand(1);
4218 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4219 UserToExtractIdx[U] = L;
4220 break;
4221 }
4222 }
4223 }
4224 }
4225 for (auto &[U, L] : UserToExtractIdx) {
4226 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4227 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4228 return false;
4229 }
4230 } else {
4231 const auto *EE = cast<ExtractElementInst>(I);
4232
4233 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4234 if (!IdxOp)
4235 return false;
4236
4237 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4238 if (!IsUserFMulScalarTy(U))
4239 return false;
4240
4241 // Check if the other operand of extractelement is also extractelement
4242 // from lane equivalent to 0.
4243 const auto *BO = cast<BinaryOperator>(U);
4244 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4245 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4246 if (OtherEE) {
4247 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4248 if (!IdxOp)
4249 return false;
4250 return IsExtractLaneEquivalentToZero(
4251 cast<ConstantInt>(OtherEE->getIndexOperand())
4252 ->getValue()
4253 .getZExtValue(),
4254 OtherEE->getType()->getScalarSizeInBits());
4255 }
4256 return true;
4257 });
4258 }
4259 return true;
4260 };
4261
4262 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4263 ExtractCanFuseWithFmul())
4264 return 0;
4265
4266 // All other insert/extracts cost this much.
4267 return CostKind == TTI::TCK_CodeSize ? 1
4268 : ST->getVectorInsertExtractBaseCost();
4269}
4270
4271InstructionCost AArch64TTIImpl::getVectorInstrCost(
4272 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4273 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
4274 // Treat an insert at lane 0 into a poison vector as having zero cost. This
4275 // ensures vector broadcasts via an insert + shuffle (which will be lowered
4276 // to a single dup) are treated as cheap.
4277 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4278 isa<PoisonValue>(Op0))
4279 return 0;
4280 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr,
4281 nullptr, {}, VIC);
4282}
4283
4284InstructionCost AArch64TTIImpl::getVectorInstrCost(
4285 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4286 Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4287 TTI::VectorInstrContext VIC) const {
4288 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4289 ScalarUserAndIdx, VIC);
4290}
4291
4292InstructionCost
4293AArch64TTIImpl::getVectorInstrCost(const Instruction &I, Type *Val,
4294 TTI::TargetCostKind CostKind, unsigned Index,
4295 TTI::VectorInstrContext VIC) const {
4296 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I,
4297 nullptr, {}, VIC);
4298}
4299
4300InstructionCost
4301AArch64TTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
4302 TTI::TargetCostKind CostKind,
4303 unsigned Index) const {
4304 if (isa<FixedVectorType>(Val))
4305 return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
4306 Index);
4307
4308 // This typically requires both while and lastb instructions in order
4309 // to extract the last element. If this is in a loop the while
4310 // instruction can at least be hoisted out, although it will consume a
4311 // predicate register. The cost should be more expensive than the base
4312 // extract cost, which is 2 for most CPUs.
4313 return CostKind == TTI::TCK_CodeSize
4314 ? 2
4315 : ST->getVectorInsertExtractBaseCost() + 1;
4316}
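// Example (illustrative): extracting the last element of a <vscale x 4 x i32>
// typically needs a predicate-building while (or ptrue) plus a lastb,
// something like
//   whilels p0.s, xzr, x8
//   lastb   w0, p0, z0.s
// hence the +1 over the base insert/extract cost.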
4317
4318InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4319 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4320 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4321 TTI::VectorInstrContext VIC) const {
4322 if (isa<ScalableVectorType>(Ty))
4323 return InstructionCost::getInvalid();
4324 if (Ty->getElementType()->isFloatingPointTy())
4325 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4326 CostKind);
4327 unsigned VecInstCost =
4328 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4329 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4330}
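// Worked example (illustrative): scalarizing all four lanes of a <4 x i32>
// for both insertion and extraction, with a base insert/extract cost of 2,
// yields 4 * (1 + 1) * 2 = 16, whereas floating-point element types fall
// back to the generic BaseT estimate above.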
4331
4332std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4333 Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4334 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4335 std::function<InstructionCost(Type *)> InstCost) const {
4336 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4337 return std::nullopt;
4338 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4339 return std::nullopt;
4340 // If we have +sve-b16b16 the operation can be promoted to SVE.
4341 if (CanUseSVE && ST->hasSVEB16B16() && ST->isNonStreamingSVEorSME2Available())
4342 return std::nullopt;
4343
4344 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4345 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4346 TTI::CastContextHint::None, CostKind);
4347 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4348 Cost *= 2;
4349 Cost += InstCost(PromotedTy);
4350 if (IncludeTrunc)
4351 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4352 TTI::CastContextHint::None, CostKind);
4353 return Cost;
4354}
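// Sketch of the promotion being costed above (illustrative), for an fadd on
// <4 x half> without +fullfp16: both operands are extended, the arithmetic
// is done on f32, and the result is truncated back:
//   fcvtl v0.4s, v0.4h
//   fcvtl v1.4s, v1.4h
//   fadd  v0.4s, v0.4s, v1.4s
//   fcvtn v0.4h, v0.4s
// i.e. the FPExt cost (doubled when neither operand is constant), InstCost
// on the promoted type, plus one FPTrunc.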
4355
4356InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
4357 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4358 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
4359 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4360
4361 // The code-generator is currently not able to handle scalable vectors
4362 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4363 // it. This change will be removed when code-generation for these types is
4364 // sufficiently reliable.
4365 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4366 if (VTy->getElementCount() == ElementCount::getScalable(1))
4367 return InstructionCost::getInvalid();
4368
4369 // TODO: Handle more cost kinds.
4370 if (CostKind != TTI::TCK_RecipThroughput)
4371 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4372 Op2Info, Args, CxtI);
4373
4374 // Legalize the type.
4375 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4376 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4377
4378 // Increase the cost for half and bfloat types if not architecturally
4379 // supported.
4380 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4381 ISD == ISD::FDIV || ISD == ISD::FREM)
4382 if (auto PromotedCost = getFP16BF16PromoteCost(
4383 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4384 // There is no native support for fdiv/frem even with +sve-b16b16.
4385 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4386 [&](Type *PromotedTy) {
4387 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4388 Op1Info, Op2Info);
4389 }))
4390 return *PromotedCost;
4391
4392 // If the operation is a widening instruction (smull or umull) and both
4393 // operands are extends, the cost can be lower if we consider that the
4394 // operation will operate on the narrowest type size possible (double the
4395 // largest input size) plus a further extend.
4396 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4397 if (ExtTy != Ty)
4398 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4399 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4401 return LT.first;
4402 }
4403
4404 switch (ISD) {
4405 default:
4406 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4407 Op2Info);
4408 case ISD::SREM:
4409 case ISD::SDIV:
4410 /*
4411 Notes for sdiv/srem specific costs:
4412 1. This only considers the cases where the divisor is constant and uniform,
4413 whether pow-of-2 or not. Other cases are not important since they either
4414 result in some form of (ldr + adrp), corresponding to constant vectors, or
4415 scalarization of the division operation.
4416 2. Constant divisors, either negative in whole or partially, don't result in
4417 significantly different codegen as compared to positive constant divisors.
4418 So, we don't consider negative divisors separately.
4419 3. If the codegen is significantly different with SVE, it has been indicated
4420 using comments at appropriate places.
4421
4422 sdiv specific cases:
4423 -----------------------------------------------------------------------
4424 codegen | pow-of-2 | Type
4425 -----------------------------------------------------------------------
4426 add + cmp + csel + asr | Y | i64
4427 add + cmp + csel + asr | Y | i32
4428 -----------------------------------------------------------------------
4429
4430 srem specific cases:
4431 -----------------------------------------------------------------------
4432 codegen | pow-of-2 | Type
4433 -----------------------------------------------------------------------
4434 negs + and + and + csneg | Y | i64
4435 negs + and + and + csneg | Y | i32
4436 -----------------------------------------------------------------------
4437
4438 other sdiv/srem cases:
4439 -------------------------------------------------------------------------
4440 common codegen | + srem | + sdiv | pow-of-2 | Type
4441 -------------------------------------------------------------------------
4442 smulh + asr + add + add | - | - | N | i64
4443 smull + lsr + add + add | - | - | N | i32
4444 usra | and + sub | sshr | Y | <2 x i64>
4445 2 * (scalar code) | - | - | N | <2 x i64>
4446 usra | bic + sub | sshr + neg | Y | <4 x i32>
4447 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4448 + sshr + usra | | | |
4449 -------------------------------------------------------------------------
4450 */
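 /*
 Worked example (illustrative): for a scalar 'sdiv i32 %x, 8' the pow-of-2
 row applies (add + cmp + csel + asr), which the code below prices as
 3 * AddCost + AsrCost; the matching 'srem i32 %x, 8' (negs + and + and +
 csneg) is priced as 3 * AsrCost + AddCost.
 */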
4451 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4452 InstructionCost AddCost =
4453 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4454 Op1Info.getNoProps(), Op2Info.getNoProps());
4455 InstructionCost AsrCost =
4456 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4457 Op1Info.getNoProps(), Op2Info.getNoProps());
4458 InstructionCost MulCost =
4459 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4460 Op1Info.getNoProps(), Op2Info.getNoProps());
4461 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4462 // have similar cost.
4463 auto VT = TLI->getValueType(DL, Ty);
4464 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4465 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4466 // Neg can be folded into the asr instruction.
4467 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4468 : (3 * AsrCost + AddCost);
4469 } else {
4470 return MulCost + AsrCost + 2 * AddCost;
4471 }
4472 } else if (VT.isVector()) {
4473 InstructionCost UsraCost = 2 * AsrCost;
4474 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4475 // Division with scalable types corresponds to native 'asrd'
4476 // instruction when SVE is available.
4477 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4478
4479 // One more for the negation in SDIV.
4480 InstructionCost Cost =
4481 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4482 if (Ty->isScalableTy() && ST->hasSVE())
4483 Cost += 2 * AsrCost;
4484 else {
4485 Cost +=
4486 UsraCost +
4487 (ISD == ISD::SDIV
4488 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4489 : 2 * AddCost);
4490 }
4491 return Cost;
4492 } else if (LT.second == MVT::v2i64) {
4493 return VT.getVectorNumElements() *
4494 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4495 Op1Info.getNoProps(),
4496 Op2Info.getNoProps());
4497 } else {
4498 // When SVE is available, we get:
4499 // smulh + lsr + add/sub + asr + add/sub.
4500 if (Ty->isScalableTy() && ST->hasSVE())
4501 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4502 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4503 }
4504 }
4505 }
4506 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4507 LT.second.isFixedLengthVector()) {
4508 // FIXME: When the constant vector is non-uniform, this may result in
4509 // loading the vector from constant pool or in some cases, may also result
4510 // in scalarization. For now, we are approximating this with the
4511 // scalarization cost.
4512 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4513 CostKind, -1, nullptr, nullptr);
4514 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4515 CostKind, -1, nullptr, nullptr);
4516 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4517 return ExtractCost + InsertCost +
4518 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4519 CostKind, Op1Info.getNoProps(),
4520 Op2Info.getNoProps());
4521 }
4522 [[fallthrough]];
4523 case ISD::UDIV:
4524 case ISD::UREM: {
4525 auto VT = TLI->getValueType(DL, Ty);
4526 if (Op2Info.isConstant()) {
4527 // If the operand is a power of 2 we can use the shift or and cost.
4528 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4529 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4530 Op1Info.getNoProps(),
4531 Op2Info.getNoProps());
4532 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4533 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4534 Op1Info.getNoProps(),
4535 Op2Info.getNoProps());
4536
4537 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4538 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4539 // The MULHU will be expanded to UMULL for the types not listed below,
4540 // and will become a pair of UMULL+UMULL2 for 128-bit vectors.
4541 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4542 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4543 LT.second == MVT::nxv16i8;
4544 bool Is128bit = LT.second.is128BitVector();
4545
4546 InstructionCost MulCost =
4547 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4548 Op1Info.getNoProps(), Op2Info.getNoProps());
4549 InstructionCost AddCost =
4550 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4551 Op1Info.getNoProps(), Op2Info.getNoProps());
4552 InstructionCost ShrCost =
4553 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4554 Op1Info.getNoProps(), Op2Info.getNoProps());
4555 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4556 (HasMULH ? 0 : ShrCost) + // UMULL shift
4557 AddCost * 2 + ShrCost;
4558 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4559 }
4560 }
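 // Illustrative example (hypothetical IR, not from the source):
 // %r = udiv i64 %x, 10. HasMULH holds for i64, so
 // DivCost = MulCost + 2 * AddCost + ShrCost; a urem additionally pays
 // MulCost + AddCost for the multiply-subtract reconstructing the remainder.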
4561
4562 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4563 // emitted by the backend even when those functions are not declared in the
4564 // module.
4565 if (!VT.isVector() && VT.getSizeInBits() > 64)
4566 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4567
4568 InstructionCost Cost = BaseT::getArithmeticInstrCost(
4569 Opcode, Ty, CostKind, Op1Info, Op2Info);
4570 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4571 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4572 // SDIV/UDIV operations are lowered using SVE, so the cost can be
4573 // lower.
4574 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4575 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4576 static const CostTblEntry DivTbl[]{
4577 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4578 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4579 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4580 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4581 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4582 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4583
4584 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4585 if (nullptr != Entry)
4586 return Entry->Cost;
4587 }
4588 // For 8/16-bit elements, the cost is higher because the type
4589 // requires promotion and possibly splitting:
4590 if (LT.second.getScalarType() == MVT::i8)
4591 Cost *= 8;
4592 else if (LT.second.getScalarType() == MVT::i16)
4593 Cost *= 4;
4594 return Cost;
4595 } else {
4596 // If one of the operands is a uniform constant then the cost for each
4597 // element is Cost for insertion, extraction and division.
4598 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4599 // operation with scalar type
4600 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4601 (Op2Info.isConstant() && Op2Info.isUniform())) {
4602 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4604 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4605 return (4 + DivCost) * VTy->getNumElements();
4606 }
4607 }
4608 // On AArch64, without SVE, vector divisions are expanded
4609 // into scalar divisions of each pair of elements.
4610 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4611 -1, nullptr, nullptr);
4612 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4613 nullptr, nullptr);
4614 }
4615
4616 // TODO: if one of the arguments is scalar, then it's not necessary to
4617 // double the cost of handling the vector elements.
4618 Cost += Cost;
4619 }
4620 return Cost;
4621 }
4622 case ISD::MUL:
4623 // When SVE is available, we can lower the v2i64 operation using
4624 // the SVE mul instruction, which has a lower cost.
4625 if (LT.second == MVT::v2i64 && ST->hasSVE())
4626 return LT.first;
4627
4628 // When SVE is not available, there is no MUL.2d instruction,
4629 // which means mul <2 x i64> is expensive as elements are extracted
4630 // from the vectors and the muls scalarized.
4631 // As getScalarizationOverhead is a bit too pessimistic, we
4632 // estimate the cost for a i64 vector directly here, which is:
4633 // - four 2-cost i64 extracts,
4634 // - two 2-cost i64 inserts, and
4635 // - two 1-cost muls.
4636 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4637 // LT.first = 2 the cost is 28.
4638 if (LT.second != MVT::v2i64)
4639 return LT.first;
4640 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4641 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4642 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4643 nullptr, nullptr) *
4644 2 +
4645 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4646 nullptr, nullptr));
4647 case ISD::ADD:
4648 case ISD::XOR:
4649 case ISD::OR:
4650 case ISD::AND:
4651 case ISD::SRL:
4652 case ISD::SRA:
4653 case ISD::SHL:
4654 // These nodes are marked as 'custom' for combining purposes only.
4655 // We know that they are legal. See LowerAdd in ISelLowering.
4656 return LT.first;
4657
4658 case ISD::FNEG:
4659 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4660 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4661 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4662 CxtI &&
4663 ((CxtI->hasOneUse() &&
4664 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4665 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4666 return 0;
4667 [[fallthrough]];
4668 case ISD::FADD:
4669 case ISD::FSUB:
4670 if (!Ty->getScalarType()->isFP128Ty())
4671 return LT.first;
4672 [[fallthrough]];
4673 case ISD::FMUL:
4674 case ISD::FDIV:
4675 // These nodes are marked as 'custom' just to lower them to SVE.
4676 // We know said lowering will incur no additional cost.
4677 if (!Ty->getScalarType()->isFP128Ty())
4678 return 2 * LT.first;
4679
4680 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4681 Op2Info);
4682 case ISD::FREM:
4683 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4684 // those functions are not declared in the module.
4685 if (!Ty->isVectorTy())
4686 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4687 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4688 Op2Info);
4689 }
4690}
4691
4692 InstructionCost
4693 AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4694 const SCEV *Ptr,
4695 TTI::TargetCostKind CostKind) const {
4696 // Address computations in vectorized code with non-consecutive addresses will
4697 // likely result in more instructions compared to scalar code where the
4698 // computation can more often be merged into the index mode. The resulting
4699 // extra micro-ops can significantly decrease throughput.
4700 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4701 int MaxMergeDistance = 64;
4702
4703 if (PtrTy->isVectorTy() && SE &&
4704 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4705 return NumVectorInstToHideOverhead;
4706
4707 // In many cases the address computation is not merged into the instruction
4708 // addressing mode.
4709 return 1;
4710}
4711
4712/// Check whether Opcode1 has lower throughput than Opcode2 according to
4713/// the scheduling model.
4714 bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4715 unsigned Opcode1, unsigned Opcode2) const {
4716 const MCSchedModel &Sched = ST->getSchedModel();
4717 const TargetInstrInfo *TII = ST->getInstrInfo();
4718 if (!Sched.hasInstrSchedModel())
4719 return false;
4720
4721 const MCSchedClassDesc *SCD1 =
4722 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4723 const MCSchedClassDesc *SCD2 =
4724 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4725 // We cannot handle variant scheduling classes without an MI. If we ever
4726 // need to support them for any of the instructions we query, we might need
4727 // to add a way to resolve them without an MI, or stop using the scheduling
4728 // info.
4729 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4730 "Cannot handle variant scheduling classes without an MI");
4731 if (!SCD1->isValid() || !SCD2->isValid())
4732 return false;
4733
4734 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4735 MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
4736}
4737
4738 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4739 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4740 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4741 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4742 // We don't lower some vector selects well that are wider than the register
4743 // width. TODO: Improve this with different cost kinds.
4744 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4745 // We would need this many instructions to hide the scalarization happening.
4746 const int AmortizationCost = 20;
4747
4748 // If VecPred is not set, check if we can get a predicate from the context
4749 // instruction, if its type matches the requested ValTy.
4750 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4751 CmpPredicate CurrentPred;
4752 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4753 m_Value())))
4754 VecPred = CurrentPred;
4755 }
4756 // Check if we have a compare/select chain that can be lowered using
4757 // a (F)CMxx & BFI pair.
4758 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4759 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4760 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4761 VecPred == CmpInst::FCMP_UNE) {
4762 static const auto ValidMinMaxTys = {
4763 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4764 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4765 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4766
4767 auto LT = getTypeLegalizationCost(ValTy);
4768 if (any_of(ValidMinMaxTys, equal_to(LT.second)) ||
4769 (ST->hasFullFP16() &&
4770 any_of(ValidFP16MinMaxTys, equal_to(LT.second))))
4771 return LT.first;
4772 }
4773
4774 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4775 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4776 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4777 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4778 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4779 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4780 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4781 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4782 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4783 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4784 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4785 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4786
4787 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4788 EVT SelValTy = TLI->getValueType(DL, ValTy);
4789 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4790 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4791 SelCondTy.getSimpleVT(),
4792 SelValTy.getSimpleVT()))
4793 return Entry->Cost;
4794 }
4795 }
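 // Illustrative example (hypothetical IR, not from the source):
 // select <16 x i1> %c, <16 x i64> %a, <16 x i64> %b matches the
 // v16i1/v16i64 entry above and costs 16 * AmortizationCost = 320,
 // strongly discouraging scalarized wide selects.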
4796
4797 if (Opcode == Instruction::FCmp) {
4798 if (auto PromotedCost = getFP16BF16PromoteCost(
4799 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4800 // TODO: Consider costing SVE FCMPs.
4801 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4802 InstructionCost Cost =
4803 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4804 CostKind, Op1Info, Op2Info);
4805 if (isa<VectorType>(PromotedTy))
4806 Cost += getCastInstrCost(
4807 Instruction::Trunc,
4808 VectorType::getInteger(cast<VectorType>(ValTy)),
4809 VectorType::getInteger(cast<VectorType>(PromotedTy)),
4810 TTI::CastContextHint::None, CostKind);
4811 return Cost;
4812 }))
4813 return *PromotedCost;
4814
4815 auto LT = getTypeLegalizationCost(ValTy);
4816 // Model unknown fp compares as a libcall.
4817 if (LT.second.getScalarType() != MVT::f64 &&
4818 LT.second.getScalarType() != MVT::f32 &&
4819 LT.second.getScalarType() != MVT::f16)
4820 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4821 {ValTy, ValTy}, CostKind);
4822
4823 // Some comparison operators require expanding to multiple compares + or.
4824 unsigned Factor = 1;
4825 if (!CondTy->isVectorTy() &&
4826 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4827 Factor = 2; // fcmp with 2 selects
4828 else if (isa<FixedVectorType>(ValTy) &&
4829 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4830 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4831 Factor = 3; // fcmxx+fcmyy+or
4832 else if (isa<ScalableVectorType>(ValTy) &&
4833 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4834 Factor = 3; // fcmxx+fcmyy+or
4835
4836 if (isa<ScalableVectorType>(ValTy) &&
4837 CostKind == TTI::TCK_RecipThroughput &&
4838 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4839 AArch64::FCMEQv4f32))
4840 Factor *= 2;
4841
4842 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4843 }
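 // Illustrative example (hypothetical IR, not from the source):
 // fcmp one <4 x float> %a, %b needs fcmgt+fcmgt+orr, so Factor = 3 and the
 // returned throughput cost is 3 * LT.first = 3, while fcmp oeq maps to a
 // single fcmeq with cost 1.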
4844
4845 // Treat the icmp in icmp(and, 0), or in icmp(and, -1/1) when it can be
4846 // folded to icmp(and, 0), as free, since we can make use of ands; but only
4847 // if the comparison is not unsigned. FIXME: Enable for non-throughput cost
4848 // kinds, providing it will not cause performance regressions.
4849 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4850 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4851 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4852 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4853 if (match(I->getOperand(1), m_Zero()))
4854 return 0;
4855
4856 // x >= 1 / x < 1 -> x > 0 / x <= 0
4857 if (match(I->getOperand(1), m_One()) &&
4858 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4859 return 0;
4860
4861 // x <= -1 / x > -1 -> x > 0 / x <= 0
4862 if (match(I->getOperand(1), m_AllOnes()) &&
4863 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4864 return 0;
4865 }
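 // Illustrative example (hypothetical IR, not from the source):
 //   %a = and i64 %x, %y
 //   %c = icmp eq i64 %a, 0
 // folds to a single ands/tst, so the icmp above is modelled as free.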
4866
4867 // The base case handles scalable vectors fine for now, since it treats the
4868 // cost as 1 * legalization cost.
4869 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4870 Op1Info, Op2Info, I);
4871}
4872
4873 TTI::MemCmpExpansionOptions
4874 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4875 TTI::MemCmpExpansionOptions Options;
4876 if (ST->requiresStrictAlign()) {
4877 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4878 // a bunch of instructions when strict align is enabled.
4879 return Options;
4880 }
4881 Options.AllowOverlappingLoads = true;
4882 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4883 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4884 // TODO: Though vector loads usually perform well on AArch64, in some targets
4885 // they may wake up the FP unit, which raises the power consumption. Perhaps
4886 // they could be used with no holds barred (-O3).
4887 Options.LoadSizes = {8, 4, 2, 1};
4888 Options.AllowedTailExpansions = {3, 5, 6};
4889 return Options;
4890}
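// Illustrative example (not from the source): with AllowOverlappingLoads and
// LoadSizes = {8, 4, 2, 1}, a memcmp(a, b, 15) can expand into two pairs of
// overlapping 8-byte loads (offsets 0 and 7) plus compares, avoiding a
// libcall.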
4891
4892 bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4893 return ST->hasSVE();
4894}
4895
4896 InstructionCost
4897 AArch64TTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
4898 TTI::TargetCostKind CostKind) const {
4899 switch (MICA.getID()) {
4900 case Intrinsic::masked_scatter:
4901 case Intrinsic::masked_gather:
4902 return getGatherScatterOpCost(MICA, CostKind);
4903 case Intrinsic::masked_load:
4904 case Intrinsic::masked_expandload:
4905 case Intrinsic::masked_store:
4906 return getMaskedMemoryOpCost(MICA, CostKind);
4907 }
4908 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
4909}
4910
4911 InstructionCost
4912 AArch64TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
4913 TTI::TargetCostKind CostKind) const {
4914 Type *Src = MICA.getDataType();
4915
4916 if (useNeonVector(Src))
4917 return BaseT::getMaskedMemoryOpCost(MICA, CostKind);
4918 auto LT = getTypeLegalizationCost(Src);
4919 if (!LT.first.isValid())
4920 return InstructionCost::getInvalid();
4921
4922 // Return an invalid cost for element types that we are unable to lower.
4923 auto *VT = cast<VectorType>(Src);
4924 if (VT->getElementType()->isIntegerTy(1))
4925 return InstructionCost::getInvalid();
4926
4927 // The code-generator is currently not able to handle scalable vectors
4928 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4929 // it. This change will be removed when code-generation for these types is
4930 // sufficiently reliable.
4931 if (VT->getElementCount() == ElementCount::getScalable(1))
4932 return InstructionCost::getInvalid();
4933
4934 InstructionCost MemOpCost = LT.first;
4935 if (MICA.getID() == Intrinsic::masked_expandload) {
4936 if (!isLegalMaskedExpandLoad(Src, MICA.getAlignment()))
4937 return InstructionCost::getInvalid();
4938
4939 // Operation will be split into expand of masked.load
4940 MemOpCost *= 2;
4941 }
4942
4943 // If we need to split the memory operation, we will also need to split the
4944 // mask. This will likely lead to overestimating the cost in some cases if
4945 // multiple memory operations use the same mask, but we often don't have
4946 // enough context to figure that out here.
4947 //
4948 // If the elements being loaded are bytes then the mask will already be split,
4949 // since the number of bits in a P register matches the number of bytes in a
4950 // Z register.
4951 if (LT.first > 1 && LT.second.getScalarSizeInBits() > 8)
4952 return MemOpCost * 2;
4953
4954 return MemOpCost;
4955}
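// Illustrative example (hypothetical IR, not from the source): a masked load
// of <vscale x 8 x i32> legalizes into two parts (LT.first == 2) with i32
// (> 8-bit) elements, so the predicate must be split as well and the
// returned cost is MemOpCost * 2 == 4.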
4956
4957// This function returns the gather/scatter overhead, either from the
4958// user-provided value or from per-target specialized values in \p ST.
4959static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4960 const AArch64Subtarget *ST) {
4961 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4962 "Should only be called on loads or stores.");
4963 switch (Opcode) {
4964 case Instruction::Load:
4965 if (SVEGatherOverhead.getNumOccurrences() > 0)
4966 return SVEGatherOverhead;
4967 return ST->getGatherOverhead();
4968 break;
4969 case Instruction::Store:
4970 if (SVEScatterOverhead.getNumOccurrences() > 0)
4971 return SVEScatterOverhead;
4972 return ST->getScatterOverhead();
4973 break;
4974 default:
4975 llvm_unreachable("Shouldn't have reached here");
4976 }
4977}
4978
4979 InstructionCost
4980 AArch64TTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
4981 TTI::TargetCostKind CostKind) const {
4982
4983 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
4984 MICA.getID() == Intrinsic::vp_gather)
4985 ? Instruction::Load
4986 : Instruction::Store;
4987
4988 Type *DataTy = MICA.getDataType();
4989 Align Alignment = MICA.getAlignment();
4990 const Instruction *I = MICA.getInst();
4991
4992 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4993 return BaseT::getGatherScatterOpCost(MICA, CostKind);
4994 auto *VT = cast<VectorType>(DataTy);
4995 auto LT = getTypeLegalizationCost(DataTy);
4996 if (!LT.first.isValid())
4997 return InstructionCost::getInvalid();
4998
4999 // Return an invalid cost for element types that we are unable to lower.
5000 if (!LT.second.isVector() ||
5001 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
5002 VT->getElementType()->isIntegerTy(1))
5003 return InstructionCost::getInvalid();
5004
5005 // The code-generator is currently not able to handle scalable vectors
5006 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5007 // it. This change will be removed when code-generation for these types is
5008 // sufficiently reliable.
5009 if (VT->getElementCount() == ElementCount::getScalable(1))
5010 return InstructionCost::getInvalid();
5011
5012 ElementCount LegalVF = LT.second.getVectorElementCount();
5013 InstructionCost MemOpCost =
5014 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
5015 {TTI::OK_AnyValue, TTI::OP_None}, I);
5016 // Add on an overhead cost for using gathers/scatters.
5017 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
5018 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
5019}
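// Illustrative example (hypothetical IR, not from the source): a masked
// gather of <vscale x 4 x i32> with the default sve-gather-overhead of 10
// costs roughly scalar-load-cost * 10 * max-elements-per-vector, making
// gathers far more expensive than contiguous loads.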
5020
5022 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
5023}
5024
5025 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
5026 Align Alignment,
5027 unsigned AddressSpace,
5028 TTI::TargetCostKind CostKind,
5029 TTI::OperandValueInfo OpInfo,
5030 const Instruction *I) const {
5031 EVT VT = TLI->getValueType(DL, Ty, true);
5032 // Type legalization can't handle structs
5033 if (VT == MVT::Other)
5034 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
5035 CostKind);
5036
5037 auto LT = getTypeLegalizationCost(Ty);
5038 if (!LT.first.isValid())
5039 return InstructionCost::getInvalid();
5040
5041 // The code-generator is currently not able to handle scalable vectors
5042 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5043 // it. This change will be removed when code-generation for these types is
5044 // sufficiently reliable.
5045 // We also only support full register predicate loads and stores.
5046 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5047 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
5048 (VTy->getElementType()->isIntegerTy(1) &&
5049 !VTy->getElementCount().isKnownMultipleOf(
5050 ElementCount::getScalable(16))))
5051 return InstructionCost::getInvalid();
5052
5053 // TODO: consider latency as well for TCK_SizeAndLatency.
5054 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
5055 return LT.first;
5056
5057 if (CostKind != TTI::TCK_RecipThroughput)
5058 return 1;
5059
5060 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
5061 LT.second.is128BitVector() && Alignment < Align(16)) {
5062 // Unaligned stores are extremely inefficient. We don't split all
5063 // unaligned 128-bit stores because of the negative impact that splitting
5064 // has shown in practice on inlined block copy code.
5065 // We make such stores expensive so that we will only vectorize if there
5066 // are 6 other instructions getting vectorized.
5067 const int AmortizationCost = 6;
5068
5069 return LT.first * 2 * AmortizationCost;
5070 }
5071
5072 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
5073 if (Ty->isPtrOrPtrVectorTy())
5074 return LT.first;
5075
5076 if (useNeonVector(Ty)) {
5077 // Check truncating stores and extending loads.
5078 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
5079 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
5080 if (VT == MVT::v4i8)
5081 return 2;
5082 // Otherwise we need to scalarize.
5083 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
5084 }
5085 EVT EltVT = VT.getVectorElementType();
5086 unsigned EltSize = EltVT.getScalarSizeInBits();
5087 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
5088 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
5089 return LT.first;
5090 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
5091 // widening to v4i8, which produces suboptimal results.
5092 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
5093 return LT.first;
5094
5095 // Check non-power-of-2 loads/stores for legal vector element types with
5096 // NEON. Non-power-of-2 memory ops will get broken down to a set of
5097 // operations on smaller power-of-2 ops, including ld1/st1.
5098 LLVMContext &C = Ty->getContext();
5099 InstructionCost Cost = 0;
5100 SmallVector<EVT> TypeWorklist;
5101 TypeWorklist.push_back(VT);
5102 while (!TypeWorklist.empty()) {
5103 EVT CurrVT = TypeWorklist.pop_back_val();
5104 unsigned CurrNumElements = CurrVT.getVectorNumElements();
5105 if (isPowerOf2_32(CurrNumElements)) {
5106 Cost += 1;
5107 continue;
5108 }
5109
5110 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
5111 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
5112 TypeWorklist.push_back(
5113 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
5114 }
5115 return Cost;
5116 }
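 // Illustrative example (hypothetical IR, not from the source): an align-1
 // store of <7 x i8> is decomposed 7 -> 4 + (2 + 1), i.e. three power-of-2
 // st1 operations, so the worklist above returns a cost of 3.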
5117
5118 return LT.first;
5119}
5120
5122 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
5123 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5124 bool UseMaskForCond, bool UseMaskForGaps) const {
5125 assert(Factor >= 2 && "Invalid interleave factor");
5126 auto *VecVTy = cast<VectorType>(VecTy);
5127
5128 if (VecTy->isScalableTy() && !ST->hasSVE())
5129 return InstructionCost::getInvalid();
5130
5131 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
5132 // only have lowering for power-of-2 factors.
5133 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
5134 // InterleavedAccessPass for ld3/st3
5135 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
5136 return InstructionCost::getInvalid();
5137
5138 // Vectorization for masked interleaved accesses is only enabled for scalable
5139 // VF.
5140 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
5141 return InstructionCost::getInvalid();
5142
5143 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5144 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
5145 auto *SubVecTy =
5146 VectorType::get(VecVTy->getElementType(),
5147 VecVTy->getElementCount().divideCoefficientBy(Factor));
5148
5149 // ldN/stN only support legal vector types of size 64 or 128 in bits.
5150 // Accesses having vector types that are a multiple of 128 bits can be
5151 // matched to more than one ldN/stN instruction.
5152 bool UseScalable;
5153 if (MinElts % Factor == 0 &&
5154 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
5155 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
5156 }
5157
5158 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5159 Alignment, AddressSpace, CostKind,
5160 UseMaskForCond, UseMaskForGaps);
5161}
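// Illustrative example (hypothetical IR, not from the source): an ld3-style
// interleaved load of <12 x i32> (Factor = 3) uses SubVecTy = <4 x i32>, a
// single legal 128-bit access, so the returned cost is 3 * 1 = 3.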
5162
5163 InstructionCost
5164 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
5165 InstructionCost Cost = 0;
5166 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5167 for (auto *I : Tys) {
5168 if (!I->isVectorTy())
5169 continue;
5170 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
5171 128)
5172 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
5173 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
5174 }
5175 return Cost;
5176}
5177
5178 bool AArch64TTIImpl::isLegalMaskedExpandLoad(Type *DataTy,
5179 Align Alignment) const {
5180 // Neon types should be scalarised when we are not choosing to use SVE.
5181 if (useNeonVector(DataTy))
5182 return false;
5183
5184 // Return true only if we are able to lower using the SVE2p2/SME2p2
5185 // expand instruction.
5186 return (ST->isSVEAvailable() && ST->hasSVE2p2()) ||
5187 (ST->isSVEorStreamingSVEAvailable() && ST->hasSME2p2());
5188}
5189
5190 unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
5191 return ST->getMaxInterleaveFactor();
5192}
5193
5194// For Falkor, we want to avoid having too many strided loads in a loop since
5195// that can exhaust the HW prefetcher resources. We adjust the unroller
5196// MaxCount preference below to attempt to ensure unrolling doesn't create too
5197// many strided loads.
5198 static void
5199 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5200 TTI::UnrollingPreferences &UP) {
5201 enum { MaxStridedLoads = 7 };
5202 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5203 int StridedLoads = 0;
5204 // FIXME? We could make this more precise by looking at the CFG and
5205 // e.g. not counting loads in each side of an if-then-else diamond.
5206 for (const auto BB : L->blocks()) {
5207 for (auto &I : *BB) {
5208 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5209 if (!LMemI)
5210 continue;
5211
5212 Value *PtrValue = LMemI->getPointerOperand();
5213 if (L->isLoopInvariant(PtrValue))
5214 continue;
5215
5216 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5217 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
5218 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5219 continue;
5220
5221 // FIXME? We could take pairing of unrolled load copies into account
5222 // by looking at the AddRec, but we would probably have to limit this
5223 // to loops with no stores or other memory optimization barriers.
5224 ++StridedLoads;
5225 // We've seen enough strided loads that seeing more won't make a
5226 // difference.
5227 if (StridedLoads > MaxStridedLoads / 2)
5228 return StridedLoads;
5229 }
5230 }
5231 return StridedLoads;
5232 };
5233
5234 int StridedLoads = countStridedLoads(L, SE);
5235 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5236 << " strided loads\n");
5237 // Pick the largest power of 2 unroll count that won't result in too many
5238 // strided loads.
5239 if (StridedLoads) {
5240 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5241 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5242 << UP.MaxCount << '\n');
5243 }
5244}
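// Illustrative example (not from the source): a loop body with 3 strided
// loads gets UP.MaxCount = 1 << Log2_32(7 / 3) = 2, halving the unroll count
// to stay within the prefetcher's capacity.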
5245
5246// This function returns true if the loop:
5247// 1. Has a valid cost, and
5248// 2. Has a cost within the supplied budget.
5249// Otherwise it returns false.
5250 static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
5251 InstructionCost Budget,
5252 unsigned *FinalSize) {
5253 // Estimate the size of the loop.
5254 InstructionCost LoopCost = 0;
5255
5256 for (auto *BB : L->getBlocks()) {
5257 for (auto &I : *BB) {
5258 SmallVector<const Value *, 4> Operands(I.operand_values());
5259 InstructionCost Cost =
5260 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5261 // This can happen with intrinsics that don't currently have a cost model
5262 // or for some operations that require SVE.
5263 if (!Cost.isValid())
5264 return false;
5265
5266 LoopCost += Cost;
5267 if (LoopCost > Budget)
5268 return false;
5269 }
5270 }
5271
5272 if (FinalSize)
5273 *FinalSize = LoopCost.getValue();
5274 return true;
5275}
5276
5277 static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
5278 const AArch64TTIImpl &TTI) {
5279 // Only consider loops with unknown trip counts for which we can determine
5280 // a symbolic expression. Multi-exit loops with small known trip counts will
5281 // likely be unrolled anyway.
5282 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5283 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
5284 return false;
5285
5286 // It might not be worth unrolling loops with low max trip counts. Restrict
5287 // this to max trip counts > 32 for now.
5288 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5289 if (MaxTC > 0 && MaxTC <= 32)
5290 return false;
5291
5292 // Make sure the loop size is <= 5.
5293 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5294 return false;
5295
5296 // Small search loops with multiple exits can be highly beneficial to unroll.
5297 // We only care about loops with exactly two exiting blocks, although each
5298 // block could jump to the same exit block.
5299 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5300 if (Blocks.size() != 2)
5301 return false;
5302
5303 if (any_of(Blocks, [](BasicBlock *BB) {
5304 return !isa<BranchInst>(BB->getTerminator());
5305 }))
5306 return false;
5307
5308 return true;
5309}
5310
5311/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5312/// OOO engine's wide instruction window and various predictors.
5313static void
5316 const AArch64TTIImpl &TTI) {
5317 // Limit loops with structure that is highly likely to benefit from runtime
5318 // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
5319 // likely with complex control flow). Note that the heuristics here may be
5320 // overly conservative and we err on the side of avoiding runtime unrolling
5321 // rather than unrolling excessively. They are all subject to further refinement.
5322 if (!L->isInnermost() || L->getNumBlocks() > 8)
5323 return;
5324
5325 // Loops with multiple exits are handled by common code.
5326 if (!L->getExitBlock())
5327 return;
5328
5329 // Check if the loop contains any reductions that could be parallelized when
5330 // unrolling. If so, enable partial unrolling if the trip count is known to
5331 // be a multiple of 2.
5332 bool HasParallelizableReductions =
5333 L->getNumBlocks() == 1 &&
5334 any_of(L->getHeader()->phis(),
5335 [&SE, L](PHINode &Phi) {
5336 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5337 }) &&
5338 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5339 if (HasParallelizableReductions &&
5340 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5341 UP.Partial = true;
5342 UP.MaxCount = 4;
5343 UP.AddAdditionalAccumulators = true;
5344 }
5345
5346 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5347 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
5348 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5349 SE.getSmallConstantMaxTripCount(L) <= 32))
5350 return;
5351
5352 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5353 return;
5354
5355 if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
5356 return;
5357
5358 // Limit to loops with trip counts that are cheap to expand.
5359 UP.SCEVExpansionBudget = 1;
5360
5361 if (HasParallelizableReductions) {
5362 UP.Runtime = true;
5363 UP.DefaultUnrollRuntimeCount = 4;
5364 UP.AddAdditionalAccumulators = true;
5365 }
5366
5367 // Try to unroll small loops of a few blocks with a low budget, if they
5368 // have load/store dependencies (to expose more parallel memory access
5369 // streams) or if they do little work inside a block (load -> X -> store).
5370 BasicBlock *Header = L->getHeader();
5371 BasicBlock *Latch = L->getLoopLatch();
5372 if (Header == Latch) {
5373 // Estimate the size of the loop.
5374 unsigned Size;
5375 unsigned Width = 10;
5376 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5377 return;
5378
5379 // Try to find an unroll count that maximizes the use of the instruction
5380 // window, i.e. trying to fetch as many instructions per cycle as possible.
5381 unsigned MaxInstsPerLine = 16;
5382 unsigned UC = 1;
5383 unsigned BestUC = 1;
5384 unsigned SizeWithBestUC = BestUC * Size;
5385 while (UC <= 8) {
5386 unsigned SizeWithUC = UC * Size;
5387 if (SizeWithUC > 48)
5388 break;
5389 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5390 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5391 BestUC = UC;
5392 SizeWithBestUC = BestUC * Size;
5393 }
5394 UC++;
5395 }
5396
5397 if (BestUC == 1)
5398 return;
5399
5400 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5401 SmallVector<StoreInst *> Stores;
5402 for (auto *BB : L->blocks()) {
5403 for (auto &I : *BB) {
5404 Value *Ptr = getLoadStorePointerOperand(&I);
5405 if (!Ptr)
5406 continue;
5407 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5408 if (SE.isLoopInvariant(PtrSCEV, L))
5409 continue;
5410 if (isa<LoadInst>(&I)) {
5411 LoadedValuesPlus.insert(&I);
5412 // Include in-loop 1st users of loaded values.
5413 for (auto *U : I.users())
5414 if (L->contains(cast<Instruction>(U)))
5415 LoadedValuesPlus.insert(U);
5416 } else
5417 Stores.push_back(cast<StoreInst>(&I));
5418 }
5419 }
5420
5421 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5422 return LoadedValuesPlus.contains(SI->getOperand(0));
5423 }))
5424 return;
5425
5426 UP.Runtime = true;
5427 UP.DefaultUnrollRuntimeCount = BestUC;
5428 return;
5429 }
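 // Illustrative example (not from the source): for a single-block loop of
 // size 10, the search above settles on BestUC = 3 (30 instructions leaves
 // the largest remainder, 14, modulo the 16-instruction fetch line while
 // staying under the 48-instruction cap), so the loop is runtime-unrolled
 // 3x provided some loaded value (or a first user of one) feeds a store.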
5430
5431 // Try to runtime-unroll loops with early-continues depending on loop-varying
5432 // loads; this helps with branch-prediction for the early-continues.
5433 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5434 SmallVector<BasicBlock *> Preds(predecessors(Latch));
5435 if (!Term || Preds.size() == 1 || !llvm::is_contained(Preds, Header) ||
5436 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5437 return;
5438
5439 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5440 [&](Instruction *I, unsigned Depth) -> bool {
5441 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5442 return false;
5443
5444 if (isa<LoadInst>(I))
5445 return true;
5446
5447 return any_of(I->operands(), [&](Value *V) {
5448 auto *I = dyn_cast<Instruction>(V);
5449 return I && DependsOnLoopLoad(I, Depth + 1);
5450 });
5451 };
5452 CmpPredicate Pred;
5453 Instruction *I;
5454 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5455 m_Value())) &&
5456 DependsOnLoopLoad(I, 0)) {
5457 UP.Runtime = true;
5458 }
5459}
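// Illustrative example (not from the source): a loop containing
//   if (a[i] < threshold) continue;
// has an early-continue branch fed by a loop-varying load, so the code above
// enables runtime unrolling to help the branch predictors.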
5460
5463 OptimizationRemarkEmitter *ORE) const {
5464 // Enable partial unrolling and runtime unrolling.
5465 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5466
5467 UP.UpperBound = true;
5468
5469 // An inner loop is more likely to be hot, and its runtime checks can be
5470 // promoted out by the LICM pass, so the overhead is lower; try a larger
5471 // threshold to unroll more loops.
5472 if (L->getLoopDepth() > 1)
5473 UP.PartialThreshold *= 2;
5474
5475 // Disable partial & runtime unrolling on -Os.
5476 UP.PartialOptSizeThreshold = 0;
5477
5478 // Scan the loop: don't unroll loops with calls as this could prevent
5479 // inlining. Don't unroll auto-vectorized loops either, though do allow
5480 // unrolling of the scalar remainder.
5481 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5482 InstructionCost Cost = 0;
5483 for (auto *BB : L->getBlocks()) {
5484 for (auto &I : *BB) {
5485 // Both auto-vectorized loops and the scalar remainder have the
5486 // isvectorized attribute, so differentiate between them by the presence
5487 // of vector instructions.
5488 if (IsVectorized && I.getType()->isVectorTy())
5489 return;
5490 if (isa<CallBase>(I)) {
5491 if (isa<CallInst>(I) || isa<InvokeInst>(I))
5492 if (const Function *F = cast<CallBase>(I).getCalledFunction())
5493 if (!isLoweredToCall(F))
5494 continue;
5495 return;
5496 }
5497
5498 SmallVector<const Value *, 4> Operands(I.operand_values());
5499 Cost += getInstructionCost(&I, Operands,
5500 TargetTransformInfo::TCK_SizeAndLatency);
5501 }
5502 }
5503
5504 // Apply subtarget-specific unrolling preferences.
5505 if (ST->isAppleMLike())
5506 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5507 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5508 EnableFalkorHWPFUnrollFix)
5509 getFalkorUnrollingPreferences(L, SE, UP);
5510
5511 // If this is a small, multi-exit loop similar to something like std::find,
5512 // then there is typically a performance improvement achieved by unrolling.
5513 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5514 UP.RuntimeUnrollMultiExit = true;
5515 UP.Runtime = true;
5516 // Limit unroll count.
5517 UP.DefaultUnrollRuntimeCount = 4;
5518 // Allow slightly more costly trip-count expansion to catch search loops
5519 // with pointer inductions.
5520 UP.SCEVExpansionBudget = 5;
5521 return;
5522 }
5523
5524 // Enable runtime unrolling for in-order models.
5525 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so
5526 // by checking for that case, we can ensure that the default behaviour is
5527 // unchanged.
5528 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5529 !ST->getSchedModel().isOutOfOrder()) {
5530 UP.Runtime = true;
5531 UP.Partial = true;
5532 UP.UnrollRemainder = true;
5533 UP.DefaultUnrollRuntimeCount = 4;
5534
5535 UP.UnrollAndJam = true;
5536 UP.UnrollAndJamInnerLoopThreshold = 60;
5537 }
5538
5539 // Force-unrolling small loops can be very useful because of the
5540 // branch-taken cost of the backedge.
5541 if (isLoopSizeWithinBudget(L, *this, AArch64ForceUnrollThreshold, nullptr))
5542 UP.Force = true;
5543}
5544
5545 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
5546 TTI::PeelingPreferences &PP) const {
5547 BaseT::getPeelingPreferences(L, SE, PP);
5548}
5549
5550 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
5551 Type *ExpectedType,
5552 bool CanCreate) const {
5553 switch (Inst->getIntrinsicID()) {
5554 default:
5555 return nullptr;
5556 case Intrinsic::aarch64_neon_st2:
5557 case Intrinsic::aarch64_neon_st3:
5558 case Intrinsic::aarch64_neon_st4: {
5559 // Create a struct type
5560 StructType *ST = dyn_cast<StructType>(ExpectedType);
5561 if (!CanCreate || !ST)
5562 return nullptr;
5563 unsigned NumElts = Inst->arg_size() - 1;
5564 if (ST->getNumElements() != NumElts)
5565 return nullptr;
5566 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5567 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5568 return nullptr;
5569 }
5570 Value *Res = PoisonValue::get(ExpectedType);
5571 IRBuilder<> Builder(Inst);
5572 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5573 Value *L = Inst->getArgOperand(i);
5574 Res = Builder.CreateInsertValue(Res, L, i);
5575 }
5576 return Res;
5577 }
5578 case Intrinsic::aarch64_neon_ld2:
5579 case Intrinsic::aarch64_neon_ld3:
5580 case Intrinsic::aarch64_neon_ld4:
5581 if (Inst->getType() == ExpectedType)
5582 return Inst;
5583 return nullptr;
5584 }
5585}
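// Illustrative example (hypothetical IR, not from the source): for
// @llvm.aarch64.neon.st2.v4i32 with an expected {<4 x i32>, <4 x i32>}
// struct type, the two stored registers are repacked with insertvalue so a
// matching ld2 can be forwarded from them.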
5586
5587 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5588 MemIntrinsicInfo &Info) const {
5589 switch (Inst->getIntrinsicID()) {
5590 default:
5591 break;
5592 case Intrinsic::aarch64_neon_ld2:
5593 case Intrinsic::aarch64_neon_ld3:
5594 case Intrinsic::aarch64_neon_ld4:
5595 Info.ReadMem = true;
5596 Info.WriteMem = false;
5597 Info.PtrVal = Inst->getArgOperand(0);
5598 break;
5599 case Intrinsic::aarch64_neon_st2:
5600 case Intrinsic::aarch64_neon_st3:
5601 case Intrinsic::aarch64_neon_st4:
5602 Info.ReadMem = false;
5603 Info.WriteMem = true;
5604 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5605 break;
5606 }
5607
5608 switch (Inst->getIntrinsicID()) {
5609 default:
5610 return false;
5611 case Intrinsic::aarch64_neon_ld2:
5612 case Intrinsic::aarch64_neon_st2:
5613 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5614 break;
5615 case Intrinsic::aarch64_neon_ld3:
5616 case Intrinsic::aarch64_neon_st3:
5617 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5618 break;
5619 case Intrinsic::aarch64_neon_ld4:
5620 case Intrinsic::aarch64_neon_st4:
5621 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5622 break;
5623 }
5624 return true;
5625}
5626
5627/// See if \p I should be considered for address type promotion. We check if
5628/// \p I is a sext with the right type that is used in memory accesses. If it
5629/// is used in a "complex" getelementptr, we allow it to be promoted without
5630/// finding other sext instructions that sign extended the same initial value.
5631/// A getelementptr is considered "complex" if it has more than 2 operands.
5633 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5634 bool Considerable = false;
5635 AllowPromotionWithoutCommonHeader = false;
5636 if (!isa<SExtInst>(&I))
5637 return false;
5638 Type *ConsideredSExtType =
5639 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5640 if (I.getType() != ConsideredSExtType)
5641 return false;
5642 // See if the sext is the one with the right type and used in at least one
5643 // GetElementPtrInst.
5644 for (const User *U : I.users()) {
5645 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5646 Considerable = true;
5647 // A getelementptr is considered as "complex" if it has more than 2
5648 // operands. We will promote a SExt used in such complex GEP as we
5649 // expect some computation to be merged if they are done on 64 bits.
5650 if (GEPInst->getNumOperands() > 2) {
5651 AllowPromotionWithoutCommonHeader = true;
5652 break;
5653 }
5654 }
5655 }
5656 return Considerable;
5657}
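// Illustrative example (hypothetical IR, not from the source):
//   %e = sext i32 %i to i64
//   %p = getelementptr [100 x i32], ptr %b, i64 %row, i64 %e
// The GEP has more than 2 operands, so the sext is considered for promotion
// without requiring a common header.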
5658
5659 bool AArch64TTIImpl::isLegalToVectorizeReduction(
5660 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5661 if (!VF.isScalable())
5662 return true;
5663
5664 Type *Ty = RdxDesc.getRecurrenceType();
5665 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5666 return false;
5667
5668 switch (RdxDesc.getRecurrenceKind()) {
5669 case RecurKind::Sub:
5671 case RecurKind::Add:
5672 case RecurKind::FAdd:
5673 case RecurKind::And:
5674 case RecurKind::Or:
5675 case RecurKind::Xor:
5676 case RecurKind::SMin:
5677 case RecurKind::SMax:
5678 case RecurKind::UMin:
5679 case RecurKind::UMax:
5680 case RecurKind::FMin:
5681 case RecurKind::FMax:
5682 case RecurKind::FMulAdd:
5683 case RecurKind::AnyOf:
5685 return true;
5686 default:
5687 return false;
5688 }
5689}
5690
5691 InstructionCost
5692 AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5693 FastMathFlags FMF,
5694 TTI::TargetCostKind CostKind) const {
5695 // The code-generator is currently not able to handle scalable vectors
5696 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5697 // it. This change will be removed when code-generation for these types is
5698 // sufficiently reliable.
5699 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5700 if (VTy->getElementCount() == ElementCount::getScalable(1))
5701 return InstructionCost::getInvalid();
5702
5703 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5704
5705 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5706 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5707
5708 InstructionCost LegalizationCost = 0;
5709 if (LT.first > 1) {
5710 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5711 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5712 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5713 }
5714
5715 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5716}
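// Illustrative example (hypothetical IR, not from the source):
// llvm.vector.reduce.fmax.v8f32 legalizes to two v4f32 halves: one fmax
// intrinsic step to combine them (LT.first - 1 == 1) plus the flat cost of 2
// for the final horizontal fmaxv.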
5717
5718 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5719 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5720 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5721 InstructionCost LegalizationCost = 0;
5722 if (LT.first > 1) {
5723 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5724 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5725 LegalizationCost *= LT.first - 1;
5726 }
5727
5728 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5729 assert(ISD && "Invalid opcode");
5730 // Add the final reduction cost for the legal horizontal reduction
5731 switch (ISD) {
5732 case ISD::ADD:
5733 case ISD::AND:
5734 case ISD::OR:
5735 case ISD::XOR:
5736 case ISD::FADD:
5737 return LegalizationCost + 2;
5738 default:
5739 return InstructionCost::getInvalid();
5740 }
5741}
5742
5743 InstructionCost
5744 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5745 std::optional<FastMathFlags> FMF,
5747 // The code-generator is currently not able to handle scalable vectors
5748 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5749 // it. This change will be removed when code-generation for these types is
5750 // sufficiently reliable.
5751 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5752 if (VTy->getElementCount() == ElementCount::getScalable(1))
5753 return InstructionCost::getInvalid();
5754
5755 if (CostKind != TTI::TCK_RecipThroughput) {
5756 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5757 InstructionCost BaseCost =
5758 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5759 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5760 // end up vectorizing for more computationally intensive loops.
5761 return BaseCost + FixedVTy->getNumElements();
5762 }
5763
5764 if (Opcode != Instruction::FAdd)
5765 return InstructionCost::getInvalid();
5766
5767 auto *VTy = cast<ScalableVectorType>(ValTy);
5768 InstructionCost Cost =
5769 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5770 Cost *= getMaxNumElements(VTy->getElementCount());
5771 return Cost;
5772 }
5773
5774 if (isa<ScalableVectorType>(ValTy))
5775 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5776
5777 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5778 MVT MTy = LT.second;
5779 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5780 assert(ISD && "Invalid opcode");
5781
5782 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5783 // instructions as twice a normal vector add, plus 1 for each legalization
5784 // step (LT.first). This is the only arithmetic vector reduction operation for
5785 // which we have an instruction.
5786 // OR, XOR and AND costs should match the codegen from:
5787 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5788 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5789 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5790 static const CostTblEntry CostTblNoPairwise[]{
5791 {ISD::ADD, MVT::v8i8, 2},
5792 {ISD::ADD, MVT::v16i8, 2},
5793 {ISD::ADD, MVT::v4i16, 2},
5794 {ISD::ADD, MVT::v8i16, 2},
5795 {ISD::ADD, MVT::v2i32, 2},
5796 {ISD::ADD, MVT::v4i32, 2},
5797 {ISD::ADD, MVT::v2i64, 2},
5798 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5799 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5800 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5801 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5802 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5803 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5804 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5805 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5806 {ISD::XOR, MVT::v16i8, 7},
5807 {ISD::XOR, MVT::v4i16, 4},
5808 {ISD::XOR, MVT::v8i16, 6},
5809 {ISD::XOR, MVT::v2i32, 3},
5810 {ISD::XOR, MVT::v4i32, 5},
5811 {ISD::XOR, MVT::v2i64, 3},
5812 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5813 {ISD::AND, MVT::v16i8, 7},
5814 {ISD::AND, MVT::v4i16, 4},
5815 {ISD::AND, MVT::v8i16, 6},
5816 {ISD::AND, MVT::v2i32, 3},
5817 {ISD::AND, MVT::v4i32, 5},
5818 {ISD::AND, MVT::v2i64, 3},
5819 };
5820 switch (ISD) {
5821 default:
5822 break;
5823 case ISD::FADD:
5824 if (Type *EltTy = ValTy->getScalarType();
5825 // FIXME: For half types without fullfp16 support, this could extend and
5826 // use a fp32 faddp reduction but current codegen unrolls.
5827 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5828 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5829 const unsigned NElts = MTy.getVectorNumElements();
5830 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5831 isPowerOf2_32(NElts))
5832 // Reduction corresponding to series of fadd instructions is lowered to
5833 // series of faddp instructions. faddp has latency/throughput that
5834 // matches fadd instruction and hence, every faddp instruction can be
5835 // considered to have a relative cost = 1 with
5836 // CostKind = TCK_RecipThroughput.
5837 // An faddp will pairwise add vector elements, so the size of input
5838 // vector reduces by half every time, requiring
5839 // #(faddp instructions) = log2_32(NElts).
5840 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5841 }
5842 break;
5843 case ISD::ADD:
5844 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5845 return (LT.first - 1) + Entry->Cost;
5846 break;
5847 case ISD::XOR:
5848 case ISD::AND:
5849 case ISD::OR:
5850 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5851 if (!Entry)
5852 break;
5853 auto *ValVTy = cast<FixedVectorType>(ValTy);
5854 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5855 isPowerOf2_32(ValVTy->getNumElements())) {
5856 InstructionCost ExtraCost = 0;
5857 if (LT.first != 1) {
5858 // Type needs to be split, so there is an extra cost of LT.first - 1
5859 // arithmetic ops.
5860 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5861 MTy.getVectorNumElements());
5862 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5863 ExtraCost *= LT.first - 1;
5864 }
5865 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5866 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5867 return Cost + ExtraCost;
5868 }
5869 break;
5870 }
5871 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5872}
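// Illustrative examples (hypothetical IR, not from the source):
// llvm.vector.reduce.fadd.v4f32 (fast) costs Log2_32(4) = 2 faddp steps;
// llvm.vector.reduce.add.v16i8 hits the {ISD::ADD, v16i8, 2} addv entry.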
5873
5875 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5876 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5877 EVT VecVT = TLI->getValueType(DL, VecTy);
5878 EVT ResVT = TLI->getValueType(DL, ResTy);
5879
5880 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5881 VecVT.getSizeInBits() >= 64) {
5882 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5883
5884 // The legal cases are:
5885 // UADDLV 8/16/32->32
5886 // UADDLP 32->64
5887 unsigned RevVTSize = ResVT.getSizeInBits();
5888 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5889 RevVTSize <= 32) ||
5890 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5891 RevVTSize <= 32) ||
5892 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5893 RevVTSize <= 64))
5894 return (LT.first - 1) * 2 + 2;
5895 }
5896
5897 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5898 CostKind);
5899}
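// Illustrative example (hypothetical IR, not from the source): an add
// reduction of zext <8 x i8> to i32 maps to a single uaddlv, giving
// (LT.first - 1) * 2 + 2 == 2 for this fully legal type.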
5900
5901 InstructionCost
5902AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5903 Type *ResTy, VectorType *VecTy,
5905 EVT VecVT = TLI->getValueType(DL, VecTy);
5906 EVT ResVT = TLI->getValueType(DL, ResTy);
5907
5908 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5909 RedOpcode == Instruction::Add) {
5910 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5911
5912 // The legal cases with dotprod are
5913 // UDOT 8->32
5914 // Which requires an additional uaddv to sum the i32 values.
5915 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5916 ResVT == MVT::i32)
5917 return LT.first + 2;
5918 }
5919
5920 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5921 CostKind);
5922}
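// Illustrative example (hypothetical IR, not from the source): an add
// reduction of mul(zext <16 x i8>, zext <16 x i8>) to i32 with +dotprod maps
// to udot plus a final addv, giving LT.first + 2 == 3.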
5923
5924 InstructionCost
5925 AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
5926 TTI::TargetCostKind CostKind) const {
5927 static const CostTblEntry ShuffleTbl[] = {
5928 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5929 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5930 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5931 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5932 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5933 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5934 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5935 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5936 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5937 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5938 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5939 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5940 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5941 };
5942
5943 // The code-generator is currently not able to handle scalable vectors
5944 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5945 // it. This change will be removed when code-generation for these types is
5946 // sufficiently reliable.
5947 if (Tp->getElementCount() == ElementCount::getScalable(1))
5948 return InstructionCost::getInvalid();
5949
5950 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5951 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5952 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5953 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5954 : LT.second;
5955 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5956 InstructionCost LegalizationCost = 0;
5957 if (Index < 0) {
5958 LegalizationCost =
5959 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5960 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
5961 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5962 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5963 }
5964
5965 // Predicated splices are promoted when lowering. See AArch64ISelLowering.cpp.
5966 // The cost is computed on the promoted type.
5967 if (LT.second.getScalarType() == MVT::i1) {
5968 LegalizationCost +=
5969 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5970 TTI::CastContextHint::None, CostKind) +
5971 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5972 TTI::CastContextHint::None, CostKind);
5973 }
5974 const auto *Entry =
5975 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5976 assert(Entry && "Illegal Type for Splice");
5977 LegalizationCost += Entry->Cost;
5978 return LegalizationCost * LT.first;
5979}
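// Illustrative example (hypothetical IR, not from the source): a splice of
// <vscale x 4 x i1> is promoted to nxv4i32, paying the zext/trunc casts
// above plus the single-instruction nxv4i32 splice table entry.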
5980
5981 InstructionCost AArch64TTIImpl::getPartialReductionCost(
5982 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5983 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
5984 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5985 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
5986 InstructionCost Invalid = InstructionCost::getInvalid();
5987
5988 if (CostKind != TTI::TCK_RecipThroughput)
5989 return Invalid;
5990
5991 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
5992 Opcode != Instruction::FAdd) ||
5993 OpAExtend == TTI::PR_None)
5994 return Invalid;
5995
5996 // Floating-point partial reductions are invalid if `reassoc` and `contract`
5997 // are not allowed.
5998 if (AccumType->isFloatingPointTy()) {
5999 assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
6000 if (!FMF->allowReassoc() || !FMF->allowContract())
6001 return Invalid;
6002 } else {
6003 assert(!FMF &&
6004 "FastMathFlags only apply to floating-point partial reductions");
6005 }
6006
6007 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
6008 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
6009 "Unexpected values for OpBExtend or InputTypeB");
6010
6011 // We only support multiply binary operations for now, and for muls we
6012 // require the types being extended to be the same.
6013 if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
6014 InputTypeA != InputTypeB))
6015 return Invalid;
6016
6017 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
6018 if (IsUSDot && !ST->hasMatMulInt8())
6019 // FIXME: Remove this early bailout in favour of expand cost.
6020 return Invalid;
6021
6022 unsigned Ratio =
6023 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
6024 if (VF.getKnownMinValue() <= Ratio)
6025 return Invalid;
6026
6027 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
6028 VectorType *AccumVectorType =
6029 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
6030 // We don't yet support all kinds of legalization.
6031 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
6032 EVT::getEVT(AccumVectorType));
6033 switch (TC.first) {
6034 default:
6035 return Invalid;
6036 case TargetLoweringBase::TypeLegal:
6037 case TargetLoweringBase::TypePromoteInteger:
6038 case TargetLoweringBase::TypeSplitVector:
6039 // The legalised type (e.g. after splitting) must be legal too.
6040 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
6041 TargetLoweringBase::TypeLegal)
6042 return Invalid;
6043 break;
6044 }
6045
6046 std::pair<InstructionCost, MVT> AccumLT =
6047 getTypeLegalizationCost(AccumVectorType);
6048 std::pair<InstructionCost, MVT> InputLT =
6049 getTypeLegalizationCost(InputVectorType);
6050
6051 // Returns true if the subtarget supports the operation for a given type.
6052 auto IsSupported = [&](bool SVEPred, bool NEONPred) -> bool {
6053 return (ST->isSVEorStreamingSVEAvailable() && SVEPred) ||
6054 (AccumLT.second.isFixedLengthVector() &&
6055 AccumLT.second.getSizeInBits() <= 128 && ST->isNeonAvailable() &&
6056 NEONPred);
6057 };
6058
6059 bool IsSub = Opcode == Instruction::Sub;
6060 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
6061 // Integer partial sub-reductions that don't map to a specific instruction,
6062 // carry an extra cost for implementing a double negation:
6063 // partial_reduce_umls acc, lhs, rhs
6064 // <=> -partial_reduce_umla -acc, lhs, rhs
6065 InstructionCost INegCost = IsSub ? 2 * InputLT.first * TTI::TCC_Basic : 0;
6066
6067 if (AccumLT.second.getScalarType() == MVT::i32 &&
6068 InputLT.second.getScalarType() == MVT::i8) {
6069 // i8 -> i32 is natively supported with udot/sdot for both NEON and SVE.
6070 if (!IsUSDot && IsSupported(true, ST->hasDotProd()))
6071 return Cost + INegCost;
6072 // i8 -> i32 usdot requires +i8mm
6073 if (IsUSDot && IsSupported(ST->hasMatMulInt8(), ST->hasMatMulInt8()))
6074 return Cost + INegCost;
6075 }
6076
6077 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
6078 // i16 -> i64 is natively supported for udot/sdot
6079 if (AccumLT.second.getScalarType() == MVT::i64 &&
6080 InputLT.second.getScalarType() == MVT::i16)
6081 return Cost + INegCost;
6082 // i16 -> i32 is natively supported with SVE2p1 udot/sdot.
6083 // For sub-reductions, we prefer using the *mlslb/t instructions.
6084 if (AccumLT.second.getScalarType() == MVT::i32 &&
6085 InputLT.second.getScalarType() == MVT::i16 &&
6086 (ST->hasSVE2p1() || ST->hasSME2()) && !IsSub)
6087 return Cost;
6088 // i8 -> i64 is supported with an extra level of extends
6089 if (AccumLT.second.getScalarType() == MVT::i64 &&
6090 InputLT.second.getScalarType() == MVT::i8)
6091 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
6092 // because it requires two extra extends on the inputs. But if we'd change
6093 // that now, a regular reduction would be cheaper because the costs of
6094 // the extends in the IR are still counted. This can be fixed
6095 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
6096 return Cost + INegCost;
6097 // i8 -> i16 is natively supported with SVE2p3 udot/sdot
6098 // For sub-reductions, we prefer using the *mlslb/t instructions.
6099 if (AccumLT.second.getScalarType() == MVT::i16 &&
6100 InputLT.second.getScalarType() == MVT::i8 &&
6101 (ST->hasSVE2p3() || ST->hasSME2p3()) && !IsSub)
6102 return Cost;
6103 }
6104
6105 // f16 -> f32 is natively supported for fdot using either an SVE
6106 // or a NEON instruction.
6107 if (Opcode == Instruction::FAdd && !IsSub &&
6108 IsSupported(ST->hasSME2() || ST->hasSVE2p1(), ST->hasF16F32DOT()) &&
6109 AccumLT.second.getScalarType() == MVT::f32 &&
6110 InputLT.second.getScalarType() == MVT::f16)
6111 return Cost;
6112
6113 // For a ratio of 2, we can use *mlal and *mlsl top/bottom instructions.
6114 if (Ratio == 2 && !IsUSDot) {
6115 MVT InVT = InputLT.second.getScalarType();
6116
6117 // SVE2 [us]ml[as]lb/t and NEON [us]ml[as]l(2)
6118 if (IsSupported(ST->hasSVE2(), true) &&
6119 llvm::is_contained({MVT::i8, MVT::i16, MVT::i32}, InVT.SimpleTy))
6120 return Cost * 2;
6121
6122 // SVE2 fmlalb/t and NEON fmlal(2)
6123 if (IsSupported(ST->hasSVE2(), ST->hasFP16FML()) && InVT == MVT::f16)
6124 return Cost * 2;
6125
6126 // SVE and NEON bfmlalb/t
6127 if (IsSupported(ST->hasBF16(), ST->hasBF16()) && InVT == MVT::bf16)
6128 return Cost * 2;
6129 }
6130
6131 return BaseT::getPartialReductionCost(Opcode, InputTypeA, InputTypeB,
6132 AccumType, VF, OpAExtend, OpBExtend,
6133 BinOp, CostKind, FMF);
6134}
6135
6136 InstructionCost
6137 AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
6138 VectorType *SrcTy, ArrayRef<int> Mask,
6139 TTI::TargetCostKind CostKind, int Index,
6140 VectorType *SubTp, ArrayRef<const Value *> Args,
6141 const Instruction *CxtI) const {
6142 assert((Mask.empty() || DstTy->isScalableTy() ||
6143 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
6144 "Expected the Mask to match the return size if given");
6145 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
6146 "Expected the same scalar types");
6147 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
6148
6149 // If we have a Mask, and the LT is being legalized somehow, split the Mask
6150 // into smaller vectors and sum the cost of each shuffle.
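// For example, a 32-element i8 mask legalized to v16i8 is costed as two
// 16-element sub-shuffles.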
6151 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
6152 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
6153 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
6154 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
6155 // Check for LD3/LD4 instructions, which are represented in LLVM IR as
6156 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
6157 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
6158 // cost than just the load.
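// For example, %l = load <12 x i32> followed by shufflevector %l, poison,
// <0, 3, 6, 9> is the deinterleave pattern of one LD3 result.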
6159 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
6160 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4) ||
6161 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3)))
6162 return std::max<InstructionCost>(1, LT.first / 4);
6163
6164 // Check for ST3/ST4 instructions, which are represented in LLVM IR as
6165 // store(interleaving-shuffle). The shuffle cost could potentially be free,
6166 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
6167 // cost than just the store.
6168 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
6169 (ShuffleVectorInst::isInterleaveMask(
6170 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
6171 ShuffleVectorInst::isInterleaveMask(
6172 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
6173 return LT.first;
6174
6175 unsigned TpNumElts = Mask.size();
6176 unsigned LTNumElts = LT.second.getVectorNumElements();
6177 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
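// NumVecs is ceil(TpNumElts / LTNumElts); trailing lanes beyond the original
// mask are treated as poison below.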
6178 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
6179 LT.second.getVectorElementCount());
6180 InstructionCost Cost;
6181 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
6182 PreviousCosts;
6183 for (unsigned N = 0; N < NumVecs; N++) {
6184 SmallVector<int> NMask;
6185 // Split the existing mask into chunks of size LTNumElts. Track the source
6186 // sub-vectors to ensure the result has at most 2 inputs.
6187 unsigned Source1 = -1U, Source2 = -1U;
6188 unsigned NumSources = 0;
6189 for (unsigned E = 0; E < LTNumElts; E++) {
6190 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
6191 : PoisonMaskElem;
6192 if (MaskElt < 0) {
6193 NMask.push_back(PoisonMaskElem);
6194 continue;
6195 }
6196
6197 // Calculate which input source vector this element comes from and whether
6198 // it is new to us.
6199 unsigned Source = MaskElt / LTNumElts;
6200 if (NumSources == 0) {
6201 Source1 = Source;
6202 NumSources = 1;
6203 } else if (NumSources == 1 && Source != Source1) {
6204 Source2 = Source;
6205 NumSources = 2;
6206 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6207 NumSources++;
6208 }
6209
6210 // Add to the new mask. For the NumSources > 2 case these values are not
6211 // correct, but they are only used for the modular lane number.
6212 if (Source == Source1)
6213 NMask.push_back(MaskElt % LTNumElts);
6214 else if (Source == Source2)
6215 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
6216 else
6217 NMask.push_back(MaskElt % LTNumElts);
6218 }
6219 // Check if we have already generated this sub-shuffle, which means we
6220 // will have already generated the output. For example a <16 x i32> splat
6221 // will be the same sub-splat 4 times, which only needs to be generated
6222 // once and reused.
6223 auto Result =
6224 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6225 // Check if it was already in the map (already costed).
6226 if (!Result.second)
6227 continue;
6228 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6229 // getShuffleCost. If not then cost it using the worst case as the number
6230 // of element moves into a new vector.
6231 InstructionCost NCost =
6232 NumSources <= 2
6233 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6234 : TTI::SK_PermuteTwoSrc,
6235 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6236 CxtI)
6237 : LTNumElts;
6238 Result.first->second = NCost;
6239 Cost += NCost;
6240 }
6241 return Cost;
6242 }
6243
6244 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6245 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6246 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6247 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6248 // This currently only handles low or high extracts to prevent SLP vectorizer
6249 // regressions.
6250 // Note that SVE's ext instruction is destructive, but it can be fused with
6251 // a movprfx to act like a constructive instruction.
6252 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6253 if (LT.second.getFixedSizeInBits() >= 128 &&
6254 cast<FixedVectorType>(SubTp)->getNumElements() ==
6255 LT.second.getVectorNumElements() / 2) {
6256 if (Index == 0)
6257 return 0;
6258 if (Index == (int)LT.second.getVectorNumElements() / 2)
6259 return 1;
6260 }
6261 Kind = TTI::SK_PermuteSingleSrc;
6262 }
6263 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6264 // the code to handle length-changing shuffles.
6265 if (Kind == TTI::SK_InsertSubvector) {
6266 LT = getTypeLegalizationCost(DstTy);
6267 SrcTy = DstTy;
6268 }
6269
6270 // Check for identity masks, which we can treat as free for both fixed and
6271 // scalable vector paths.
6272 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6273 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6274 all_of(enumerate(Mask), [](const auto &M) {
6275 return M.value() < 0 || M.value() == (int)M.index();
6276 }))
6277 return 0;
6278
6279 // Segmented shuffle matching.
6280 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6281 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6282 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6283 AArch64::SVEBitsPerBlock)) {
6284
6285 FixedVectorType *VTy = cast<FixedVectorType>(SrcTy);
6286 unsigned Segments =
6287 VTy->getPrimitiveSizeInBits().getKnownMinValue() / AArch64::SVEBitsPerBlock;
6288 unsigned SegmentElts = VTy->getNumElements() / Segments;
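// For example, a 256-bit v8i32 source has two 128-bit segments of four
// elements each.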
6289
6290 // dupq zd.t, zn.t[idx]
6291 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6292 ST->isSVEorStreamingSVEAvailable() &&
6293 isDUPQMask(Mask, Segments, SegmentElts))
6294 return LT.first;
6295
6296 // mov zd.q, vn
6297 if (ST->isSVEorStreamingSVEAvailable() &&
6298 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6299 return LT.first;
6300 }
6301
6302 // Check for broadcast loads, which are supported by the LD1R instruction.
6303 // In terms of code-size, the shuffle vector is free when a load + dup get
6304 // folded into a LD1R. That's what we check and return here. For performance
6305 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6306 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6307 // that we model the load + dup sequence slightly higher because LD1R is a
6308 // high latency instruction.
6309 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6310 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6311 if (IsLoad && LT.second.isVector() &&
6312 isLegalBroadcastLoad(SrcTy->getElementType(),
6313 LT.second.getVectorElementCount()))
6314 return 0;
6315 }
6316
6317 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6318 // from the perfect shuffle tables.
6319 if (Mask.size() == 4 &&
6320 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6321 (SrcTy->getScalarSizeInBits() == 16 ||
6322 SrcTy->getScalarSizeInBits() == 32) &&
6323 all_of(Mask, [](int E) { return E < 8; }))
6324 return getPerfectShuffleCost(Mask);
6325
6326 // Check for other shuffles that are not SK_ kinds but for which we have
6327 // native instructions, for example ZIP and UZP.
6328 unsigned Unused;
6329 if (LT.second.isFixedLengthVector() &&
6330 LT.second.getVectorNumElements() == Mask.size() &&
6331 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
6332 // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
6333 // mean that we can end up with shuffles that satisfy isTRNMask, but end
6334 // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
6335 Kind == TTI::SK_InsertSubvector) &&
6336 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6337 isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6338 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6339 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6340 LT.second.getVectorNumElements(), 16) ||
6341 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6342 LT.second.getVectorNumElements(), 32) ||
6343 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6344 LT.second.getVectorNumElements(), 64) ||
6345 // Check for non-zero lane splats
6346 all_of(drop_begin(Mask),
6347 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6348 return 1;
6349
6350 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6351 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6352 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6353 static const CostTblEntry ShuffleTbl[] = {
6354 // Broadcast shuffle kinds can be performed with 'dup'.
6355 {TTI::SK_Broadcast, MVT::v8i8, 1},
6356 {TTI::SK_Broadcast, MVT::v16i8, 1},
6357 {TTI::SK_Broadcast, MVT::v4i16, 1},
6358 {TTI::SK_Broadcast, MVT::v8i16, 1},
6359 {TTI::SK_Broadcast, MVT::v2i32, 1},
6360 {TTI::SK_Broadcast, MVT::v4i32, 1},
6361 {TTI::SK_Broadcast, MVT::v2i64, 1},
6362 {TTI::SK_Broadcast, MVT::v4f16, 1},
6363 {TTI::SK_Broadcast, MVT::v8f16, 1},
6364 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6365 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6366 {TTI::SK_Broadcast, MVT::v2f32, 1},
6367 {TTI::SK_Broadcast, MVT::v4f32, 1},
6368 {TTI::SK_Broadcast, MVT::v2f64, 1},
6369 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6370 // 'zip1/zip2' instructions.
6371 {TTI::SK_Transpose, MVT::v8i8, 1},
6372 {TTI::SK_Transpose, MVT::v16i8, 1},
6373 {TTI::SK_Transpose, MVT::v4i16, 1},
6374 {TTI::SK_Transpose, MVT::v8i16, 1},
6375 {TTI::SK_Transpose, MVT::v2i32, 1},
6376 {TTI::SK_Transpose, MVT::v4i32, 1},
6377 {TTI::SK_Transpose, MVT::v2i64, 1},
6378 {TTI::SK_Transpose, MVT::v4f16, 1},
6379 {TTI::SK_Transpose, MVT::v8f16, 1},
6380 {TTI::SK_Transpose, MVT::v4bf16, 1},
6381 {TTI::SK_Transpose, MVT::v8bf16, 1},
6382 {TTI::SK_Transpose, MVT::v2f32, 1},
6383 {TTI::SK_Transpose, MVT::v4f32, 1},
6384 {TTI::SK_Transpose, MVT::v2f64, 1},
6385 // Select shuffle kinds.
6386 // TODO: handle vXi8/vXi16.
6387 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6388 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6389 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6390 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6391 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6392 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6393 // PermuteSingleSrc shuffle kinds.
6394 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6395 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6396 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6397 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6398 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6399 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6400 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6401 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6402 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6403 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6404 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6405 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6406 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6407 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6408 // Reverse can be lowered with `rev`.
6409 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6410 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6411 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6412 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6413 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6414 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6415 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6416 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6417 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6418 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6419 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6420 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6421 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6422 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6423 // Splice can all be lowered as `ext`.
6424 {TTI::SK_Splice, MVT::v2i32, 1},
6425 {TTI::SK_Splice, MVT::v4i32, 1},
6426 {TTI::SK_Splice, MVT::v2i64, 1},
6427 {TTI::SK_Splice, MVT::v2f32, 1},
6428 {TTI::SK_Splice, MVT::v4f32, 1},
6429 {TTI::SK_Splice, MVT::v2f64, 1},
6430 {TTI::SK_Splice, MVT::v8f16, 1},
6431 {TTI::SK_Splice, MVT::v8bf16, 1},
6432 {TTI::SK_Splice, MVT::v8i16, 1},
6433 {TTI::SK_Splice, MVT::v16i8, 1},
6434 {TTI::SK_Splice, MVT::v4f16, 1},
6435 {TTI::SK_Splice, MVT::v4bf16, 1},
6436 {TTI::SK_Splice, MVT::v4i16, 1},
6437 {TTI::SK_Splice, MVT::v8i8, 1},
6438 // Broadcast shuffle kinds for scalable vectors
6439 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6440 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6441 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6442 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6443 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6444 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6445 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6446 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6447 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6448 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6449 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6450 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6451 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6452 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6453 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6454 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6455 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6456 // Handle the cases for vector.reverse with scalable vectors
6457 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6458 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6459 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6460 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6461 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6462 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6463 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6464 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6465 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6466 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6467 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6468 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6469 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6470 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6471 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6472 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6473 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6474 };
6475 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6476 return LT.first * Entry->Cost;
6477 }
6478
6479 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6480 return getSpliceCost(SrcTy, Index, CostKind);
6481
6482 // Inserting a subvector can often be done with either a D, S or H register
6483 // move, so long as the inserted vector is "aligned".
6484 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6485 LT.second.getSizeInBits() <= 128 && SubTp) {
6486 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6487 if (SubLT.second.isVector()) {
6488 int NumElts = LT.second.getVectorNumElements();
6489 int NumSubElts = SubLT.second.getVectorNumElements();
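// "Aligned" here means Index is a multiple of the subvector's element count,
// so the inserted subvector lines up with a whole D, S or H register.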
6490 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6491 return SubLT.first;
6492 }
6493 }
6494
6495 // Restore optimal kind.
6496 if (IsExtractSubvector)
6497 Kind = TTI::SK_ExtractSubvector;
6498 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6499 Args, CxtI);
6500}
6501
6502 static bool containsDecreasingPointers(Loop *TheLoop,
6503 PredicatedScalarEvolution *PSE,
6504 const DominatorTree &DT) {
6505 const auto &Strides = DenseMap<Value *, const SCEV *>();
6506 for (BasicBlock *BB : TheLoop->blocks()) {
6507 // Scan the instructions in the block and look for addresses that are
6508 // consecutive and decreasing.
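// For example, a loop indexing A[N-i] produces a consecutive but decreasing
// address sequence, i.e. a negative stride.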
6509 for (Instruction &I : *BB) {
6510 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6511 Value *Ptr = getLoadStorePointerOperand(&I);
6512 Type *AccessTy = getLoadStoreType(&I);
6513 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6514 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6515 .value_or(0) < 0)
6516 return true;
6517 }
6518 }
6519 }
6520 return false;
6521}
6522
6523 bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6524 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6525 return SVEPreferFixedOverScalableIfEqualCost;
6526 // For cases like post-LTO vectorization, where the trip count eventually
6527 // becomes known, a fixed-width vectorized epilogue can be deleted entirely
6528 // if the trip count is less than the number of epilogue iterations. That is
6529 // why, at equal cost, we prefer fixed-width vectorization for the epilogue.
6530 if (IsEpilogue)
6531 return true;
6532 return ST->useFixedOverScalableIfEqualCost();
6533}
6534
6535 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6536 return ST->getEpilogueVectorizationMinVF();
6537}
6538
6539 bool AArch64TTIImpl::preferTailFoldingOverEpilogue(TailFoldingInfo *TFI) const {
6540 if (!ST->hasSVE())
6541 return false;
6542
6543 // We don't currently support vectorisation with interleaving for SVE - with
6544 // such loops we're better off not using tail-folding. This gives us a chance
6545 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6546 if (TFI->IAI->hasGroups())
6547 return false;
6548
6549 TailFoldingOpts Required = TailFoldingOpts::Disabled;
6550 if (TFI->LVL->getReductionVars().size())
6551 Required |= TailFoldingOpts::Reductions;
6552 if (TFI->LVL->getFixedOrderRecurrences().size())
6553 Required |= TailFoldingOpts::Recurrences;
6554
6555 // We call this to discover whether any load/store pointers in the loop have
6556 // negative strides. This will require extra work to reverse the loop
6557 // predicate, which may be expensive.
6558 if (containsDecreasingPointers(TFI->LVL->getLoop(),
6559 TFI->LVL->getPredicatedScalarEvolution(),
6560 *TFI->LVL->getDominatorTree()))
6561 Required |= TailFoldingOpts::Reverse;
6562 if (Required == TailFoldingOpts::Disabled)
6563 Required |= TailFoldingOpts::Simple;
6564
6565 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6566 Required))
6567 return false;
6568
6569 // Don't tail-fold for tight loops where we would be better off interleaving
6570 // with an unpredicated loop.
6571 unsigned NumInsns = 0;
6572 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6573 NumInsns += BB->size();
6574 }
6575
6576 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6577 return NumInsns >= SVETailFoldInsnThreshold;
6578}
6579
6580 InstructionCost
6581 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6582 StackOffset BaseOffset, bool HasBaseReg,
6583 int64_t Scale, unsigned AddrSpace) const {
6584 // Scaling factors are not free at all.
6585 // Operands | Rt Latency
6586 // -------------------------------------------
6587 // Rt, [Xn, Xm] | 4
6588 // -------------------------------------------
6589 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6590 // Rt, [Xn, Wm, <extend> #imm] |
6591 TargetLoweringBase::AddrMode AM;
6592 AM.BaseGV = BaseGV;
6593 AM.BaseOffs = BaseOffset.getFixed();
6594 AM.HasBaseReg = HasBaseReg;
6595 AM.Scale = Scale;
6596 AM.ScalableOffset = BaseOffset.getScalable();
6597 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6598 // Scale represents reg2 * scale, thus account for 1 if
6599 // it is not equal to 0 or 1.
6600 return AM.Scale != 0 && AM.Scale != 1;
6601 return InstructionCost::getInvalid();
6602}
6603
6604 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6605 const Instruction *I) const {
6606 if (EnableOrLikeSelectOpt) {
6607 // For the binary operators (e.g. or) we need to be more careful than with
6608 // selects; here we only transform them if they are already at a natural
6609 // break point in the code - the end of a block with an unconditional
6610 // terminator.
6611 if (I->getOpcode() == Instruction::Or &&
6612 isa<UncondBrInst>(I->getNextNode()))
6613 return true;
6614
6615 if (I->getOpcode() == Instruction::Add ||
6616 I->getOpcode() == Instruction::Sub)
6617 return true;
6618 }
6619 return BaseT::shouldTreatInstructionLikeSelect(I);
6620}
6621
6622 bool AArch64TTIImpl::isLSRCostLess(
6623 const TargetTransformInfo::LSRCost &C1,
6624 const TargetTransformInfo::LSRCost &C2) const {
6626 // comparison (though not as the first consideration, as some targets do)
6627 // along with changing the priority of the base additions.
6628 // TODO: Maybe a more nuanced tradeoff between instruction count
6629 // and number of registers? To be investigated at a later date.
6630 if (EnableLSRCostOpt)
6631 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6632 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6633 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6634 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6635
6636 return BaseT::isLSRCostLess(C1, C2);
6637}
6638
6639static bool isSplatShuffle(Value *V) {
6640 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6641 return all_equal(Shuf->getShuffleMask());
6642 return false;
6643}
6644
6645/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6646/// or upper half of the vector elements.
6647static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6648 bool AllowSplat = false) {
6649 // Scalable types can't be extract shuffle vectors.
6650 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6651 return false;
6652
6653 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6654 auto *FullTy = FullV->getType();
6655 auto *HalfTy = HalfV->getType();
6656 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6657 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6658 };
6659
6660 auto extractHalf = [](Value *FullV, Value *HalfV) {
6661 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6662 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6663 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6664 };
6665
6666 ArrayRef<int> M1, M2;
6667 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6668 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6669 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6670 return false;
6671
6672 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6673 // it is not checked as an extract below.
6674 if (AllowSplat && isSplatShuffle(Op1))
6675 S1Op1 = nullptr;
6676 if (AllowSplat && isSplatShuffle(Op2))
6677 S2Op1 = nullptr;
6678
6679 // Check that the operands are half as wide as the result and we extract
6680 // half of the elements of the input vectors.
6681 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6682 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6683 return false;
6684
6685 // Check the mask extracts either the lower or upper half of vector
6686 // elements.
6687 int M1Start = 0;
6688 int M2Start = 0;
6689 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6690 if ((S1Op1 &&
6691 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6692 (S2Op1 &&
6693 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6694 return false;
6695
6696 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6697 (M2Start != 0 && M2Start != (NumElements / 2)))
6698 return false;
6699 if (S1Op1 && S2Op1 && M1Start != M2Start)
6700 return false;
6701
6702 return true;
6703}
6704
6705/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6706/// of the vector elements.
6707static bool areExtractExts(Value *Ext1, Value *Ext2) {
6708 auto areExtDoubled = [](Instruction *Ext) {
6709 return Ext->getType()->getScalarSizeInBits() ==
6710 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6711 };
6712
6713 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6714 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6715 !areExtDoubled(cast<Instruction>(Ext1)) ||
6716 !areExtDoubled(cast<Instruction>(Ext2)))
6717 return false;
6718
6719 return true;
6720}
6721
6722/// Check if Op could be used with vmull_high_p64 intrinsic.
6723 static bool isOperandOfVmullHighP64(Value *Op) {
6724 Value *VectorOperand = nullptr;
6725 ConstantInt *ElementIndex = nullptr;
6726 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6727 m_ConstantInt(ElementIndex))) &&
6728 ElementIndex->getValue() == 1 &&
6729 isa<FixedVectorType>(VectorOperand->getType()) &&
6730 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6731}
6732
6733/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6734static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6735 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
6736}
6737
6738 static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6739 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6740 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6741 if (!GEP || GEP->getNumOperands() != 2)
6742 return false;
6743
6744 Value *Base = GEP->getOperand(0);
6745 Value *Offsets = GEP->getOperand(1);
6746
6747 // We only care about scalar_base+vector_offsets.
6748 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6749 return false;
6750
6751 // Sink extends that would allow us to use 32-bit offset vectors.
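// For example, sinking a zext of <N x i32> offsets to i64 lets the gather or
// scatter use an extending 32-bit offset form instead of a widened vector.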
6752 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6753 auto *OffsetsInst = cast<Instruction>(Offsets);
6754 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6755 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6756 Ops.push_back(&GEP->getOperandUse(1));
6757 }
6758
6759 // Sink the GEP.
6760 return true;
6761}
6762
6763/// We want to sink following cases:
6764/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6765/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6766 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6767 if (match(Op, m_VScale()))
6768 return true;
6769 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6770 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
6771 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6772 return true;
6773 }
6774 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6775 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
6776 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6777 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6778 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6779 return true;
6780 }
6781 return false;
6782}
6783
6784static bool isFNeg(Value *Op) { return match(Op, m_FNeg(m_Value())); }
6785
6786/// Check if sinking \p I's operands to I's basic block is profitable, because
6787/// the operands can be folded into a target instruction, e.g.
6788/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6789 bool AArch64TTIImpl::isProfitableToSinkOperands(
6790 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6791 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
6792 switch (II->getIntrinsicID()) {
6793 case Intrinsic::aarch64_neon_smull:
6794 case Intrinsic::aarch64_neon_umull:
6795 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6796 /*AllowSplat=*/true)) {
6797 Ops.push_back(&II->getOperandUse(0));
6798 Ops.push_back(&II->getOperandUse(1));
6799 return true;
6800 }
6801 [[fallthrough]];
6802
6803 case Intrinsic::fma:
6804 case Intrinsic::fmuladd:
6805 if (isa<VectorType>(I->getType()) &&
6806 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6807 !ST->hasFullFP16())
6808 return false;
6809
6810 if (isFNeg(II->getOperand(0)))
6811 Ops.push_back(&II->getOperandUse(0));
6812 if (isFNeg(II->getOperand(1)))
6813 Ops.push_back(&II->getOperandUse(1));
6814
6815 [[fallthrough]];
6816 case Intrinsic::aarch64_neon_sqdmull:
6817 case Intrinsic::aarch64_neon_sqdmulh:
6818 case Intrinsic::aarch64_neon_sqrdmulh:
6819 // Sink splats for index lane variants
6820 if (isSplatShuffle(II->getOperand(0)))
6821 Ops.push_back(&II->getOperandUse(0));
6822 if (isSplatShuffle(II->getOperand(1)))
6823 Ops.push_back(&II->getOperandUse(1));
6824 return !Ops.empty();
6825 case Intrinsic::aarch64_neon_fmlal:
6826 case Intrinsic::aarch64_neon_fmlal2:
6827 case Intrinsic::aarch64_neon_fmlsl:
6828 case Intrinsic::aarch64_neon_fmlsl2:
6829 // Sink splats for index lane variants
6830 if (isSplatShuffle(II->getOperand(1)))
6831 Ops.push_back(&II->getOperandUse(1));
6832 if (isSplatShuffle(II->getOperand(2)))
6833 Ops.push_back(&II->getOperandUse(2));
6834 return !Ops.empty();
6835 case Intrinsic::aarch64_sve_ptest_first:
6836 case Intrinsic::aarch64_sve_ptest_last:
6837 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6838 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6839 Ops.push_back(&II->getOperandUse(0));
6840 return !Ops.empty();
6841 case Intrinsic::aarch64_sme_write_horiz:
6842 case Intrinsic::aarch64_sme_write_vert:
6843 case Intrinsic::aarch64_sme_writeq_horiz:
6844 case Intrinsic::aarch64_sme_writeq_vert: {
6845 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6846 if (!Idx || Idx->getOpcode() != Instruction::Add)
6847 return false;
6848 Ops.push_back(&II->getOperandUse(1));
6849 return true;
6850 }
6851 case Intrinsic::aarch64_sme_read_horiz:
6852 case Intrinsic::aarch64_sme_read_vert:
6853 case Intrinsic::aarch64_sme_readq_horiz:
6854 case Intrinsic::aarch64_sme_readq_vert:
6855 case Intrinsic::aarch64_sme_ld1b_vert:
6856 case Intrinsic::aarch64_sme_ld1h_vert:
6857 case Intrinsic::aarch64_sme_ld1w_vert:
6858 case Intrinsic::aarch64_sme_ld1d_vert:
6859 case Intrinsic::aarch64_sme_ld1q_vert:
6860 case Intrinsic::aarch64_sme_st1b_vert:
6861 case Intrinsic::aarch64_sme_st1h_vert:
6862 case Intrinsic::aarch64_sme_st1w_vert:
6863 case Intrinsic::aarch64_sme_st1d_vert:
6864 case Intrinsic::aarch64_sme_st1q_vert:
6865 case Intrinsic::aarch64_sme_ld1b_horiz:
6866 case Intrinsic::aarch64_sme_ld1h_horiz:
6867 case Intrinsic::aarch64_sme_ld1w_horiz:
6868 case Intrinsic::aarch64_sme_ld1d_horiz:
6869 case Intrinsic::aarch64_sme_ld1q_horiz:
6870 case Intrinsic::aarch64_sme_st1b_horiz:
6871 case Intrinsic::aarch64_sme_st1h_horiz:
6872 case Intrinsic::aarch64_sme_st1w_horiz:
6873 case Intrinsic::aarch64_sme_st1d_horiz:
6874 case Intrinsic::aarch64_sme_st1q_horiz: {
6875 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6876 if (!Idx || Idx->getOpcode() != Instruction::Add)
6877 return false;
6878 Ops.push_back(&II->getOperandUse(3));
6879 return true;
6880 }
6881 case Intrinsic::aarch64_neon_pmull:
6882 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6883 return false;
6884 Ops.push_back(&II->getOperandUse(0));
6885 Ops.push_back(&II->getOperandUse(1));
6886 return true;
6887 case Intrinsic::aarch64_neon_pmull64:
6888 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6889 II->getArgOperand(1)))
6890 return false;
6891 Ops.push_back(&II->getArgOperandUse(0));
6892 Ops.push_back(&II->getArgOperandUse(1));
6893 return true;
6894 case Intrinsic::masked_gather:
6895 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6896 return false;
6897 Ops.push_back(&II->getArgOperandUse(0));
6898 return true;
6899 case Intrinsic::masked_scatter:
6900 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6901 return false;
6902 Ops.push_back(&II->getArgOperandUse(1));
6903 return true;
6904 default:
6905 return false;
6906 }
6907 }
6908
6909 auto ShouldSinkCondition = [](Value *Cond,
6910 SmallVectorImpl<Use *> &Ops) -> bool {
6911 if (!isa<IntrinsicInst>(Cond))
6912 return false;
6913 auto *II = cast<IntrinsicInst>(Cond);
6914 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6915 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6916 return false;
6917 if (isa<CmpInst>(II->getOperand(0)))
6918 Ops.push_back(&II->getOperandUse(0));
6919 return true;
6920 };
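// ShouldSinkCondition is shared by the Select and CondBr cases below; it
// sinks a compare feeding a scalable vector_reduce_or closer to its use.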
6921
6922 switch (I->getOpcode()) {
6923 case Instruction::GetElementPtr:
6924 case Instruction::Add:
6925 case Instruction::Sub:
6926 // Sink vscales closer to uses for better isel
6927 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6928 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6929 Ops.push_back(&I->getOperandUse(Op));
6930 return true;
6931 }
6932 }
6933 break;
6934 case Instruction::Select: {
6935 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6936 return false;
6937
6938 Ops.push_back(&I->getOperandUse(0));
6939 return true;
6940 }
6941 case Instruction::UncondBr:
6942 return false;
6943 case Instruction::CondBr: {
6944 if (!ShouldSinkCondition(cast<CondBrInst>(I)->getCondition(), Ops))
6945 return false;
6946
6947 Ops.push_back(&I->getOperandUse(0));
6948 return true;
6949 }
6950 case Instruction::FMul:
6951 // fmul with contract flag can be combined with fadd into fma.
6952 // Sinking fneg into this block enables fmls pattern.
6953 if (cast<FPMathOperator>(I)->hasAllowContract()) {
6954 if (isFNeg(I->getOperand(0)))
6955 Ops.push_back(&I->getOperandUse(0));
6956 if (isFNeg(I->getOperand(1)))
6957 Ops.push_back(&I->getOperandUse(1));
6958 }
6959 break;
6960
6961 // Type | BIC | ORN | EON
6962 // ----------------+-----------+-----------+-----------
6963 // scalar | Base | Base | Base
6964 // scalar w/shift | - | - | -
6965 // fixed vector | NEON/Base | NEON/Base | BSL2N/Base
6966 // scalable vector | SVE | - | BSL2N
6967 case Instruction::Xor:
6968 // EON only for scalars (possibly expanded fixed vectors)
6969 // and vectors using the SVE2/SME BSL2N instruction.
6970 if (I->getType()->isVectorTy() && ST->isNeonAvailable()) {
6971 bool HasBSL2N =
6972 ST->isSVEorStreamingSVEAvailable() && (ST->hasSVE2() || ST->hasSME());
6973 if (!HasBSL2N)
6974 break;
6975 }
6976 [[fallthrough]];
6977 case Instruction::And:
6978 case Instruction::Or:
6979 // Even though we could use the SVE2/SME BSL2N instruction,
6980 // it might pessimize with an extra MOV depending on register allocation.
6981 if (I->getOpcode() == Instruction::Or &&
6982 isa<ScalableVectorType>(I->getType()))
6983 break;
6984 // A shift can be folded into a scalar AND/ORR/EOR,
6985 // but not into the non-negated operand of BIC/ORN/EON.
6986 if (!(I->getType()->isVectorTy() && ST->hasNEON()) &&
6988 break;
6989 for (auto &Op : I->operands()) {
6990 // (and/or/xor X, (not Y)) -> (bic/orn/eon X, Y)
6991 if (match(Op.get(), m_Not(m_Value()))) {
6992 Ops.push_back(&Op);
6993 return true;
6994 }
6995 // (and/or/xor X, (splat (not Y))) -> (bic/orn/eon X, (splat Y))
6996 if (match(Op.get(),
6997 m_Shuffle(m_InsertElt(m_Value(), m_Not(m_Value()), m_ZeroInt()),
6998 m_Value(), m_ZeroMask()))) {
6999 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
7000 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
7001 Ops.push_back(&Not);
7002 Ops.push_back(&InsertElt);
7003 Ops.push_back(&Op);
7004 return true;
7005 }
7006 }
7007 break;
7008 default:
7009 break;
7010 }
7011
7012 if (!I->getType()->isVectorTy())
7013 return !Ops.empty();
7014
7015 switch (I->getOpcode()) {
7016 case Instruction::Sub:
7017 case Instruction::Add: {
7018 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
7019 return false;
7020
7021 // If the exts' operands extract either the lower or upper elements, we
7022 // can sink them too.
7023 auto Ext1 = cast<Instruction>(I->getOperand(0));
7024 auto Ext2 = cast<Instruction>(I->getOperand(1));
7025 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
7026 Ops.push_back(&Ext1->getOperandUse(0));
7027 Ops.push_back(&Ext2->getOperandUse(0));
7028 }
7029
7030 Ops.push_back(&I->getOperandUse(0));
7031 Ops.push_back(&I->getOperandUse(1));
7032
7033 return true;
7034 }
7035 case Instruction::Or: {
7036 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
7037 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
7038 if (ST->hasNEON()) {
7039 Instruction *OtherAnd, *IA, *IB;
7040 Value *MaskValue;
7041 // MainAnd refers to the And instruction that has 'Not' as one of its operands
7042 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
7043 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
7044 m_Instruction(IA)))))) {
7045 if (match(OtherAnd,
7046 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
7047 Instruction *MainAnd = I->getOperand(0) == OtherAnd
7048 ? cast<Instruction>(I->getOperand(1))
7049 : cast<Instruction>(I->getOperand(0));
7050
7051 // Both Ands should be in the same basic block as the Or
7052 if (I->getParent() != MainAnd->getParent() ||
7053 I->getParent() != OtherAnd->getParent())
7054 return false;
7055
7056 // Non-mask operands of both Ands should also be in the same basic block
7057 if (I->getParent() != IA->getParent() ||
7058 I->getParent() != IB->getParent())
7059 return false;
7060
7061 Ops.push_back(
7062 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
7063 Ops.push_back(&I->getOperandUse(0));
7064 Ops.push_back(&I->getOperandUse(1));
7065
7066 return true;
7067 }
7068 }
7069 }
7070
7071 return false;
7072 }
7073 case Instruction::Mul: {
7074 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
7075 auto *Ty = cast<VectorType>(V->getType());
7076 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7077 if (Ty->isScalableTy())
7078 return false;
7079
7080 // Indexed variants of Mul exist for i16 and i32 element types only.
7081 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
7082 };
7083
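// Count sext/zext operands feeding the multiply; a matching pair can form
// smull/umull, which the profitability check below looks for.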
7084 int NumZExts = 0, NumSExts = 0;
7085 for (auto &Op : I->operands()) {
7086 // Make sure we are not already sinking this operand
7087 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7088 continue;
7089
7090 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
7091 auto *Ext = cast<Instruction>(Op);
7092 auto *ExtOp = Ext->getOperand(0);
7093 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
7094 Ops.push_back(&Ext->getOperandUse(0));
7095 Ops.push_back(&Op);
7096
7097 if (isa<SExtInst>(Ext)) {
7098 NumSExts++;
7099 } else {
7100 NumZExts++;
7101 // A zext(a) is also a sext(zext(a)) when widening by more than one doubling step (e.g. i8 -> i32), since the intermediate top bit is zero.
7102 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
7103 I->getType()->getScalarSizeInBits())
7104 NumSExts++;
7105 }
7106
7107 continue;
7108 }
7109
7111 if (!Shuffle)
7112 continue;
7113
7114 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
7115 // operand and the s/zext can help create indexed s/umull. This is
7116 // especially useful to prevent i64 mul being scalarized.
7117 if (isSplatShuffle(Shuffle) &&
7118 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
7119 Ops.push_back(&Shuffle->getOperandUse(0));
7120 Ops.push_back(&Op);
7121 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
7122 NumSExts++;
7123 else
7124 NumZExts++;
7125 continue;
7126 }
7127
7128 Value *ShuffleOperand = Shuffle->getOperand(0);
7129 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
7130 if (!Insert)
7131 continue;
7132
7133 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
7134 if (!OperandInstr)
7135 continue;
7136
7137 ConstantInt *ElementConstant =
7138 dyn_cast<ConstantInt>(Insert->getOperand(2));
7139 // Check that the insertelement is inserting into element 0
7140 if (!ElementConstant || !ElementConstant->isZero())
7141 continue;
7142
7143 unsigned Opcode = OperandInstr->getOpcode();
7144 if (Opcode == Instruction::SExt)
7145 NumSExts++;
7146 else if (Opcode == Instruction::ZExt)
7147 NumZExts++;
7148 else {
7149 // If we find that the top bits are known 0, then we can sink and allow
7150 // the backend to generate a umull.
7151 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
7152 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
7153 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
7154 continue;
7155 NumZExts++;
7156 }
7157
7158 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
7159 // the And, just to hoist it again back to the load.
7160 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
7161 Ops.push_back(&Insert->getOperandUse(1));
7162 Ops.push_back(&Shuffle->getOperandUse(0));
7163 Ops.push_back(&Op);
7164 }
7165
7166 // It is profitable to sink if we found two of the same type of extends.
7167 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
7168 return true;
7169
7170 // Otherwise, see if we should sink splats for indexed variants.
7171 if (!ShouldSinkSplatForIndexedVariant(I))
7172 return false;
7173
7174 Ops.clear();
7175 if (isSplatShuffle(I->getOperand(0)))
7176 Ops.push_back(&I->getOperandUse(0));
7177 if (isSplatShuffle(I->getOperand(1)))
7178 Ops.push_back(&I->getOperandUse(1));
7179
7180 return !Ops.empty();
7181 }
7182 case Instruction::FMul: {
7183 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7184 if (I->getType()->isScalableTy())
7185 return !Ops.empty();
7186
7187 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
7188 !ST->hasFullFP16())
7189 return !Ops.empty();
7190
7191 // Sink splats for index lane variants
7192 if (isSplatShuffle(I->getOperand(0)))
7193 Ops.push_back(&I->getOperandUse(0));
7194 if (isSplatShuffle(I->getOperand(1)))
7195 Ops.push_back(&I->getOperandUse(1));
7196 return !Ops.empty();
7197 }
7198 default:
7199 return false;
7200 }
7201 return false;
7202}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI, SmallVectorImpl< StringRef > &Features)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool isFNeg(Value *Op)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool preferTailFoldingOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
APInt getPriorityMask(const Function &F) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded into a target instruction.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation for a vector with ElementCount VF.
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has lower throughput than Opcode2, according to the scheduling model.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
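For orientation, a minimal sketch of how a client reaches these hooks through the generic TargetTransformInfo wrapper; the helper name and usage are illustrative assumptions, not part of this file:

// Hypothetical helper: reciprocal-throughput cost of a <4 x i32> add.
InstructionCost addCost(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
  Type *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  return TTI.getArithmeticInstrCost(Instruction::Add, VecTy,
                                    TargetTransformInfo::TCK_RecipThroughput);
}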
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
unsigned countLeadingOnes() const
Definition APInt.h:1647
void negate()
Negate this APInt in place.
Definition APInt.h:1491
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1084
unsigned logBase2() const
Definition APInt.h:1784
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to calling a function.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
bool isUnsigned() const
Definition InstrTypes.h:936
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign information.
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:219
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
LLVM_ABI Constant * getSplatValue(bool AllowPoison=false) const
If all elements of the vector constant have the same value, return that value.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:791
bool empty() const
Definition DenseMap.h:109
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noSignedZeros() const
Definition FMF.h:70
bool noInfs() const
Definition FMF.h:69
bool approxFunc() const
Definition FMF.h:73
bool allowContract() const
Definition FMF.h:72
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2627
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1149
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2615
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:599
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:619
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:604
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2010
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:534
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2324
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2539
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1759
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2242
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool' for the isVolatile parameter.
Definition IRBuilder.h:1918
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2649
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1931
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:614
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2315
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition IRBuilder.h:2858
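A minimal sketch of the IRBuilder calls listed above, assuming only an existing BasicBlock *BB to append into; the helper name is hypothetical:

void buildExample(BasicBlock *BB) {
  IRBuilder<> B(BB);                                    // append new instructions to BB
  Value *Splat = B.CreateVectorSplat(4, B.getInt64(1)); // <4 x i64>, every lane 1
  Value *Lane0 = B.CreateExtractElement(Splat, B.getInt64(0));
  Value *Narrow = B.CreateIntCast(Lane0, B.getInt32Ty(), /*isSigned=*/true);
  (void)Narrow;                                         // sketch only; result left unused
}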
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full range of operator support required for arithmetic and comparisons.
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator that supports these flags.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
bool isFixedLengthVector() const
MVT getVectorElementType() const
size_type size() const
Definition MapVector.h:58
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:895
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
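A minimal sketch of querying this trip-count machinery, assuming an existing ScalarEvolution &SE; the helper name and 16-iteration threshold are purely illustrative:

bool hasSmallConstantMaxTripCount(ScalarEvolution &SE, const Loop *L) {
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BTC))   // trip count is not predictable
    return false;
  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L); // 0 means unknown
  return MaxTC != 0 && MaxTC <= 16;
}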
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor, like: <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>.
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
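StackOffset keeps the two components separate so SVE frame offsets stay exact; a minimal sketch (the combined offset below relies on StackOffset's arithmetic operators):

StackOffset Fixed    = StackOffset::getFixed(16);    // 16 bytes
StackOffset Scalable = StackOffset::getScalable(32); // 32 * vscale bytes
StackOffset Both     = Fixed + Scalable;             // 16 + 32 * vscale bytes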
An instruction for storing to memory.
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:730
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInfo-derived member variable.
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of an instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags FMF.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:290
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type, whose element type is an integer of the same width as the input element type.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the same way as for ordinary integers.
Definition TypeSize.h:252
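A minimal sketch of these polynomial quantities; a scalable count scales with the runtime vscale, so only its minimum is known statically:

ElementCount FixedEC = ElementCount::getFixed(4);    // exactly 4 lanes
ElementCount ScalEC  = ElementCount::getScalable(4); // vscale x 4 lanes
bool S            = ScalEC.isScalable();             // true
unsigned MinVal   = ScalEC.getKnownMinValue();       // 4
ElementCount Half = ScalEC.divideCoefficientBy(2);   // vscale x 2 lanes
TypeSize SVEReg   = TypeSize::getScalable(128);      // one SVE block: vscale x 128 bits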
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the given register size.
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to synthesize the immediate.
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignores it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
brc_match< Cond_t, match_bind< BasicBlock >, match_bind< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
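A minimal sketch of composing the matchers above, in the style this file uses; the recognizer below (a widening add, i.e. zext(A) + zext(B) in either operand order) is hypothetical:

using namespace llvm::PatternMatch;

bool isWideningAdd(Value *V) {   // illustrative, not from this file
  Value *A, *B;
  return match(V, m_c_Add(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))));
}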
LLVM_ABI Libcall getPOW(EVT RetVT)
getPOW - Return the POW_* value for the given types, or UNKNOWN_LIBCALL if there is none.
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
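A minimal sketch of how such a table is consulted inside a cost hook; the table name and contents are invented for illustration:

static const CostTblEntry ExampleTbl[] = {
    {ISD::ADD, MVT::v4i32, 1},
    {ISD::MUL, MVT::v4i32, 4},
};
if (const auto *Entry = CostTableLookup(ExampleTbl, ISD::MUL, MVT::v4i32))
  return Entry->Cost; // 4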
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like all_of.
Definition STLExtras.h:2172
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned element.
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
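A minimal sketch of the bit-math helpers listed here (NextPowerOf2 and isInt appear elsewhere in this index):

bool P2    = isPowerOf2_64(4096);  // true
unsigned L = Log2_32(4096);        // 12
uint64_t N = NextPowerOf2(4096);   // 8192: strictly greater than the input
bool Fits  = isInt<12>(2047);      // true: 2047 fits a signed 12-bit field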
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
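A minimal sketch of classifying a shuffle mask with these helpers (assuming `using namespace llvm`); the mask is the v8i16 zip1 pattern quoted above:

int Mask[] = {0, 8, 1, 9, 2, 10, 3, 11};
unsigned WhichResult, OperandOrder;
bool IsZip = isZIPMask(Mask, /*NumElts=*/8, WhichResult, OperandOrder);
// IsZip == true, WhichResult == 0 (zip1 rather than zip2)
bool IsRev = isREVMask(Mask, /*EltSize=*/16, /*NumElts=*/8, /*BlockSize=*/64);
// IsRev == false: this mask interleaves, it does not reverse within blocks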
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are integer type.
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2165
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:292
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:367
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
bool isFixedLengthVector() const
Definition ValueTypes.h:189
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:182
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of loop iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...