1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
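// For example (a worked sketch of the scheme above), setting the option to
// "default+noreductions" leaves NeedsDefault true and records Reductions in
// DisableBits, so getBits() returns the CPU's default mask with the
// Reductions flag cleared.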
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= to an empty value, treat it as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
215
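// For example (an illustrative invocation, not the only accepted form),
// "-mllvm -sve-tail-folding=all+noreverse" requests tail-folding for all
// legal loop types except those that would need reversed predicates.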
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
249 return true;
250 }
251 }
252 return false;
253}
254
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.split(Features, ",");
261}
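// For example, calling extractAttrFeatures on a function carrying the
// attribute "target-features"="+sve,+sve2" fills Features with
// {"+sve", "+sve2"}.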
262
265 extractAttrFeatures(F, this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
271 extractAttrFeatures(F, this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
276 return F.hasFnAttribute("fmv-features");
277}
278
279const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
280 AArch64::FeatureExecuteOnly,
281};
282
284 const Function *Callee) const {
285 SMECallAttrs CallAttrs(*Caller, *Callee);
286
287 // Never inline a function explicitly marked as being streaming,
288 // into a non-streaming function. Assume it was marked as streaming
289 // for a reason.
290 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
292 return false;
293
294 // When inlining, we should consider the body of the function, not the
295 // interface.
296 if (CallAttrs.callee().hasStreamingBody()) {
297 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
298 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
299 }
300
301 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
302 return false;
303
304 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
305 CallAttrs.requiresPreservingZT0() ||
306 CallAttrs.requiresPreservingAllZAState()) {
307 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
308 return false;
309 }
310
311 const TargetMachine &TM = getTLI()->getTargetMachine();
312 const FeatureBitset &CallerBits =
313 TM.getSubtargetImpl(*Caller)->getFeatureBits();
314 const FeatureBitset &CalleeBits =
315 TM.getSubtargetImpl(*Callee)->getFeatureBits();
316 // Adjust the feature bitsets by inverting some of the bits. This is needed
317 // for target features that represent restrictions rather than capabilities,
318 // for example a "+execute-only" callee can be inlined into a caller without
319 // "+execute-only", but not vice versa.
320 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
321 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
322
323 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
324}
325
327 const Function *Callee,
328 ArrayRef<Type *> Types) const {
329 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
330 return false;
331
332 // We need to ensure that argument promotion does not attempt to promote
333 // pointers to fixed-length vector types larger than 128 bits like
334 // <8 x float> (and pointers to aggregate types which have such fixed-length
335 // vector type members) into the values of the pointees. Such vector types
336 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
337 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
338 // types can be safely treated as 128-bit NEON types and they cannot be
339 // distinguished in IR.
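// For example, a pointer argument to <8 x float> (256 bits) must not be
// promoted to a by-value <8 x float> here, which is what the check below
// rejects.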
340 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
341 auto FVTy = dyn_cast<FixedVectorType>(Ty);
342 return FVTy &&
343 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
344 }))
345 return false;
346
347 return true;
348}
349
350unsigned
352 unsigned DefaultCallPenalty) const {
353 // This function calculates a penalty for executing Call in F.
354 //
355 // There are two ways this function can be called:
356 // (1) F:
357 // call from F -> G (the call here is Call)
358 //
359 // For (1), Call.getCaller() == F, so it will always return a high cost if
360 // a streaming-mode change is required (thus promoting the need to inline the
361 // function)
362 //
363 // (2) F:
364 // call from F -> G (the call here is not Call)
365 // G:
366 // call from G -> H (the call here is Call)
367 //
368 // For (2), if after inlining the body of G into F the call to H requires a
369 // streaming-mode change, and the call to G from F would also require a
370 // streaming-mode change, then it is beneficial to perform the streaming-mode
371 // change only once and avoid inlining G into F.
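//
// As a rough worked example, with the default -call-penalty-sm-change=5 and
// -inline-call-penalty-sm-change=10, case (1) returns 5 * DefaultCallPenalty
// and case (2) returns 10 * DefaultCallPenalty.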
372
373 SMEAttrs FAttrs(*F);
374 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
375
376 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
377 if (F == Call.getCaller()) // (1)
378 return CallPenaltyChangeSM * DefaultCallPenalty;
379 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
380 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
381 }
382
383 return DefaultCallPenalty;
384}
385
389
390 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
391 return true;
392
394 ST->isSVEorStreamingSVEAvailable() &&
395 !ST->disableMaximizeScalableBandwidth();
396}
397
398/// Calculate the cost of materializing a 64-bit value. This helper
399/// method might only calculate a fraction of a larger immediate. Therefore it
400/// is valid to return a cost of ZERO.
402 // Check if the immediate can be encoded within an instruction.
403 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
404 return 0;
405
406 if (Val < 0)
407 Val = ~Val;
408
409 // Calculate how many moves we will need to materialize this constant.
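 // For example, 0x1234 expands to a single MOVZ (cost 1), while a constant
 // such as 0x0001000200030004 typically needs one MOVZ plus three MOVKs
 // (cost 4); logical immediates like 0x00FF00FF00FF00FF were already handled
 // above and cost 0.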
411 AArch64_IMM::expandMOVImm(Val, 64, Insn);
412 return Insn.size();
413}
414
415/// Calculate the cost of materializing the given constant.
419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 if (BitSize == 0)
423 return ~0U;
424
425 // Sign-extend all constants to a multiple of 64-bit.
426 APInt ImmVal = Imm;
427 if (BitSize & 0x3f)
428 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
429
430 // Split the constant into 64-bit chunks and calculate the cost for each
431 // chunk.
433 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
434 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
435 int64_t Val = Tmp.getSExtValue();
436 Cost += getIntImmCost(Val);
437 }
438 // We need at least one instruction to materialize the constant.
439 return std::max<InstructionCost>(1, Cost);
440}
441
443 const APInt &Imm, Type *Ty,
445 Instruction *Inst) const {
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 unsigned ImmIdx = ~0U;
455 switch (Opcode) {
456 default:
457 return TTI::TCC_Free;
458 case Instruction::GetElementPtr:
459 // Always hoist the base address of a GetElementPtr.
460 if (Idx == 0)
461 return 2 * TTI::TCC_Basic;
462 return TTI::TCC_Free;
463 case Instruction::Store:
464 ImmIdx = 0;
465 break;
466 case Instruction::Add:
467 case Instruction::Sub:
468 case Instruction::Mul:
469 case Instruction::UDiv:
470 case Instruction::SDiv:
471 case Instruction::URem:
472 case Instruction::SRem:
473 case Instruction::And:
474 case Instruction::Or:
475 case Instruction::Xor:
476 case Instruction::ICmp:
477 ImmIdx = 1;
478 break;
479 // Always return TCC_Free for the shift value of a shift instruction.
480 case Instruction::Shl:
481 case Instruction::LShr:
482 case Instruction::AShr:
483 if (Idx == 1)
484 return TTI::TCC_Free;
485 break;
486 case Instruction::Trunc:
487 case Instruction::ZExt:
488 case Instruction::SExt:
489 case Instruction::IntToPtr:
490 case Instruction::PtrToInt:
491 case Instruction::BitCast:
492 case Instruction::PHI:
493 case Instruction::Call:
494 case Instruction::Select:
495 case Instruction::Ret:
496 case Instruction::Load:
497 break;
498 }
499
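 // For example, for "add i64 %x, 4660" the immediate (0x1234) materialises in
 // one MOV and is reported as TCC_Free below, while something like
 // 0x123456789ABCDEF0 needs four MOV/MOVKs and is returned at full cost so
 // constant hoisting can consider sharing it.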
500 if (Idx == ImmIdx) {
501 int NumConstants = (BitSize + 63) / 64;
503 return (Cost <= NumConstants * TTI::TCC_Basic)
504 ? static_cast<int>(TTI::TCC_Free)
505 : Cost;
506 }
508}
509
512 const APInt &Imm, Type *Ty,
514 assert(Ty->isIntegerTy());
515
516 unsigned BitSize = Ty->getPrimitiveSizeInBits();
517 // There is no cost model for constants with a bit size of 0. Return TCC_Free
518 // here, so that constant hoisting will ignore this constant.
519 if (BitSize == 0)
520 return TTI::TCC_Free;
521
522 // Most (all?) AArch64 intrinsics do not support folding immediates into the
523 // selected instruction, so we compute the materialization cost for the
524 // immediate directly.
525 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
527
528 switch (IID) {
529 default:
530 return TTI::TCC_Free;
531 case Intrinsic::sadd_with_overflow:
532 case Intrinsic::uadd_with_overflow:
533 case Intrinsic::ssub_with_overflow:
534 case Intrinsic::usub_with_overflow:
535 case Intrinsic::smul_with_overflow:
536 case Intrinsic::umul_with_overflow:
537 if (Idx == 1) {
538 int NumConstants = (BitSize + 63) / 64;
540 return (Cost <= NumConstants * TTI::TCC_Basic)
541 ? static_cast<int>(TTI::TCC_Free)
542 : Cost;
543 }
544 break;
545 case Intrinsic::experimental_stackmap:
546 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
547 return TTI::TCC_Free;
548 break;
549 case Intrinsic::experimental_patchpoint_void:
550 case Intrinsic::experimental_patchpoint:
551 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
552 return TTI::TCC_Free;
553 break;
554 case Intrinsic::experimental_gc_statepoint:
555 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
556 return TTI::TCC_Free;
557 break;
558 }
560}
561
563AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
564 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
565 if (TyWidth == 32 || TyWidth == 64)
567 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128-bit popcount.
568 return TTI::PSK_Software;
569}
570
571static bool isUnpackedVectorVT(EVT VecVT) {
572 return VecVT.isScalableVector() &&
574}
575
577 const IntrinsicCostAttributes &ICA) {
578 // We need to know at least the number of elements in the vector of buckets
579 // and the size of each element to update.
580 if (ICA.getArgTypes().size() < 2)
582
583 // Only interested in costing for the hardware instruction from SVE2.
584 if (!ST->hasSVE2())
586
587 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
588 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
589 unsigned TotalHistCnts = 1;
590
591 unsigned EltSize = EltTy->getScalarSizeInBits();
592 // Only allow (up to 64b) integers or pointers
593 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
595
596 // FIXME: We should be able to generate histcnt for fixed-length vectors
597 // using ptrue with a specific VL.
598 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
599 unsigned EC = VTy->getElementCount().getKnownMinValue();
600 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
602
603 // HistCnt only supports 32b and 64b element types
604 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
605
606 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
608
609 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
610 TotalHistCnts = EC / NaturalVectorWidth;
611
612 return InstructionCost(BaseHistCntCost * TotalHistCnts);
613 }
614
616}
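// For example, on an SVE2 target a histogram over <vscale x 8 x ptr> buckets
// updating i32 counters needs two HISTCNT operations (four 32-bit lanes per
// 128-bit block), so the cost above is roughly 2 * aarch64-base-histcnt-cost.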
617
621 // The code-generator is currently not able to handle scalable vectors
622 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
623 // it. This change will be removed when code-generation for these types is
624 // sufficiently reliable.
625 auto *RetTy = ICA.getReturnType();
626 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
627 if (VTy->getElementCount() == ElementCount::getScalable(1))
629
630 switch (ICA.getID()) {
631 case Intrinsic::experimental_vector_histogram_add: {
632 InstructionCost HistCost = getHistogramCost(ST, ICA);
633 // If the cost isn't valid, we may still be able to scalarize
634 if (HistCost.isValid())
635 return HistCost;
636 break;
637 }
638 case Intrinsic::umin:
639 case Intrinsic::umax:
640 case Intrinsic::smin:
641 case Intrinsic::smax: {
642 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
643 MVT::v8i16, MVT::v2i32, MVT::v4i32,
644 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
645 MVT::nxv2i64};
646 auto LT = getTypeLegalizationCost(RetTy);
647 // v2i64 types get converted to cmp+bif hence the cost of 2
648 if (LT.second == MVT::v2i64)
649 return LT.first * 2;
650 if (any_of(ValidMinMaxTys, equal_to(LT.second)))
651 return LT.first;
652 break;
653 }
654 case Intrinsic::sadd_sat:
655 case Intrinsic::ssub_sat:
656 case Intrinsic::uadd_sat:
657 case Intrinsic::usub_sat: {
658 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
659 MVT::v8i16, MVT::v2i32, MVT::v4i32,
660 MVT::v2i64};
661 auto LT = getTypeLegalizationCost(RetTy);
662 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
663 // need to extend the type, as it uses shr(qadd(shl, shl)).
664 unsigned Instrs =
665 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
666 if (any_of(ValidSatTys, equal_to(LT.second)))
667 return LT.first * Instrs;
668
670 uint64_t VectorSize = TS.getKnownMinValue();
671
672 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
673 return LT.first * Instrs;
674
675 break;
676 }
677 case Intrinsic::abs: {
678 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
679 MVT::v8i16, MVT::v2i32, MVT::v4i32,
680 MVT::v2i64};
681 auto LT = getTypeLegalizationCost(RetTy);
682 if (any_of(ValidAbsTys, equal_to(LT.second)))
683 return LT.first;
684 break;
685 }
686 case Intrinsic::bswap: {
687 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
688 MVT::v4i32, MVT::v2i64};
689 auto LT = getTypeLegalizationCost(RetTy);
690 if (any_of(ValidAbsTys, equal_to(LT.second)) &&
691 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
692 return LT.first;
693 break;
694 }
695 case Intrinsic::fma:
696 case Intrinsic::fmuladd: {
697 // Cost an fma or fmuladd the same as an fmul instruction, as their costs
698 // are usually the same. TODO: Add fp16 and bf16 expansion costs.
699 Type *EltTy = RetTy->getScalarType();
700 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
701 (EltTy->isHalfTy() && ST->hasFullFP16()))
702 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
703 break;
704 }
705 case Intrinsic::stepvector: {
706 InstructionCost Cost = 1; // Cost of the `index' instruction
707 auto LT = getTypeLegalizationCost(RetTy);
708 // Legalisation of illegal vectors involves an `index' instruction plus
709 // (LT.first - 1) vector adds.
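 // For example, an illegal <vscale x 8 x i64> step vector legalises to four
 // <vscale x 2 x i64> parts, costing one INDEX plus three vector ADDs.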
710 if (LT.first > 1) {
711 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
712 InstructionCost AddCost =
713 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
714 Cost += AddCost * (LT.first - 1);
715 }
716 return Cost;
717 }
718 case Intrinsic::vector_extract:
719 case Intrinsic::vector_insert: {
720 // If both the vector and subvector types are legal types and the index
721 // is 0, then this should be a no-op or simple operation; return a
722 // relatively low cost.
723
724 // If arguments aren't actually supplied, then we cannot determine the
725 // value of the index. We also want to skip predicate types.
726 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
728 break;
729
730 LLVMContext &C = RetTy->getContext();
731 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
732 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
733 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
734 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
735 // Skip this if either the vector or subvector types are unpacked
736 // SVE types; they may get lowered to stack stores and loads.
737 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
738 break;
739
741 getTLI()->getTypeConversion(C, SubVecVT);
743 getTLI()->getTypeConversion(C, VecVT);
744 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
745 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
746 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
747 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
748 return TTI::TCC_Free;
749 break;
750 }
751 case Intrinsic::bitreverse: {
752 static const CostTblEntry BitreverseTbl[] = {
753 {Intrinsic::bitreverse, MVT::i32, 1},
754 {Intrinsic::bitreverse, MVT::i64, 1},
755 {Intrinsic::bitreverse, MVT::v8i8, 1},
756 {Intrinsic::bitreverse, MVT::v16i8, 1},
757 {Intrinsic::bitreverse, MVT::v4i16, 2},
758 {Intrinsic::bitreverse, MVT::v8i16, 2},
759 {Intrinsic::bitreverse, MVT::v2i32, 2},
760 {Intrinsic::bitreverse, MVT::v4i32, 2},
761 {Intrinsic::bitreverse, MVT::v1i64, 2},
762 {Intrinsic::bitreverse, MVT::v2i64, 2},
763 };
764 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
765 const auto *Entry =
766 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
767 if (Entry) {
768 // The cost model uses the legal type (i32) that i8 and i16 will be
769 // converted to, plus 1 so that we match the actual lowering cost.
770 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
771 TLI->getValueType(DL, RetTy, true) == MVT::i16)
772 return LegalisationCost.first * Entry->Cost + 1;
773
774 return LegalisationCost.first * Entry->Cost;
775 }
776 break;
777 }
778 case Intrinsic::ctpop: {
779 if (!ST->hasNEON()) {
780 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
781 return getTypeLegalizationCost(RetTy).first * 12;
782 }
783 static const CostTblEntry CtpopCostTbl[] = {
784 {ISD::CTPOP, MVT::v2i64, 4},
785 {ISD::CTPOP, MVT::v4i32, 3},
786 {ISD::CTPOP, MVT::v8i16, 2},
787 {ISD::CTPOP, MVT::v16i8, 1},
788 {ISD::CTPOP, MVT::i64, 4},
789 {ISD::CTPOP, MVT::v2i32, 3},
790 {ISD::CTPOP, MVT::v4i16, 2},
791 {ISD::CTPOP, MVT::v8i8, 1},
792 {ISD::CTPOP, MVT::i32, 5},
793 };
794 auto LT = getTypeLegalizationCost(RetTy);
795 MVT MTy = LT.second;
796 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
797 // Extra cost of +1 when illegal vector types are legalized by promoting
798 // the integer type.
799 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
800 RetTy->getScalarSizeInBits()
801 ? 1
802 : 0;
803 return LT.first * Entry->Cost + ExtraCost;
804 }
805 break;
806 }
807 case Intrinsic::sadd_with_overflow:
808 case Intrinsic::uadd_with_overflow:
809 case Intrinsic::ssub_with_overflow:
810 case Intrinsic::usub_with_overflow:
811 case Intrinsic::smul_with_overflow:
812 case Intrinsic::umul_with_overflow: {
813 static const CostTblEntry WithOverflowCostTbl[] = {
814 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
815 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
816 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
817 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
818 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
819 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
820 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
821 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
822 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
823 {Intrinsic::usub_with_overflow, MVT::i8, 3},
824 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
825 {Intrinsic::usub_with_overflow, MVT::i16, 3},
826 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
827 {Intrinsic::usub_with_overflow, MVT::i32, 1},
828 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
829 {Intrinsic::usub_with_overflow, MVT::i64, 1},
830 {Intrinsic::smul_with_overflow, MVT::i8, 5},
831 {Intrinsic::umul_with_overflow, MVT::i8, 4},
832 {Intrinsic::smul_with_overflow, MVT::i16, 5},
833 {Intrinsic::umul_with_overflow, MVT::i16, 4},
834 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
835 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
836 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
837 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
838 };
839 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
840 if (MTy.isSimple())
841 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
842 MTy.getSimpleVT()))
843 return Entry->Cost;
844 break;
845 }
846 case Intrinsic::fptosi_sat:
847 case Intrinsic::fptoui_sat: {
848 if (ICA.getArgTypes().empty())
849 break;
850 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
851 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
852 EVT MTy = TLI->getValueType(DL, RetTy);
853 // Check for the legal types, where the input and output sizes are the same,
854 // or where we are using cvt f64->i32 or f32->i64.
855 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
856 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
857 LT.second == MVT::v2f64)) {
858 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
859 (LT.second == MVT::f64 && MTy == MVT::i32) ||
860 (LT.second == MVT::f32 && MTy == MVT::i64)))
861 return LT.first;
862 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
863 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
864 MTy.getScalarSizeInBits() == 64)
865 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
866 }
867 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
868 // f32.
869 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
870 return LT.first + getIntrinsicInstrCost(
871 {ICA.getID(),
872 RetTy,
873 {ICA.getArgTypes()[0]->getWithNewType(
874 Type::getFloatTy(RetTy->getContext()))}},
875 CostKind);
876 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
877 (LT.second == MVT::f16 && MTy == MVT::i64) ||
878 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
879 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
880 return LT.first;
881 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
882 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
883 MTy.getScalarSizeInBits() == 32)
884 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
885 // Extending vector types v8f16->v8i32. These currently scalarize but the
886 // codegen could be better.
887 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
888 MTy.getScalarSizeInBits() == 64)
889 return MTy.getVectorNumElements() * 3;
890
891 // If we can, use a legal convert followed by a min+max.
892 if ((LT.second.getScalarType() == MVT::f32 ||
893 LT.second.getScalarType() == MVT::f64 ||
894 LT.second.getScalarType() == MVT::f16) &&
895 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
896 Type *LegalTy =
897 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
898 if (LT.second.isVector())
899 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
901 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
902 LegalTy, {LegalTy, LegalTy});
904 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
905 LegalTy, {LegalTy, LegalTy});
907 return LT.first * Cost +
908 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
909 : 1);
910 }
911 // Otherwise we need to follow the default expansion that clamps the value
912 // using a float min/max with an fcmp+sel for NaN handling when signed.
913 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
914 RetTy = RetTy->getScalarType();
915 if (LT.second.isVector()) {
916 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
917 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
918 }
919 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
921 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
923 Cost +=
924 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
926 if (IsSigned) {
927 Type *CondTy = RetTy->getWithNewBitWidth(1);
928 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
930 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
932 }
933 return LT.first * Cost;
934 }
935 case Intrinsic::fshl:
936 case Intrinsic::fshr: {
937 if (ICA.getArgs().empty())
938 break;
939
940 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
941
942 // ROTR / ROTL is a funnel shift with equal first and second operand. For
943 // ROTR on integer registers (i32/i64) this can be done in a single ror
944 // instruction. A fshl with a non-constant shift uses a neg + ror.
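 // For example, fshl(i64 %x, i64 %x, i64 13) is a rotate by a constant and is
 // costed as a single ROR, while a variable rotate amount adds one NEG.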
945 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
946 (RetTy->getPrimitiveSizeInBits() == 32 ||
947 RetTy->getPrimitiveSizeInBits() == 64)) {
948 InstructionCost NegCost =
949 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
950 return 1 + NegCost;
951 }
952
953 // TODO: Add handling for fshl where third argument is not a constant.
954 if (!OpInfoZ.isConstant())
955 break;
956
957 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
958 if (OpInfoZ.isUniform()) {
959 static const CostTblEntry FshlTbl[] = {
960 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
961 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
962 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
963 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
964 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
965 // to avoid having to duplicate the costs.
966 const auto *Entry =
967 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
968 if (Entry)
969 return LegalisationCost.first * Entry->Cost;
970 }
971
972 auto TyL = getTypeLegalizationCost(RetTy);
973 if (!RetTy->isIntegerTy())
974 break;
975
976 // Estimate cost manually, as types like i8 and i16 will get promoted to
977 // i32 and CostTableLookup will ignore the extra conversion cost.
978 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
979 RetTy->getScalarSizeInBits() < 64) ||
980 (RetTy->getScalarSizeInBits() % 64 != 0);
981 unsigned ExtraCost = HigherCost ? 1 : 0;
982 if (RetTy->getScalarSizeInBits() == 32 ||
983 RetTy->getScalarSizeInBits() == 64)
984 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
985 // extr instruction.
986 else if (HigherCost)
987 ExtraCost = 1;
988 else
989 break;
990 return TyL.first + ExtraCost;
991 }
992 case Intrinsic::get_active_lane_mask: {
993 auto RetTy = cast<VectorType>(ICA.getReturnType());
994 EVT RetVT = getTLI()->getValueType(DL, RetTy);
995 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
996 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
997 break;
998
999 if (RetTy->isScalableTy()) {
1000 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
1002 break;
1003
1004 auto LT = getTypeLegalizationCost(RetTy);
1005 InstructionCost Cost = LT.first;
1006 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1007 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1008 // nxv32i1 = get_active_lane_mask(base, idx) ->
1009 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1010 if (ST->hasSVE2p1() || ST->hasSME2()) {
1011 Cost /= 2;
1012 if (Cost == 1)
1013 return Cost;
1014 }
1015
1016 // If more than one whilelo intrinsic is required, include the extra cost
1017 // of the saturating add and select needed to increment the start value
1018 // after the first intrinsic call.
1019 Type *OpTy = ICA.getArgTypes()[0];
1020 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1021 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1022 Type *CondTy = OpTy->getWithNewBitWidth(1);
1023 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1025 return Cost + (SplitCost * (Cost - 1));
1026 } else if (!getTLI()->isTypeLegal(RetVT)) {
1027 // We don't have enough context at this point to determine if the mask
1028 // is going to be kept live after the block, which will force the vXi1
1029 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1030 // For now, we just assume the vectorizer created this intrinsic and
1031 // the result will be the input for a PHI. In this case the cost will
1032 // be extremely high for fixed-width vectors.
1033 // NOTE: getScalarizationOverhead returns a cost that's far too
1034 // pessimistic for the actual generated codegen. In reality there are
1035 // two instructions generated per lane.
1036 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1037 }
1038 break;
1039 }
1040 case Intrinsic::experimental_vector_match: {
1041 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1042 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1043 unsigned SearchSize = NeedleTy->getNumElements();
1044 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1045 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1046 // Neoverse V3, these are cheap operations with the same latency as a
1047 // vector ADD. In most cases, however, we also need to do an extra DUP.
1048 // For fixed-length vectors we currently need an extra five to six
1049 // instructions besides the MATCH.
1051 if (isa<FixedVectorType>(RetTy))
1052 Cost += 10;
1053 return Cost;
1054 }
1055 break;
1056 }
1057 case Intrinsic::experimental_cttz_elts: {
1058 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1059 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1060 // This will consist of an SVE brkb and a cntp instruction. These
1061 // typically have the same latency and half the throughput as a vector
1062 // add instruction.
1063 return 4;
1064 }
1065 break;
1066 }
1067 case Intrinsic::loop_dependence_raw_mask:
1068 case Intrinsic::loop_dependence_war_mask: {
1069 // The whilewr/rw instructions require SVE2 or SME.
1070 if (ST->hasSVE2() || ST->hasSME()) {
1071 EVT VecVT = getTLI()->getValueType(DL, RetTy);
1072 unsigned EltSizeInBytes =
1073 cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
1074 if (is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) &&
1075 VecVT.getVectorMinNumElements() == (16 / EltSizeInBytes))
1076 return 1;
1077 }
1078 break;
1079 }
1080 case Intrinsic::experimental_vector_extract_last_active:
1081 if (ST->isSVEorStreamingSVEAvailable()) {
1082 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1083 // This should turn into chained clastb instructions.
1084 return LegalCost;
1085 }
1086 break;
1087 default:
1088 break;
1089 }
1091}
1092
1093 /// The function removes redundant reinterpret casts in the presence of
1094 /// control flow.
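/// For example (a sketch of the IR this targets), a phi whose incoming values
/// are all aarch64.sve.convert.to.svbool casts of <vscale x 4 x i1> values,
/// and whose only use is an aarch64.sve.convert.from.svbool back to
/// <vscale x 4 x i1>, is rewritten into a phi over the original
/// <vscale x 4 x i1> values, making both conversions redundant.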
1095static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1096 IntrinsicInst &II) {
1098 auto RequiredType = II.getType();
1099
1100 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1101 assert(PN && "Expected Phi Node!");
1102
1103 // Don't create a new Phi unless we can remove the old one.
1104 if (!PN->hasOneUse())
1105 return std::nullopt;
1106
1107 for (Value *IncValPhi : PN->incoming_values()) {
1108 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1109 if (!Reinterpret ||
1110 Reinterpret->getIntrinsicID() !=
1111 Intrinsic::aarch64_sve_convert_to_svbool ||
1112 RequiredType != Reinterpret->getArgOperand(0)->getType())
1113 return std::nullopt;
1114 }
1115
1116 // Create the new Phi
1117 IC.Builder.SetInsertPoint(PN);
1118 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1119 Worklist.push_back(PN);
1120
1121 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1122 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1123 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1124 Worklist.push_back(Reinterpret);
1125 }
1126
1127 // Cleanup Phi Node and reinterprets
1128 return IC.replaceInstUsesWith(II, NPN);
1129}
1130
1131// A collection of properties common to SVE intrinsics that allow for combines
1132// to be written without needing to know the specific intrinsic.
1134 //
1135 // Helper routines for common intrinsic definitions.
1136 //
1137
1138 // e.g. llvm.aarch64.sve.add pg, op1, op2
1139 // with IID ==> llvm.aarch64.sve.add_u
1140 static SVEIntrinsicInfo
1147
1148 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1155
1156 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1162
1163 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1169
1170 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1171 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1172 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1173 return SVEIntrinsicInfo()
1176 }
1177
1178 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1179 // llvm.aarch64.sve.ld1 pg, ptr
1186
1187 // All properties relate to predication and thus having a general predicate
1188 // is the minimum requirement to say there is intrinsic info to act on.
1189 explicit operator bool() const { return hasGoverningPredicate(); }
1190
1191 //
1192 // Properties relating to the governing predicate.
1193 //
1194
1196 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1197 }
1198
1200 assert(hasGoverningPredicate() && "Property not set!");
1201 return GoverningPredicateIdx;
1202 }
1203
1205 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1206 GoverningPredicateIdx = Index;
1207 return *this;
1208 }
1209
1210 //
1211 // Properties relating to operations the intrinsic could be transformed into.
1212 // NOTE: This does not mean such a transformation is always possible, but the
1213 // knowledge makes it possible to reuse existing optimisations without needing
1214 // to embed specific handling for each intrinsic. For example, instruction
1215 // simplification can be used to optimise an intrinsic's active lanes.
1216 //
1217
1219 return UndefIntrinsic != Intrinsic::not_intrinsic;
1220 }
1221
1223 assert(hasMatchingUndefIntrinsic() && "Property not set!");
1224 return UndefIntrinsic;
1225 }
1226
1228 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1229 UndefIntrinsic = IID;
1230 return *this;
1231 }
1232
1233 bool hasMatchingIROpode() const { return IROpcode != 0; }
1234
1235 unsigned getMatchingIROpode() const {
1236 assert(hasMatchingIROpode() && "Property not set!");
1237 return IROpcode;
1238 }
1239
1241 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1242 IROpcode = Opcode;
1243 return *this;
1244 }
1245
1246 //
1247 // Properties relating to the result of inactive lanes.
1248 //
1249
1251 return ResultLanes == InactiveLanesTakenFromOperand;
1252 }
1253
1255 assert(inactiveLanesTakenFromOperand() && "Property not set!");
1256 return OperandIdxForInactiveLanes;
1257 }
1258
1260 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1261 ResultLanes = InactiveLanesTakenFromOperand;
1262 OperandIdxForInactiveLanes = Index;
1263 return *this;
1264 }
1265
1267 return ResultLanes == InactiveLanesAreNotDefined;
1268 }
1269
1271 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1272 ResultLanes = InactiveLanesAreNotDefined;
1273 return *this;
1274 }
1275
1277 return ResultLanes == InactiveLanesAreUnused;
1278 }
1279
1281 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1282 ResultLanes = InactiveLanesAreUnused;
1283 return *this;
1284 }
1285
1286 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1287 // inactiveLanesAreZeroed =
1288 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1289 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1290
1292 ResultIsZeroInitialized = true;
1293 return *this;
1294 }
1295
1296 //
1297 // The first operand of unary merging operations is typically only used to
1298 // set the result for inactive lanes. Knowing this allows us to deadcode the
1299 // operand when we can prove there are no inactive lanes.
1300 //
1301
1303 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1304 }
1305
1307 assert(hasOperandWithNoActiveLanes() && "Property not set!");
1308 return OperandIdxWithNoActiveLanes;
1309 }
1310
1312 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1313 OperandIdxWithNoActiveLanes = Index;
1314 return *this;
1315 }
1316
1317private:
1318 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1319
1320 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1321 unsigned IROpcode = 0;
1322
1323 enum PredicationStyle {
1325 InactiveLanesTakenFromOperand,
1326 InactiveLanesAreNotDefined,
1327 InactiveLanesAreUnused
1328 } ResultLanes = Uninitialized;
1329
1330 bool ResultIsZeroInitialized = false;
1331 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1332 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1333};
1334
1336 // Some SVE intrinsics do not use scalable vector types, but since they are
1337 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1338 if (!isa<ScalableVectorType>(II.getType()) &&
1339 all_of(II.args(), [&](const Value *V) {
1340 return !isa<ScalableVectorType>(V->getType());
1341 }))
1342 return SVEIntrinsicInfo();
1343
1344 Intrinsic::ID IID = II.getIntrinsicID();
1345 switch (IID) {
1346 default:
1347 break;
1348 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1349 case Intrinsic::aarch64_sve_fcvt_f16f32:
1350 case Intrinsic::aarch64_sve_fcvt_f16f64:
1351 case Intrinsic::aarch64_sve_fcvt_f32f16:
1352 case Intrinsic::aarch64_sve_fcvt_f32f64:
1353 case Intrinsic::aarch64_sve_fcvt_f64f16:
1354 case Intrinsic::aarch64_sve_fcvt_f64f32:
1355 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1356 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1357 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1358 case Intrinsic::aarch64_sve_fcvtzs:
1359 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1360 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1361 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1362 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1363 case Intrinsic::aarch64_sve_fcvtzu:
1364 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1365 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1366 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1367 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1368 case Intrinsic::aarch64_sve_scvtf:
1369 case Intrinsic::aarch64_sve_scvtf_f16i32:
1370 case Intrinsic::aarch64_sve_scvtf_f16i64:
1371 case Intrinsic::aarch64_sve_scvtf_f32i64:
1372 case Intrinsic::aarch64_sve_scvtf_f64i32:
1373 case Intrinsic::aarch64_sve_ucvtf:
1374 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1375 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1376 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1377 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1379
1380 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1381 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1382 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1383 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1385
1386 case Intrinsic::aarch64_sve_fabd:
1387 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1388 case Intrinsic::aarch64_sve_fadd:
1389 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1390 .setMatchingIROpcode(Instruction::FAdd);
1391 case Intrinsic::aarch64_sve_fdiv:
1392 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1393 .setMatchingIROpcode(Instruction::FDiv);
1394 case Intrinsic::aarch64_sve_fmax:
1395 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1396 case Intrinsic::aarch64_sve_fmaxnm:
1397 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1398 case Intrinsic::aarch64_sve_fmin:
1399 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1400 case Intrinsic::aarch64_sve_fminnm:
1401 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1402 case Intrinsic::aarch64_sve_fmla:
1403 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1404 case Intrinsic::aarch64_sve_fmls:
1405 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1406 case Intrinsic::aarch64_sve_fmul:
1407 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1408 .setMatchingIROpcode(Instruction::FMul);
1409 case Intrinsic::aarch64_sve_fmulx:
1410 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1411 case Intrinsic::aarch64_sve_fnmla:
1412 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1413 case Intrinsic::aarch64_sve_fnmls:
1414 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1415 case Intrinsic::aarch64_sve_fsub:
1416 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1417 .setMatchingIROpcode(Instruction::FSub);
1418 case Intrinsic::aarch64_sve_add:
1419 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1420 .setMatchingIROpcode(Instruction::Add);
1421 case Intrinsic::aarch64_sve_mla:
1422 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1423 case Intrinsic::aarch64_sve_mls:
1424 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1425 case Intrinsic::aarch64_sve_mul:
1426 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1427 .setMatchingIROpcode(Instruction::Mul);
1428 case Intrinsic::aarch64_sve_sabd:
1429 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1430 case Intrinsic::aarch64_sve_sdiv:
1431 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1432 .setMatchingIROpcode(Instruction::SDiv);
1433 case Intrinsic::aarch64_sve_smax:
1434 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1435 case Intrinsic::aarch64_sve_smin:
1436 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1437 case Intrinsic::aarch64_sve_smulh:
1438 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1439 case Intrinsic::aarch64_sve_sub:
1440 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1441 .setMatchingIROpcode(Instruction::Sub);
1442 case Intrinsic::aarch64_sve_uabd:
1443 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1444 case Intrinsic::aarch64_sve_udiv:
1445 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1446 .setMatchingIROpcode(Instruction::UDiv);
1447 case Intrinsic::aarch64_sve_umax:
1448 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1449 case Intrinsic::aarch64_sve_umin:
1450 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1451 case Intrinsic::aarch64_sve_umulh:
1452 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1453 case Intrinsic::aarch64_sve_asr:
1454 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1455 .setMatchingIROpcode(Instruction::AShr);
1456 case Intrinsic::aarch64_sve_lsl:
1457 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1458 .setMatchingIROpcode(Instruction::Shl);
1459 case Intrinsic::aarch64_sve_lsr:
1460 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1461 .setMatchingIROpcode(Instruction::LShr);
1462 case Intrinsic::aarch64_sve_and:
1463 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1464 .setMatchingIROpcode(Instruction::And);
1465 case Intrinsic::aarch64_sve_bic:
1466 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1467 case Intrinsic::aarch64_sve_eor:
1468 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1469 .setMatchingIROpcode(Instruction::Xor);
1470 case Intrinsic::aarch64_sve_orr:
1471 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1472 .setMatchingIROpcode(Instruction::Or);
1473 case Intrinsic::aarch64_sve_shsub:
1474 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_shsub_u);
1475 case Intrinsic::aarch64_sve_shsubr:
1477 case Intrinsic::aarch64_sve_sqrshl:
1478 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1479 case Intrinsic::aarch64_sve_sqshl:
1480 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1481 case Intrinsic::aarch64_sve_sqsub:
1482 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1483 case Intrinsic::aarch64_sve_srshl:
1484 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1485 case Intrinsic::aarch64_sve_uhsub:
1486 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsub_u);
1487 case Intrinsic::aarch64_sve_uhsubr:
1489 case Intrinsic::aarch64_sve_uqrshl:
1490 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1491 case Intrinsic::aarch64_sve_uqshl:
1492 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1493 case Intrinsic::aarch64_sve_uqsub:
1494 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1495 case Intrinsic::aarch64_sve_urshl:
1496 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1497
1498 case Intrinsic::aarch64_sve_add_u:
1500 Instruction::Add);
1501 case Intrinsic::aarch64_sve_and_u:
1503 Instruction::And);
1504 case Intrinsic::aarch64_sve_asr_u:
1506 Instruction::AShr);
1507 case Intrinsic::aarch64_sve_eor_u:
1509 Instruction::Xor);
1510 case Intrinsic::aarch64_sve_fadd_u:
1512 Instruction::FAdd);
1513 case Intrinsic::aarch64_sve_fdiv_u:
1515 Instruction::FDiv);
1516 case Intrinsic::aarch64_sve_fmul_u:
1518 Instruction::FMul);
1519 case Intrinsic::aarch64_sve_fsub_u:
1521 Instruction::FSub);
1522 case Intrinsic::aarch64_sve_lsl_u:
1524 Instruction::Shl);
1525 case Intrinsic::aarch64_sve_lsr_u:
1527 Instruction::LShr);
1528 case Intrinsic::aarch64_sve_mul_u:
1530 Instruction::Mul);
1531 case Intrinsic::aarch64_sve_orr_u:
1533 Instruction::Or);
1534 case Intrinsic::aarch64_sve_sdiv_u:
1536 Instruction::SDiv);
1537 case Intrinsic::aarch64_sve_sub_u:
1539 Instruction::Sub);
1540 case Intrinsic::aarch64_sve_udiv_u:
1542 Instruction::UDiv);
1543
1544 case Intrinsic::aarch64_sve_addqv:
1545 case Intrinsic::aarch64_sve_and_z:
1546 case Intrinsic::aarch64_sve_bic_z:
1547 case Intrinsic::aarch64_sve_brka_z:
1548 case Intrinsic::aarch64_sve_brkb_z:
1549 case Intrinsic::aarch64_sve_brkn_z:
1550 case Intrinsic::aarch64_sve_brkpa_z:
1551 case Intrinsic::aarch64_sve_brkpb_z:
1552 case Intrinsic::aarch64_sve_cntp:
1553 case Intrinsic::aarch64_sve_compact:
1554 case Intrinsic::aarch64_sve_eor_z:
1555 case Intrinsic::aarch64_sve_eorv:
1556 case Intrinsic::aarch64_sve_eorqv:
1557 case Intrinsic::aarch64_sve_nand_z:
1558 case Intrinsic::aarch64_sve_nor_z:
1559 case Intrinsic::aarch64_sve_orn_z:
1560 case Intrinsic::aarch64_sve_orr_z:
1561 case Intrinsic::aarch64_sve_orv:
1562 case Intrinsic::aarch64_sve_orqv:
1563 case Intrinsic::aarch64_sve_pnext:
1564 case Intrinsic::aarch64_sve_rdffr_z:
1565 case Intrinsic::aarch64_sve_saddv:
1566 case Intrinsic::aarch64_sve_uaddv:
1567 case Intrinsic::aarch64_sve_umaxv:
1568 case Intrinsic::aarch64_sve_umaxqv:
1569 case Intrinsic::aarch64_sve_cmpeq:
1570 case Intrinsic::aarch64_sve_cmpeq_wide:
1571 case Intrinsic::aarch64_sve_cmpge:
1572 case Intrinsic::aarch64_sve_cmpge_wide:
1573 case Intrinsic::aarch64_sve_cmpgt:
1574 case Intrinsic::aarch64_sve_cmpgt_wide:
1575 case Intrinsic::aarch64_sve_cmphi:
1576 case Intrinsic::aarch64_sve_cmphi_wide:
1577 case Intrinsic::aarch64_sve_cmphs:
1578 case Intrinsic::aarch64_sve_cmphs_wide:
1579 case Intrinsic::aarch64_sve_cmple_wide:
1580 case Intrinsic::aarch64_sve_cmplo_wide:
1581 case Intrinsic::aarch64_sve_cmpls_wide:
1582 case Intrinsic::aarch64_sve_cmplt_wide:
1583 case Intrinsic::aarch64_sve_cmpne:
1584 case Intrinsic::aarch64_sve_cmpne_wide:
1585 case Intrinsic::aarch64_sve_facge:
1586 case Intrinsic::aarch64_sve_facgt:
1587 case Intrinsic::aarch64_sve_fcmpeq:
1588 case Intrinsic::aarch64_sve_fcmpge:
1589 case Intrinsic::aarch64_sve_fcmpgt:
1590 case Intrinsic::aarch64_sve_fcmpne:
1591 case Intrinsic::aarch64_sve_fcmpuo:
1592 case Intrinsic::aarch64_sve_ld1:
1593 case Intrinsic::aarch64_sve_ld1_gather:
1594 case Intrinsic::aarch64_sve_ld1_gather_index:
1595 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1596 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1597 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1598 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1599 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1600 case Intrinsic::aarch64_sve_ld1q_gather_index:
1601 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1602 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1603 case Intrinsic::aarch64_sve_ld1ro:
1604 case Intrinsic::aarch64_sve_ld1rq:
1605 case Intrinsic::aarch64_sve_ld1udq:
1606 case Intrinsic::aarch64_sve_ld1uwq:
1607 case Intrinsic::aarch64_sve_ld2_sret:
1608 case Intrinsic::aarch64_sve_ld2q_sret:
1609 case Intrinsic::aarch64_sve_ld3_sret:
1610 case Intrinsic::aarch64_sve_ld3q_sret:
1611 case Intrinsic::aarch64_sve_ld4_sret:
1612 case Intrinsic::aarch64_sve_ld4q_sret:
1613 case Intrinsic::aarch64_sve_ldff1:
1614 case Intrinsic::aarch64_sve_ldff1_gather:
1615 case Intrinsic::aarch64_sve_ldff1_gather_index:
1616 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1617 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1618 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1619 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1620 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1621 case Intrinsic::aarch64_sve_ldnf1:
1622 case Intrinsic::aarch64_sve_ldnt1:
1623 case Intrinsic::aarch64_sve_ldnt1_gather:
1624 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1625 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1626 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1628
1629 case Intrinsic::aarch64_sve_prf:
1630 case Intrinsic::aarch64_sve_prfb_gather_index:
1631 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1632 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1633 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1634 case Intrinsic::aarch64_sve_prfd_gather_index:
1635 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1636 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1637 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1638 case Intrinsic::aarch64_sve_prfh_gather_index:
1639 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1640 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1641 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1642 case Intrinsic::aarch64_sve_prfw_gather_index:
1643 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1644 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1645 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1647
1648 case Intrinsic::aarch64_sve_st1_scatter:
1649 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1650 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1651 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1652 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1653 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1654 case Intrinsic::aarch64_sve_st1dq:
1655 case Intrinsic::aarch64_sve_st1q_scatter_index:
1656 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1657 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1658 case Intrinsic::aarch64_sve_st1wq:
1659 case Intrinsic::aarch64_sve_stnt1:
1660 case Intrinsic::aarch64_sve_stnt1_scatter:
1661 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1662 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1663 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1665 case Intrinsic::aarch64_sve_st2:
1666 case Intrinsic::aarch64_sve_st2q:
1668 case Intrinsic::aarch64_sve_st3:
1669 case Intrinsic::aarch64_sve_st3q:
1671 case Intrinsic::aarch64_sve_st4:
1672 case Intrinsic::aarch64_sve_st4q:
1674 }
1675
1676 return SVEIntrinsicInfo();
1677}
1678
1679static bool isAllActivePredicate(Value *Pred) {
1680 Value *UncastedPred;
1681
1682 // Look through predicate casts that only remove lanes.
1684 m_Value(UncastedPred)))) {
1685 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1686 Pred = UncastedPred;
1687
1689 m_Value(UncastedPred))))
1690 // If the predicate has the same number of lanes as, or fewer lanes than,
1691 // the uncasted predicate then we know the cast has no effect.
1692 if (OrigPredTy->getMinNumElements() <=
1693 cast<ScalableVectorType>(UncastedPred->getType())
1694 ->getMinNumElements())
1695 Pred = UncastedPred;
1696 }
1697
1698 auto *C = dyn_cast<Constant>(Pred);
1699 return C && C->isAllOnesValue();
1700}
1701
1702// Simplify `V` by only considering the operations that affect active lanes.
1703// This function should only return existing Values or newly created Constants.
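// Illustrative example (added for clarity, not part of the original source):
// when V is sve.dup(<any passthru>, Pg, <constant>) and Pg is the caller's
// governing predicate, every active lane of V equals that constant, so V can
// be treated as a constant splat for the purpose of simplification.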
1704static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1705 auto *Dup = dyn_cast<IntrinsicInst>(V);
1706 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1707 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1709 cast<VectorType>(V->getType())->getElementCount(),
1710 cast<Constant>(Dup->getOperand(2)));
1711
1712 return V;
1713}
1714
1715static std::optional<Instruction *>
1717 const SVEIntrinsicInfo &IInfo) {
1718 const unsigned Opc = IInfo.getMatchingIROpode();
1719 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1720
1721 Value *Pg = II.getOperand(0);
1722 Value *Op1 = II.getOperand(1);
1723 Value *Op2 = II.getOperand(2);
1724 const DataLayout &DL = II.getDataLayout();
1725
1726 // Canonicalise constants to the RHS.
1728 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1729 IC.replaceOperand(II, 1, Op2);
1730 IC.replaceOperand(II, 2, Op1);
1731 return &II;
1732 }
1733
1734 // Only active lanes matter when simplifying the operation.
1735 Op1 = stripInactiveLanes(Op1, Pg);
1736 Op2 = stripInactiveLanes(Op2, Pg);
1737
1738 Value *SimpleII;
1739 if (auto FII = dyn_cast<FPMathOperator>(&II))
1740 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1741 else
1742 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1743
1744 // An SVE intrinsic's result is always defined. However, this is not the case
1745 // for its equivalent IR instruction (e.g. when shifting by an amount more
1746 // than the data's bitwidth). Simplifications to an undefined result must be
1747 // ignored to preserve the intrinsic's expected behaviour.
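// Illustrative instance (not in the original source): simplifyBinOp folds an
// i64 lshr by 64 to poison, but the corresponding sve.lsr still produces a
// defined result for that shift amount, so such folds are rejected below.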
1748 if (!SimpleII || isa<UndefValue>(SimpleII))
1749 return std::nullopt;
1750
1751 if (IInfo.inactiveLanesAreNotDefined())
1752 return IC.replaceInstUsesWith(II, SimpleII);
1753
1754 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1755
1756 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1757 if (SimpleII == Inactive)
1758 return IC.replaceInstUsesWith(II, SimpleII);
1759
1760 // Inactive lanes must be preserved.
1761 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1762 return IC.replaceInstUsesWith(II, SimpleII);
1763}
1764
1765// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1766// to operations with less strict inactive lane requirements.
1767static std::optional<Instruction *>
1769 const SVEIntrinsicInfo &IInfo) {
1770 if (!IInfo.hasGoverningPredicate())
1771 return std::nullopt;
1772
1773 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1774
1775 // If there are no active lanes.
1776 if (match(OpPredicate, m_ZeroInt())) {
1778 return IC.replaceInstUsesWith(
1779 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1780
1781 if (IInfo.inactiveLanesAreUnused()) {
1782 if (IInfo.resultIsZeroInitialized())
1784
1785 return IC.eraseInstFromFunction(II);
1786 }
1787 }
1788
1789 // If there are no inactive lanes.
1790 if (isAllActivePredicate(OpPredicate)) {
1791 if (IInfo.hasOperandWithNoActiveLanes()) {
1792 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1793 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1794 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1795 }
1796
1797 if (IInfo.hasMatchingUndefIntrinsic()) {
1798 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1799 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1800 II.setCalledFunction(NewDecl);
1801 return &II;
1802 }
1803 }
1804
1805 // Operation specific simplifications.
1806 if (IInfo.hasMatchingIROpode() &&
1808 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1809
1810 return std::nullopt;
1811}
1812
1813 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1814// => (binop (pred) (from_svbool _) (from_svbool _))
1815//
1816// The above transformation eliminates a `to_svbool` in the predicate
1817// operand of bitwise operation `binop` by narrowing the vector width of
1818// the operation. For example, it would convert a `<vscale x 16 x i1>
1819// and` into a `<vscale x 4 x i1> and`. This is profitable because
1820// to_svbool must zero the new lanes during widening, whereas
1821// from_svbool is free.
1822static std::optional<Instruction *>
1824 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1825 if (!BinOp)
1826 return std::nullopt;
1827
1828 auto IntrinsicID = BinOp->getIntrinsicID();
1829 switch (IntrinsicID) {
1830 case Intrinsic::aarch64_sve_and_z:
1831 case Intrinsic::aarch64_sve_bic_z:
1832 case Intrinsic::aarch64_sve_eor_z:
1833 case Intrinsic::aarch64_sve_nand_z:
1834 case Intrinsic::aarch64_sve_nor_z:
1835 case Intrinsic::aarch64_sve_orn_z:
1836 case Intrinsic::aarch64_sve_orr_z:
1837 break;
1838 default:
1839 return std::nullopt;
1840 }
1841
1842 auto BinOpPred = BinOp->getOperand(0);
1843 auto BinOpOp1 = BinOp->getOperand(1);
1844 auto BinOpOp2 = BinOp->getOperand(2);
1845
1846 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1847 if (!PredIntr ||
1848 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1849 return std::nullopt;
1850
1851 auto PredOp = PredIntr->getOperand(0);
1852 auto PredOpTy = cast<VectorType>(PredOp->getType());
1853 if (PredOpTy != II.getType())
1854 return std::nullopt;
1855
1856 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1857 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1858 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1859 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1860 if (BinOpOp1 == BinOpOp2)
1861 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1862 else
1863 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1864 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1865
1866 auto NarrowedBinOp =
1867 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1868 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1869}
1870
1871static std::optional<Instruction *>
1873 // If the reinterpret instruction operand is a PHI Node
1874 if (isa<PHINode>(II.getArgOperand(0)))
1875 return processPhiNode(IC, II);
1876
1877 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1878 return BinOpCombine;
1879
1880 // Ignore converts to/from svcount_t.
1881 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1882 isa<TargetExtType>(II.getType()))
1883 return std::nullopt;
1884
1885 SmallVector<Instruction *, 32> CandidatesForRemoval;
1886 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1887
1888 const auto *IVTy = cast<VectorType>(II.getType());
1889
1890 // Walk the chain of conversions.
1891 while (Cursor) {
1892 // If the type of the cursor has fewer lanes than the final result, zeroing
1893 // must take place, which breaks the equivalence chain.
1894 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1895 if (CursorVTy->getElementCount().getKnownMinValue() <
1896 IVTy->getElementCount().getKnownMinValue())
1897 break;
1898
1899 // If the cursor has the same type as I, it is a viable replacement.
1900 if (Cursor->getType() == IVTy)
1901 EarliestReplacement = Cursor;
1902
1903 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1904
1905 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1906 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1907 Intrinsic::aarch64_sve_convert_to_svbool ||
1908 IntrinsicCursor->getIntrinsicID() ==
1909 Intrinsic::aarch64_sve_convert_from_svbool))
1910 break;
1911
1912 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1913 Cursor = IntrinsicCursor->getOperand(0);
1914 }
1915
1916 // If no viable replacement in the conversion chain was found, there is
1917 // nothing to do.
1918 if (!EarliestReplacement)
1919 return std::nullopt;
1920
1921 return IC.replaceInstUsesWith(II, EarliestReplacement);
1922}
1923
1924static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1925 IntrinsicInst &II) {
1926 // svsel(ptrue, x, y) => x
1927 auto *OpPredicate = II.getOperand(0);
1928 if (isAllActivePredicate(OpPredicate))
1929 return IC.replaceInstUsesWith(II, II.getOperand(1));
1930
1931 auto Select =
1932 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1933 return IC.replaceInstUsesWith(II, Select);
1934}
1935
1936static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1937 IntrinsicInst &II) {
1938 Value *Pg = II.getOperand(1);
1939
1940 // sve.dup(V, all_active, X) ==> splat(X)
1941 if (isAllActivePredicate(Pg)) {
1942 auto *RetTy = cast<ScalableVectorType>(II.getType());
1943 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1944 II.getArgOperand(2));
1945 return IC.replaceInstUsesWith(II, Splat);
1946 }
1947
1949 m_SpecificInt(AArch64SVEPredPattern::vl1))))
1950 return std::nullopt;
1951
1952 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
1953 Value *Insert = IC.Builder.CreateInsertElement(
1954 II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
1955 return IC.replaceInstUsesWith(II, Insert);
1956}
1957
1958static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1959 IntrinsicInst &II) {
1960 // Replace DupX with a regular IR splat.
1961 auto *RetTy = cast<ScalableVectorType>(II.getType());
1962 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1963 II.getArgOperand(0));
1964 Splat->takeName(&II);
1965 return IC.replaceInstUsesWith(II, Splat);
1966}
1967
1968static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1969 IntrinsicInst &II) {
1970 LLVMContext &Ctx = II.getContext();
1971
1972 if (!isAllActivePredicate(II.getArgOperand(0)))
1973 return std::nullopt;
1974
1975 // Check that we have a compare of zero..
1976 auto *SplatValue =
1978 if (!SplatValue || !SplatValue->isZero())
1979 return std::nullopt;
1980
1981 // ..against a dupq
1982 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1983 if (!DupQLane ||
1984 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1985 return std::nullopt;
1986
1987 // Where the dupq is a lane 0 replicate of a vector insert
1988 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1989 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1990 return std::nullopt;
1991
1992 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1993 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1994 return std::nullopt;
1995
1996 // Where the vector insert is a fixed constant vector insert into undef at
1997 // index zero
1998 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1999 return std::nullopt;
2000
2001 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
2002 return std::nullopt;
2003
2004 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
2005 if (!ConstVec)
2006 return std::nullopt;
2007
2008 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
2009 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
2010 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
2011 return std::nullopt;
2012
2013 unsigned NumElts = VecTy->getNumElements();
2014 unsigned PredicateBits = 0;
2015
2016 // Expand the intrinsic operands to a 16-bit byte-level predicate.
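// Worked example (added for illustration, not in the original source): with
// NumElts == 4 (four elements per 128-bit segment), non-zero elements 0 and 1
// set bits 0 and 4 respectively, giving PredicateBits == 0x11.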
2017 for (unsigned I = 0; I < NumElts; ++I) {
2018 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
2019 if (!Arg)
2020 return std::nullopt;
2021 if (!Arg->isZero())
2022 PredicateBits |= 1 << (I * (16 / NumElts));
2023 }
2024
2025 // If all bits are zero, bail early with an empty predicate.
2026 if (PredicateBits == 0) {
2027 auto *PFalse = Constant::getNullValue(II.getType());
2028 PFalse->takeName(&II);
2029 return IC.replaceInstUsesWith(II, PFalse);
2030 }
2031
2032 // Calculate largest predicate type used (where byte predicate is largest)
2033 unsigned Mask = 8;
2034 for (unsigned I = 0; I < 16; ++I)
2035 if ((PredicateBits & (1 << I)) != 0)
2036 Mask |= (I % 8);
2037
2038 unsigned PredSize = Mask & -Mask;
2039 auto *PredType = ScalableVectorType::get(
2040 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
2041
2042 // Ensure all relevant bits are set
2043 for (unsigned I = 0; I < 16; I += PredSize)
2044 if ((PredicateBits & (1 << I)) == 0)
2045 return std::nullopt;
2046
2047 auto *PTruePat =
2048 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2049 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2050 {PredType}, {PTruePat});
2051 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
2052 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
2053 auto *ConvertFromSVBool =
2054 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
2055 {II.getType()}, {ConvertToSVBool});
2056
2057 ConvertFromSVBool->takeName(&II);
2058 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2059}
2060
2061static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2062 IntrinsicInst &II) {
2063 Value *Pg = II.getArgOperand(0);
2064 Value *Vec = II.getArgOperand(1);
2065 auto IntrinsicID = II.getIntrinsicID();
2066 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2067
2068 // lastX(splat(X)) --> X
2069 if (auto *SplatVal = getSplatValue(Vec))
2070 return IC.replaceInstUsesWith(II, SplatVal);
2071
2072 // If x and/or y is a splat value then:
2073 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2074 Value *LHS, *RHS;
2075 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2076 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2077 auto *OldBinOp = cast<BinaryOperator>(Vec);
2078 auto OpC = OldBinOp->getOpcode();
2079 auto *NewLHS =
2080 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2081 auto *NewRHS =
2082 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2084 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2085 return IC.replaceInstUsesWith(II, NewBinOp);
2086 }
2087 }
2088
2089 auto *C = dyn_cast<Constant>(Pg);
2090 if (IsAfter && C && C->isNullValue()) {
2091 // The intrinsic is extracting lane 0 so use an extract instead.
2092 auto *IdxTy = Type::getInt64Ty(II.getContext());
2093 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2094 Extract->insertBefore(II.getIterator());
2095 Extract->takeName(&II);
2096 return IC.replaceInstUsesWith(II, Extract);
2097 }
2098
2099 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2100 if (!IntrPG)
2101 return std::nullopt;
2102
2103 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2104 return std::nullopt;
2105
2106 const auto PTruePattern =
2107 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2108
2109 // Can the intrinsic's predicate be converted to a known constant index?
2110 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2111 if (!MinNumElts)
2112 return std::nullopt;
2113
2114 unsigned Idx = MinNumElts - 1;
2115 // Increment the index if extracting the element after the last active
2116 // predicate element.
2117 if (IsAfter)
2118 ++Idx;
2119
2120 // Ignore extracts whose index is larger than the known minimum vector
2121 // length. NOTE: This is an artificial constraint where we prefer to
2122 // maintain what the user asked for until an alternative is proven faster.
2123 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2124 if (Idx >= PgVTy->getMinNumElements())
2125 return std::nullopt;
2126
2127 // The intrinsic is extracting a fixed lane so use an extract instead.
2128 auto *IdxTy = Type::getInt64Ty(II.getContext());
2129 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2130 Extract->insertBefore(II.getIterator());
2131 Extract->takeName(&II);
2132 return IC.replaceInstUsesWith(II, Extract);
2133}
2134
2135static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2136 IntrinsicInst &II) {
2137 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2138 // integer variant across a variety of micro-architectures. Replace scalar
2139 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2140 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2141 // depending on the micro-architecture, but has been observed as generally
2142 // being faster, particularly when the CLAST[AB] op is a loop-carried
2143 // dependency.
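// Illustrative sketch of the rewrite below (not in the original source): a
// 32-bit integer clastb(pg, fallback, vec) becomes a float clastb operating on
// bitcast operands, with the scalar result bitcast back to i32.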
2144 Value *Pg = II.getArgOperand(0);
2145 Value *Fallback = II.getArgOperand(1);
2146 Value *Vec = II.getArgOperand(2);
2147 Type *Ty = II.getType();
2148
2149 if (!Ty->isIntegerTy())
2150 return std::nullopt;
2151
2152 Type *FPTy;
2153 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2154 default:
2155 return std::nullopt;
2156 case 16:
2157 FPTy = IC.Builder.getHalfTy();
2158 break;
2159 case 32:
2160 FPTy = IC.Builder.getFloatTy();
2161 break;
2162 case 64:
2163 FPTy = IC.Builder.getDoubleTy();
2164 break;
2165 }
2166
2167 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2168 auto *FPVTy = VectorType::get(
2169 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2170 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2171 auto *FPII = IC.Builder.CreateIntrinsic(
2172 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2173 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2174 return IC.replaceInstUsesWith(II, FPIItoInt);
2175}
2176
2177static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2178 IntrinsicInst &II) {
2179 LLVMContext &Ctx = II.getContext();
2180 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2181 // can work with RDFFR_PP for ptest elimination.
2182 auto *AllPat =
2183 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2184 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2185 {II.getType()}, {AllPat});
2186 auto *RDFFR =
2187 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2188 RDFFR->takeName(&II);
2189 return IC.replaceInstUsesWith(II, RDFFR);
2190}
2191
2192static std::optional<Instruction *>
2194 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2195
2196 if (Pattern == AArch64SVEPredPattern::all) {
2198 II.getType(), ElementCount::getScalable(NumElts));
2199 Cnt->takeName(&II);
2200 return IC.replaceInstUsesWith(II, Cnt);
2201 }
2202
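// Illustrative example (not in the original source): for cntw (NumElts == 4)
// with the vl4 pattern, the minimum SVE vector length (128 bits) already
// holds four words, so the call folds to the constant 4 below.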
2203 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2204
2205 return MinNumElts && NumElts >= MinNumElts
2206 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2207 II, ConstantInt::get(II.getType(), MinNumElts)))
2208 : std::nullopt;
2209}
2210
2211static std::optional<Instruction *>
2213 const AArch64Subtarget *ST) {
2214 if (!ST->isStreaming())
2215 return std::nullopt;
2216
2217 // In streaming mode, aarch64_sme_cntsd is equivalent to aarch64_sve_cntd
2218 // with SVEPredPattern::all.
2219 Value *Cnt =
2221 Cnt->takeName(&II);
2222 return IC.replaceInstUsesWith(II, Cnt);
2223}
2224
2225static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2226 IntrinsicInst &II) {
2227 Value *PgVal = II.getArgOperand(0);
2228 Value *OpVal = II.getArgOperand(1);
2229
2230 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2231 // Later optimizations prefer this form.
2232 if (PgVal == OpVal &&
2233 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2234 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2235 Value *Ops[] = {PgVal, OpVal};
2236 Type *Tys[] = {PgVal->getType()};
2237
2238 auto *PTest =
2239 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2240 PTest->takeName(&II);
2241
2242 return IC.replaceInstUsesWith(II, PTest);
2243 }
2244
2247
2248 if (!Pg || !Op)
2249 return std::nullopt;
2250
2251 Intrinsic::ID OpIID = Op->getIntrinsicID();
2252
2253 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2254 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2255 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2256 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2257 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2258
2259 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2260
2261 PTest->takeName(&II);
2262 return IC.replaceInstUsesWith(II, PTest);
2263 }
2264
2265 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
2266 // Later optimizations may rewrite the sequence to use the flag-setting variant
2267 // of instruction X to remove PTEST.
2268 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2269 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2270 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2271 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2272 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2273 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2274 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2275 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2276 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2277 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2278 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2279 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2280 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2281 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2282 Type *Tys[] = {Pg->getType()};
2283
2284 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2285 PTest->takeName(&II);
2286
2287 return IC.replaceInstUsesWith(II, PTest);
2288 }
2289
2290 return std::nullopt;
2291}
2292
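// Illustrative examples of the fusion performed below (added for clarity, not
// part of the original source), assuming for the illustration that both
// operations use the same predicate p, the multiply has a single use, and
// contraction is allowed for the FP forms:
//   add(p, a, mul(p, b, c)) -> mla(p, a, b, c)   (MergeIntoAddendOp == true)
//   add(p, mul(p, b, c), a) -> mad(p, b, c, a)   (MergeIntoAddendOp == false)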
2293template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2294static std::optional<Instruction *>
2296 bool MergeIntoAddendOp) {
2297 Value *P = II.getOperand(0);
2298 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2299 if (MergeIntoAddendOp) {
2300 AddendOp = II.getOperand(1);
2301 Mul = II.getOperand(2);
2302 } else {
2303 AddendOp = II.getOperand(2);
2304 Mul = II.getOperand(1);
2305 }
2306
2308 m_Value(MulOp1))))
2309 return std::nullopt;
2310
2311 if (!Mul->hasOneUse())
2312 return std::nullopt;
2313
2314 Instruction *FMFSource = nullptr;
2315 if (II.getType()->isFPOrFPVectorTy()) {
2316 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2317 // Stop the combine when the flags on the inputs differ in case dropping
2318 // flags would lead to us missing out on more beneficial optimizations.
2319 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2320 return std::nullopt;
2321 if (!FAddFlags.allowContract())
2322 return std::nullopt;
2323 FMFSource = &II;
2324 }
2325
2326 CallInst *Res;
2327 if (MergeIntoAddendOp)
2328 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2329 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2330 else
2331 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2332 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2333
2334 return IC.replaceInstUsesWith(II, Res);
2335}
2336
2337static std::optional<Instruction *>
2339 Value *Pred = II.getOperand(0);
2340 Value *PtrOp = II.getOperand(1);
2341 Type *VecTy = II.getType();
2342
2343 if (isAllActivePredicate(Pred)) {
2344 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2345 Load->copyMetadata(II);
2346 return IC.replaceInstUsesWith(II, Load);
2347 }
2348
2349 CallInst *MaskedLoad =
2350 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2351 Pred, ConstantAggregateZero::get(VecTy));
2352 MaskedLoad->copyMetadata(II);
2353 return IC.replaceInstUsesWith(II, MaskedLoad);
2354}
2355
2356static std::optional<Instruction *>
2358 Value *VecOp = II.getOperand(0);
2359 Value *Pred = II.getOperand(1);
2360 Value *PtrOp = II.getOperand(2);
2361
2362 if (isAllActivePredicate(Pred)) {
2363 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2364 Store->copyMetadata(II);
2365 return IC.eraseInstFromFunction(II);
2366 }
2367
2368 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2369 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2370 MaskedStore->copyMetadata(II);
2371 return IC.eraseInstFromFunction(II);
2372}
2373
2375 switch (Intrinsic) {
2376 case Intrinsic::aarch64_sve_fmul_u:
2377 return Instruction::BinaryOps::FMul;
2378 case Intrinsic::aarch64_sve_fadd_u:
2379 return Instruction::BinaryOps::FAdd;
2380 case Intrinsic::aarch64_sve_fsub_u:
2381 return Instruction::BinaryOps::FSub;
2382 default:
2383 return Instruction::BinaryOpsEnd;
2384 }
2385}
2386
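// Illustrative example (not in the original source): with an all-active
// predicate and a non-strict-FP call, fmul_u(pg, a, b) is rewritten below to a
// plain 'fmul a, b' carrying the intrinsic's fast-math flags.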
2387static std::optional<Instruction *>
2389 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2390 if (II.isStrictFP())
2391 return std::nullopt;
2392
2393 auto *OpPredicate = II.getOperand(0);
2394 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2395 if (BinOpCode == Instruction::BinaryOpsEnd ||
2396 !isAllActivePredicate(OpPredicate))
2397 return std::nullopt;
2398 auto BinOp = IC.Builder.CreateBinOpFMF(
2399 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2400 return IC.replaceInstUsesWith(II, BinOp);
2401}
2402
2403static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2404 IntrinsicInst &II) {
2405 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2406 Intrinsic::aarch64_sve_mla>(
2407 IC, II, true))
2408 return MLA;
2409 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2410 Intrinsic::aarch64_sve_mad>(
2411 IC, II, false))
2412 return MAD;
2413 return std::nullopt;
2414}
2415
2416static std::optional<Instruction *>
2418 if (auto FMLA =
2419 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2420 Intrinsic::aarch64_sve_fmla>(IC, II,
2421 true))
2422 return FMLA;
2423 if (auto FMAD =
2424 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2425 Intrinsic::aarch64_sve_fmad>(IC, II,
2426 false))
2427 return FMAD;
2428 if (auto FMLA =
2429 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2430 Intrinsic::aarch64_sve_fmla>(IC, II,
2431 true))
2432 return FMLA;
2433 return std::nullopt;
2434}
2435
2436static std::optional<Instruction *>
2438 if (auto FMLA =
2439 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2440 Intrinsic::aarch64_sve_fmla>(IC, II,
2441 true))
2442 return FMLA;
2443 if (auto FMAD =
2444 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2445 Intrinsic::aarch64_sve_fmad>(IC, II,
2446 false))
2447 return FMAD;
2448 if (auto FMLA_U =
2449 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2450 Intrinsic::aarch64_sve_fmla_u>(
2451 IC, II, true))
2452 return FMLA_U;
2453 return instCombineSVEVectorBinOp(IC, II);
2454}
2455
2456static std::optional<Instruction *>
2458 if (auto FMLS =
2459 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2460 Intrinsic::aarch64_sve_fmls>(IC, II,
2461 true))
2462 return FMLS;
2463 if (auto FMSB =
2464 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2465 Intrinsic::aarch64_sve_fnmsb>(
2466 IC, II, false))
2467 return FMSB;
2468 if (auto FMLS =
2469 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2470 Intrinsic::aarch64_sve_fmls>(IC, II,
2471 true))
2472 return FMLS;
2473 return std::nullopt;
2474}
2475
2476static std::optional<Instruction *>
2478 if (auto FMLS =
2479 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2480 Intrinsic::aarch64_sve_fmls>(IC, II,
2481 true))
2482 return FMLS;
2483 if (auto FMSB =
2484 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2485 Intrinsic::aarch64_sve_fnmsb>(
2486 IC, II, false))
2487 return FMSB;
2488 if (auto FMLS_U =
2489 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2490 Intrinsic::aarch64_sve_fmls_u>(
2491 IC, II, true))
2492 return FMLS_U;
2493 return instCombineSVEVectorBinOp(IC, II);
2494}
2495
2496static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2497 IntrinsicInst &II) {
2498 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2499 Intrinsic::aarch64_sve_mls>(
2500 IC, II, true))
2501 return MLS;
2502 return std::nullopt;
2503}
2504
2505static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2506 IntrinsicInst &II) {
2507 Value *UnpackArg = II.getArgOperand(0);
2508 auto *RetTy = cast<ScalableVectorType>(II.getType());
2509 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2510 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2511
2512 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2513 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2514 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2515 ScalarArg =
2516 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2517 Value *NewVal =
2518 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2519 NewVal->takeName(&II);
2520 return IC.replaceInstUsesWith(II, NewVal);
2521 }
2522
2523 return std::nullopt;
2524}
2525static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2526 IntrinsicInst &II) {
2527 auto *OpVal = II.getOperand(0);
2528 auto *OpIndices = II.getOperand(1);
2529 VectorType *VTy = cast<VectorType>(II.getType());
2530
2531 // Check whether OpIndices is a constant splat value < minimal element count
2532 // of result.
2533 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2534 if (!SplatValue ||
2535 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2536 return std::nullopt;
2537
2538 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2539 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2540 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2541 auto *VectorSplat =
2542 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2543
2544 VectorSplat->takeName(&II);
2545 return IC.replaceInstUsesWith(II, VectorSplat);
2546}
2547
2548static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2549 IntrinsicInst &II) {
2550 Value *A, *B;
2551 Type *RetTy = II.getType();
2552 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2553 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2554
2555 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2556 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2557 if ((match(II.getArgOperand(0),
2559 match(II.getArgOperand(1),
2561 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2562 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2563 auto *TyA = cast<ScalableVectorType>(A->getType());
2564 if (TyA == B->getType() &&
2566 auto *SubVec = IC.Builder.CreateInsertVector(
2567 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2568 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2569 TyA->getMinNumElements());
2570 ConcatVec->takeName(&II);
2571 return IC.replaceInstUsesWith(II, ConcatVec);
2572 }
2573 }
2574
2575 return std::nullopt;
2576}
2577
2578static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2579 IntrinsicInst &II) {
2580 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2581 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2582 Value *A, *B;
2583 if (match(II.getArgOperand(0),
2586 m_Specific(A), m_Specific(B))))
2587 return IC.replaceInstUsesWith(
2588 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2589
2590 return std::nullopt;
2591}
2592
2593static std::optional<Instruction *>
2595 Value *Mask = II.getOperand(0);
2596 Value *BasePtr = II.getOperand(1);
2597 Value *Index = II.getOperand(2);
2598 Type *Ty = II.getType();
2599 Value *PassThru = ConstantAggregateZero::get(Ty);
2600
2601 // Contiguous gather => masked load.
2602 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2603 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2604 Value *IndexBase;
2606 m_Value(IndexBase), m_SpecificInt(1)))) {
2607 Align Alignment =
2608 BasePtr->getPointerAlignment(II.getDataLayout());
2609
2610 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2611 BasePtr, IndexBase);
2612 CallInst *MaskedLoad =
2613 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2614 MaskedLoad->takeName(&II);
2615 return IC.replaceInstUsesWith(II, MaskedLoad);
2616 }
2617
2618 return std::nullopt;
2619}
2620
2621static std::optional<Instruction *>
2623 Value *Val = II.getOperand(0);
2624 Value *Mask = II.getOperand(1);
2625 Value *BasePtr = II.getOperand(2);
2626 Value *Index = II.getOperand(3);
2627 Type *Ty = Val->getType();
2628
2629 // Contiguous scatter => masked store.
2630 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2631 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2632 Value *IndexBase;
2634 m_Value(IndexBase), m_SpecificInt(1)))) {
2635 Align Alignment =
2636 BasePtr->getPointerAlignment(II.getDataLayout());
2637
2638 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2639 BasePtr, IndexBase);
2640 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2641
2642 return IC.eraseInstFromFunction(II);
2643 }
2644
2645 return std::nullopt;
2646}
2647
2648static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2649 IntrinsicInst &II) {
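// Illustrative examples (added for clarity, not in the original source):
//   sdiv(pg, x, splat(8))  -> asrd(pg, x, 3)
//   sdiv(pg, x, splat(-8)) -> asrd(pg, x, 3) followed by a predicated negate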
2651 Value *Pred = II.getOperand(0);
2652 Value *Vec = II.getOperand(1);
2653 Value *DivVec = II.getOperand(2);
2654
2655 Value *SplatValue = getSplatValue(DivVec);
2656 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2657 if (!SplatConstantInt)
2658 return std::nullopt;
2659
2660 APInt Divisor = SplatConstantInt->getValue();
2661 const int64_t DivisorValue = Divisor.getSExtValue();
2662 if (DivisorValue == -1)
2663 return std::nullopt;
2664 if (DivisorValue == 1)
2665 IC.replaceInstUsesWith(II, Vec);
2666
2667 if (Divisor.isPowerOf2()) {
2668 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2669 auto ASRD = IC.Builder.CreateIntrinsic(
2670 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2671 return IC.replaceInstUsesWith(II, ASRD);
2672 }
2673 if (Divisor.isNegatedPowerOf2()) {
2674 Divisor.negate();
2675 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2676 auto ASRD = IC.Builder.CreateIntrinsic(
2677 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2678 auto NEG = IC.Builder.CreateIntrinsic(
2679 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2680 return IC.replaceInstUsesWith(II, NEG);
2681 }
2682
2683 return std::nullopt;
2684}
2685
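// Illustrative behaviour of SimplifyValuePattern (comment added for clarity,
// not part of the original source): {A, B, A, B} simplifies to {A, B}; with
// AllowPoison, unknown (nullptr) entries are filled from the matching lane in
// the other half, so {A, nullptr, A, B} also simplifies to {A, B}.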
2686bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2687 size_t VecSize = Vec.size();
2688 if (VecSize == 1)
2689 return true;
2690 if (!isPowerOf2_64(VecSize))
2691 return false;
2692 size_t HalfVecSize = VecSize / 2;
2693
2694 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2695 RHS != Vec.end(); LHS++, RHS++) {
2696 if (*LHS != nullptr && *RHS != nullptr) {
2697 if (*LHS == *RHS)
2698 continue;
2699 else
2700 return false;
2701 }
2702 if (!AllowPoison)
2703 return false;
2704 if (*LHS == nullptr && *RHS != nullptr)
2705 *LHS = *RHS;
2706 }
2707
2708 Vec.resize(HalfVecSize);
2709 SimplifyValuePattern(Vec, AllowPoison);
2710 return true;
2711}
2712
2713// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2714// to dupqlane(f64(C)) where C is A concatenated with B
2715static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2716 IntrinsicInst &II) {
2717 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2718 if (!match(II.getOperand(0),
2720 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2721 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2722 return std::nullopt;
2723 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2724
2725 // Insert the scalars into a container ordered by InsertElement index
2726 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2727 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2728 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2729 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2730 CurrentInsertElt = InsertElt->getOperand(0);
2731 }
2732
2733 bool AllowPoison =
2734 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2735 if (!SimplifyValuePattern(Elts, AllowPoison))
2736 return std::nullopt;
2737
2738 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2739 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2740 for (size_t I = 0; I < Elts.size(); I++) {
2741 if (Elts[I] == nullptr)
2742 continue;
2743 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2744 IC.Builder.getInt64(I));
2745 }
2746 if (InsertEltChain == nullptr)
2747 return std::nullopt;
2748
2749 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2750 // value or (f16 a, f16 b) as one i32 value. This requires the inserted
2751 // subvector to be bitcast to a type wide enough to fit the sequence, splatted,
2752 // and then narrowed back to the original type.
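// Worked example (added for illustration, not in the original source): a
// repeating (f16 a, f16 b) pattern in <vscale x 8 x half> gives
// PatternWidth == 32 and PatternElementCount == 4; the (a, b) pair sits in the
// low 32 bits, is bitcast to <vscale x 4 x i32>, splatted via an all-zero
// shuffle mask, and bitcast back to <vscale x 8 x half>.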
2753 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2754 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2755 IIScalableTy->getMinNumElements() /
2756 PatternWidth;
2757
2758 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2759 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2760 auto *WideShuffleMaskTy =
2761 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2762
2763 auto InsertSubvector = IC.Builder.CreateInsertVector(
2764 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2765 uint64_t(0));
2766 auto WideBitcast =
2767 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2768 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2769 auto WideShuffle = IC.Builder.CreateShuffleVector(
2770 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2771 auto NarrowBitcast =
2772 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2773
2774 return IC.replaceInstUsesWith(II, NarrowBitcast);
2775}
2776
2777static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2778 IntrinsicInst &II) {
2779 Value *A = II.getArgOperand(0);
2780 Value *B = II.getArgOperand(1);
2781 if (A == B)
2782 return IC.replaceInstUsesWith(II, A);
2783
2784 return std::nullopt;
2785}
2786
2787static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2788 IntrinsicInst &II) {
2789 Value *Pred = II.getOperand(0);
2790 Value *Vec = II.getOperand(1);
2791 Value *Shift = II.getOperand(2);
2792
2793 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2794 Value *AbsPred, *MergedValue;
2796 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2798 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2799
2800 return std::nullopt;
2801
2802 // Transform is valid if any of the following are true:
2803 // * The ABS merge value is an undef or non-negative
2804 // * The ABS predicate is all active
2805 // * The ABS predicate and the SRSHL predicates are the same
2806 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2807 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2808 return std::nullopt;
2809
2810 // Only valid when the shift amount is non-negative, otherwise the rounding
2811 // behaviour of SRSHL cannot be ignored.
2812 if (!match(Shift, m_NonNegative()))
2813 return std::nullopt;
2814
2815 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2816 {II.getType()}, {Pred, Vec, Shift});
2817
2818 return IC.replaceInstUsesWith(II, LSL);
2819}
2820
2821static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2822 IntrinsicInst &II) {
2823 Value *Vec = II.getOperand(0);
2824
2825 if (getSplatValue(Vec) == II.getOperand(1))
2826 return IC.replaceInstUsesWith(II, Vec);
2827
2828 return std::nullopt;
2829}
2830
2831static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2832 IntrinsicInst &II) {
2833 // If this barrier is post-dominated by an identical one, we can remove it.
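// Illustrative case (not in the original source): two identical dmb calls
// separated only by instructions that neither access memory nor have other
// side effects; the earlier barrier is erased below.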
2834 auto *NI = II.getNextNode();
2835 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2836 auto CanSkipOver = [](Instruction *I) {
2837 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2838 };
2839 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2840 auto *NIBB = NI->getParent();
2841 NI = NI->getNextNode();
2842 if (!NI) {
2843 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2844 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2845 else
2846 break;
2847 }
2848 }
2849 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2850 if (NextII && II.isIdenticalTo(NextII))
2851 return IC.eraseInstFromFunction(II);
2852
2853 return std::nullopt;
2854}
2855
2856static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2857 IntrinsicInst &II) {
2858 return IC.replaceInstUsesWith(
2859 II,
2860 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2861 {II.getType(), II.getOperand(0)->getType()},
2862 {II.getOperand(0), II.getOperand(1)}));
2863}
2864
2865static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2866 IntrinsicInst &II) {
2868 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2869 return std::nullopt;
2870}
2871
2872static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2874 unsigned NumBits) {
2875 Value *Passthru = II.getOperand(0);
2876 Value *Pg = II.getOperand(1);
2877 Value *Op = II.getOperand(2);
2878
2879 // Convert UXT[BHW] to AND.
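// Illustrative example (not in the original source): uxtb on a vector of i32
// elements becomes and_u(pg, op, splat(0xff)).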
2880 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2881 auto *Ty = cast<VectorType>(II.getType());
2882 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2883 auto *Mask = ConstantInt::get(Ty, MaskValue);
2884 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2885 {Pg, Op, Mask});
2886 return IC.replaceInstUsesWith(II, And);
2887 }
2888
2889 return std::nullopt;
2890}
2891
2892static std::optional<Instruction *>
2894 SMEAttrs FnSMEAttrs(*II.getFunction());
2895 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2896 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2897 return IC.replaceInstUsesWith(
2898 II, ConstantInt::getBool(II.getType(), IsStreaming));
2899 return std::nullopt;
2900}
2901
2902std::optional<Instruction *>
2904 IntrinsicInst &II) const {
2906 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2907 return I;
2908
2909 Intrinsic::ID IID = II.getIntrinsicID();
2910 switch (IID) {
2911 default:
2912 break;
2913 case Intrinsic::aarch64_dmb:
2914 return instCombineDMB(IC, II);
2915 case Intrinsic::aarch64_neon_fmaxnm:
2916 case Intrinsic::aarch64_neon_fminnm:
2917 return instCombineMaxMinNM(IC, II);
2918 case Intrinsic::aarch64_sve_convert_from_svbool:
2919 return instCombineConvertFromSVBool(IC, II);
2920 case Intrinsic::aarch64_sve_dup:
2921 return instCombineSVEDup(IC, II);
2922 case Intrinsic::aarch64_sve_dup_x:
2923 return instCombineSVEDupX(IC, II);
2924 case Intrinsic::aarch64_sve_cmpne:
2925 case Intrinsic::aarch64_sve_cmpne_wide:
2926 return instCombineSVECmpNE(IC, II);
2927 case Intrinsic::aarch64_sve_rdffr:
2928 return instCombineRDFFR(IC, II);
2929 case Intrinsic::aarch64_sve_lasta:
2930 case Intrinsic::aarch64_sve_lastb:
2931 return instCombineSVELast(IC, II);
2932 case Intrinsic::aarch64_sve_clasta_n:
2933 case Intrinsic::aarch64_sve_clastb_n:
2934 return instCombineSVECondLast(IC, II);
2935 case Intrinsic::aarch64_sve_cntd:
2936 return instCombineSVECntElts(IC, II, 2);
2937 case Intrinsic::aarch64_sve_cntw:
2938 return instCombineSVECntElts(IC, II, 4);
2939 case Intrinsic::aarch64_sve_cnth:
2940 return instCombineSVECntElts(IC, II, 8);
2941 case Intrinsic::aarch64_sve_cntb:
2942 return instCombineSVECntElts(IC, II, 16);
2943 case Intrinsic::aarch64_sme_cntsd:
2944 return instCombineSMECntsd(IC, II, ST);
2945 case Intrinsic::aarch64_sve_ptest_any:
2946 case Intrinsic::aarch64_sve_ptest_first:
2947 case Intrinsic::aarch64_sve_ptest_last:
2948 return instCombineSVEPTest(IC, II);
2949 case Intrinsic::aarch64_sve_fadd:
2950 return instCombineSVEVectorFAdd(IC, II);
2951 case Intrinsic::aarch64_sve_fadd_u:
2952 return instCombineSVEVectorFAddU(IC, II);
2953 case Intrinsic::aarch64_sve_fmul_u:
2954 return instCombineSVEVectorBinOp(IC, II);
2955 case Intrinsic::aarch64_sve_fsub:
2956 return instCombineSVEVectorFSub(IC, II);
2957 case Intrinsic::aarch64_sve_fsub_u:
2958 return instCombineSVEVectorFSubU(IC, II);
2959 case Intrinsic::aarch64_sve_add:
2960 return instCombineSVEVectorAdd(IC, II);
2961 case Intrinsic::aarch64_sve_add_u:
2962 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2963 Intrinsic::aarch64_sve_mla_u>(
2964 IC, II, true);
2965 case Intrinsic::aarch64_sve_sub:
2966 return instCombineSVEVectorSub(IC, II);
2967 case Intrinsic::aarch64_sve_sub_u:
2968 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2969 Intrinsic::aarch64_sve_mls_u>(
2970 IC, II, true);
2971 case Intrinsic::aarch64_sve_tbl:
2972 return instCombineSVETBL(IC, II);
2973 case Intrinsic::aarch64_sve_uunpkhi:
2974 case Intrinsic::aarch64_sve_uunpklo:
2975 case Intrinsic::aarch64_sve_sunpkhi:
2976 case Intrinsic::aarch64_sve_sunpklo:
2977 return instCombineSVEUnpack(IC, II);
2978 case Intrinsic::aarch64_sve_uzp1:
2979 return instCombineSVEUzp1(IC, II);
2980 case Intrinsic::aarch64_sve_zip1:
2981 case Intrinsic::aarch64_sve_zip2:
2982 return instCombineSVEZip(IC, II);
2983 case Intrinsic::aarch64_sve_ld1_gather_index:
2984 return instCombineLD1GatherIndex(IC, II);
2985 case Intrinsic::aarch64_sve_st1_scatter_index:
2986 return instCombineST1ScatterIndex(IC, II);
2987 case Intrinsic::aarch64_sve_ld1:
2988 return instCombineSVELD1(IC, II, DL);
2989 case Intrinsic::aarch64_sve_st1:
2990 return instCombineSVEST1(IC, II, DL);
2991 case Intrinsic::aarch64_sve_sdiv:
2992 return instCombineSVESDIV(IC, II);
2993 case Intrinsic::aarch64_sve_sel:
2994 return instCombineSVESel(IC, II);
2995 case Intrinsic::aarch64_sve_srshl:
2996 return instCombineSVESrshl(IC, II);
2997 case Intrinsic::aarch64_sve_dupq_lane:
2998 return instCombineSVEDupqLane(IC, II);
2999 case Intrinsic::aarch64_sve_insr:
3000 return instCombineSVEInsr(IC, II);
3001 case Intrinsic::aarch64_sve_whilelo:
3002 return instCombineWhilelo(IC, II);
3003 case Intrinsic::aarch64_sve_ptrue:
3004 return instCombinePTrue(IC, II);
3005 case Intrinsic::aarch64_sve_uxtb:
3006 return instCombineSVEUxt(IC, II, 8);
3007 case Intrinsic::aarch64_sve_uxth:
3008 return instCombineSVEUxt(IC, II, 16);
3009 case Intrinsic::aarch64_sve_uxtw:
3010 return instCombineSVEUxt(IC, II, 32);
3011 case Intrinsic::aarch64_sme_in_streaming_mode:
3012 return instCombineInStreamingMode(IC, II);
3013 }
3014
3015 return std::nullopt;
3016}
3017
3019 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3020 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3021 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3022 SimplifyAndSetOp) const {
3023 switch (II.getIntrinsicID()) {
3024 default:
3025 break;
3026 case Intrinsic::aarch64_neon_fcvtxn:
3027 case Intrinsic::aarch64_neon_rshrn:
3028 case Intrinsic::aarch64_neon_sqrshrn:
3029 case Intrinsic::aarch64_neon_sqrshrun:
3030 case Intrinsic::aarch64_neon_sqshrn:
3031 case Intrinsic::aarch64_neon_sqshrun:
3032 case Intrinsic::aarch64_neon_sqxtn:
3033 case Intrinsic::aarch64_neon_sqxtun:
3034 case Intrinsic::aarch64_neon_uqrshrn:
3035 case Intrinsic::aarch64_neon_uqshrn:
3036 case Intrinsic::aarch64_neon_uqxtn:
3037 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3038 break;
3039 }
3040
3041 return std::nullopt;
3042}
3043
3045 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3047}
3048
3051 switch (K) {
3053 return TypeSize::getFixed(64);
3055 if (ST->useSVEForFixedLengthVectors() &&
3056 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3057 return TypeSize::getFixed(
3058 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3059 else if (ST->isNeonAvailable())
3060 return TypeSize::getFixed(128);
3061 else
3062 return TypeSize::getFixed(0);
3064 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3066 return TypeSize::getScalable(128);
3067 else
3068 return TypeSize::getScalable(0);
3069 }
3070 llvm_unreachable("Unsupported register kind");
3071}
3072
3073bool AArch64TTIImpl::isSingleExtWideningInstruction(
3074 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3075 Type *SrcOverrideTy) const {
3076 // A helper that returns a vector type from the given type. The scalar type
3077 // is taken from ArgTy and the element count from DstTy.
3078 auto toVectorTy = [&](Type *ArgTy) {
3079 return VectorType::get(ArgTy->getScalarType(),
3080 cast<VectorType>(DstTy)->getElementCount());
3081 };
3082
3083 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3084 // i32, i64]. SVE doesn't generally have the same set of instructions to
3085 // perform an extend with the add/sub/mul. There are SMULLB style
3086 // instructions, but they operate on top/bottom, requiring some sort of lane
3087 // interleaving to be used with zext/sext.
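// Illustrative example (not in the original source): for
//   add <8 x i16> %y, (zext <8 x i8> %x to <8 x i16>)
// the backend can use UADDW, so the cast-cost logic may treat the zext as
// free.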
3088 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3089 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3090 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3091 return false;
3092
3093 Type *SrcTy = SrcOverrideTy;
3094 switch (Opcode) {
3095 case Instruction::Add: // UADDW(2), SADDW(2).
3096 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3097 // The second operand needs to be an extend
3098 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3099 if (!SrcTy)
3100 SrcTy =
3101 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3102 break;
3103 }
3104
3105 if (Opcode == Instruction::Sub)
3106 return false;
3107
3108 // UADDW(2), SADDW(2) can be commuted.
3109 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3110 if (!SrcTy)
3111 SrcTy =
3112 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3113 break;
3114 }
3115 return false;
3116 }
3117 default:
3118 return false;
3119 }
3120
3121 // Legalize the destination type and ensure it can be used in a widening
3122 // operation.
3123 auto DstTyL = getTypeLegalizationCost(DstTy);
3124 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3125 return false;
3126
3127 // Legalize the source type and ensure it can be used in a widening
3128 // operation.
3129 assert(SrcTy && "Expected some SrcTy");
3130 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3131 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3132 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3133 return false;
3134
3135 // Get the total number of vector elements in the legalized types.
3136 InstructionCost NumDstEls =
3137 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3138 InstructionCost NumSrcEls =
3139 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3140
3141 // Return true if the legalized types have the same number of vector elements
3142 // and the destination element type size is twice that of the source type.
3143 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3144}
3145
3146Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3148 Type *SrcOverrideTy) const {
3149 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3150 Opcode != Instruction::Mul)
3151 return nullptr;
3152
3153 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3154 // i32, i64]. SVE doesn't generally have the same set of instructions to
3155 // perform an extend with the add/sub/mul. There are SMULLB style
3156 // instructions, but they operate on top/bottom, requiring some sort of lane
3157 // interleaving to be used with zext/sext.
3158 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3159 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3160 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3161 return nullptr;
3162
3163 auto getScalarSizeWithOverride = [&](const Value *V) {
3164 if (SrcOverrideTy)
3165 return SrcOverrideTy->getScalarSizeInBits();
3166 return cast<Instruction>(V)
3167 ->getOperand(0)
3168 ->getType()
3169 ->getScalarSizeInBits();
3170 };
3171
3172 unsigned MaxEltSize = 0;
3173 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3174 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3175 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3176 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3177 MaxEltSize = std::max(EltSize0, EltSize1);
3178 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3179 isa<SExtInst, ZExtInst>(Args[1])) {
3180 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3181 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3182 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3183 // enough.
3184 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3185 return nullptr;
3186 MaxEltSize = DstEltSize / 2;
3187 } else if (Opcode == Instruction::Mul &&
3188 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3189 // If one of the operands is a Zext and the other has enough zero bits
3190 // to be treated as unsigned, we can still generate a umull, meaning the
3191 // zext is free.
3192 KnownBits Known =
3193 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3194 if (Args[0]->getType()->getScalarSizeInBits() -
3195 Known.Zero.countLeadingOnes() >
3196 DstTy->getScalarSizeInBits() / 2)
3197 return nullptr;
3198
3199 MaxEltSize =
3200 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3201 } else
3202 return nullptr;
3203
3204 if (MaxEltSize * 2 > DstEltSize)
3205 return nullptr;
3206
3207 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3208 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3209 return nullptr;
3210 return ExtTy;
3211}
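// Illustrative example (added note; value names are hypothetical): a pattern
// for which the helper above returns a narrower ExtTy, because both operands
// are sign-extended and the multiply can be performed as a single smull:
//   %ea = sext <8 x i8> %a to <8 x i16>
//   %eb = sext <8 x i8> %b to <8 x i16>
//   %p  = mul <8 x i16> %ea, %eb   ; smull v0.8h, v1.8b, v2.8b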
3212
3213// s/urhadd instructions implement the following pattern, making the
3214// extends free:
3215// %x = add ((zext i8 -> i16), 1)
3216// %y = (zext i8 -> i16)
3217// trunc i16 (lshr (add %x, %y), 1) -> i8
3218//
3220 Type *Src) const {
3221 // The source should be a legal vector type.
3222 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3223 (Src->isScalableTy() && !ST->hasSVE2()))
3224 return false;
3225
3226 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3227 return false;
3228
3229 // Look for trunc/lshr/add before trying to match the pattern.
3230 const Instruction *Add = ExtUser;
3231 auto *AddUser =
3232 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3233 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3234 Add = AddUser;
3235
3236 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3237 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3238 return false;
3239
3240 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3241 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3242 Src->getScalarSizeInBits() !=
3243 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3244 return false;
3245
3246 // Try to match the whole pattern. Ext could be either the first or second
3247 // m_ZExtOrSExt matched.
3248 Instruction *Ex1, *Ex2;
3249 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3250 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3251 return false;
3252
3253 // Ensure both extends are of the same type
3254 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3255 Ex1->getOpcode() == Ex2->getOpcode())
3256 return true;
3257
3258 return false;
3259}
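// Illustrative example (added note; value names are hypothetical): IR the
// helper above is meant to recognise, where the whole sequence lowers to a
// single rounding-halving add and the zexts therefore cost nothing:
//   %xa = zext <8 x i8> %a to <8 x i16>
//   %xb = zext <8 x i8> %b to <8 x i16>
//   %s0 = add <8 x i16> %xa, splat (i16 1)
//   %s1 = add <8 x i16> %s0, %xb
//   %sh = lshr <8 x i16> %s1, splat (i16 1)
//   %r  = trunc <8 x i16> %sh to <8 x i8>   ; urhadd v0.8b, v1.8b, v2.8b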
3260
3261InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3262 Type *Src,
3263 TTI::CastContextHint CCH,
3264 TTI::TargetCostKind CostKind,
3265 const Instruction *I) const {
3266 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3267 assert(ISD && "Invalid opcode");
3268 // If the cast is observable, and it is used by a widening instruction (e.g.,
3269 // uaddl, saddw, etc.), it may be free.
3270 if (I && I->hasOneUser()) {
3271 auto *SingleUser = cast<Instruction>(*I->user_begin());
3272 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3273 if (Type *ExtTy = isBinExtWideningInstruction(
3274 SingleUser->getOpcode(), Dst, Operands,
3275 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3276 // The cost from Src->Src*2 needs to be added if required; the cost from
3277 // Src*2->ExtTy is free.
3278 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3279 Type *DoubleSrcTy =
3280 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3281 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3282 TTI::CastContextHint::None, CostKind);
3283 }
3284
3285 return 0;
3286 }
3287
3288 if (isSingleExtWideningInstruction(
3289 SingleUser->getOpcode(), Dst, Operands,
3290 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3291 // For adds, the extend on the second operand is always free. The extend
3292 // on the first operand is only free if the second operand is an extend of
3293 // the same kind (i.e. both operands are not free in add(sext, zext)).
3294 if (SingleUser->getOpcode() == Instruction::Add) {
3295 if (I == SingleUser->getOperand(1) ||
3296 (isa<CastInst>(SingleUser->getOperand(1)) &&
3297 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3298 return 0;
3299 } else {
3300 // Others are free so long as isSingleExtWideningInstruction
3301 // returned true.
3302 return 0;
3303 }
3304 }
3305
3306 // The cast will be free for the s/urhadd instructions
3307 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3308 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3309 return 0;
3310 }
3311
3312 EVT SrcTy = TLI->getValueType(DL, Src);
3313 EVT DstTy = TLI->getValueType(DL, Dst);
3314
3315 if (!SrcTy.isSimple() || !DstTy.isSimple())
3316 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3317
3318 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3319 // we use fcvtx under SVE2. Give them invalid costs.
3320 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3321 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3322 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3323 return InstructionCost::getInvalid();
3324
3325 static const TypeConversionCostTblEntry BF16Tbl[] = {
3326 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3327 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3328 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3329 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3330 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3331 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3332 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3333 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3334 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3335 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3336 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3337 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3338 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3339 };
3340
3341 if (ST->hasBF16())
3342 if (const auto *Entry = ConvertCostTableLookup(
3343 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3344 return Entry->Cost;
3345
3346 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3347 // The cost of unpacking twice is artificially increased for now in order
3348 // to avoid regressions against NEON, which will use tbl instructions directly
3349 // instead of multiple layers of [s|u]unpk[lo|hi].
3350 // We use the unpacks in cases where the destination type is illegal and
3351 // requires splitting of the input, even if the input type itself is legal.
3352 const unsigned int SVE_EXT_COST = 1;
3353 const unsigned int SVE_FCVT_COST = 1;
3354 const unsigned int SVE_UNPACK_ONCE = 4;
3355 const unsigned int SVE_UNPACK_TWICE = 16;
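// For example (added note): the nxv8f32 <- nxv8i16 sitofp entry in the table
// below is costed as SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST = 4 + 2 = 6, i.e.
// one (deliberately over-costed) unpack stage plus a convert on each half.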
3356
3357 static const TypeConversionCostTblEntry ConversionTbl[] = {
3358 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3359 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3360 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3361 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3362 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3363 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3364 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3365 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3366 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3367 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3368 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3369 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3370 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3371 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3372 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3373 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3374 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3375 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3376 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3377 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3378
3379 // Truncations on nxvmiN
3380 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3381 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3382 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3383 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3384 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3385 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3386 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3387 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3388 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3389 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3390 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3391 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3392 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3393 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3394 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3395 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3396 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3397 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3398 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3399 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3400 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3401 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3402 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3403 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3404 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3405 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3406 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3407 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3408 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3409 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3410 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3411 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3412 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3413
3414 // The number of shll instructions for the extension.
3415 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3416 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3417 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3418 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3419 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3420 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3421 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3422 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3423 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3424 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3425 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3426 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3427 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3428 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3429 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3430 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3431
3432 // FP Ext and trunc
3433 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3434 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3435 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3436 // FP16
3437 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3438 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3439 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3440 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3441 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3442 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3443 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3444 // BF16 (uses shift)
3445 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3446 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3447 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3448 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3449 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3450 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3451 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3452 // FP Ext and trunc
3453 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3454 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3455 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3456 // FP16
3457 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3458 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3459 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3460 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3461 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3462 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3463 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3464 // BF16 (more complex, with +bf16 is handled above)
3465 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3466 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3467 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3468 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3469 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3470 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3471 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3472 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3473
3474 // LowerVectorINT_TO_FP:
3475 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3476 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3477 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3478 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3479 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3480 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3481
3482 // SVE: to nxv2f16
3483 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3484 SVE_EXT_COST + SVE_FCVT_COST},
3485 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3486 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3487 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3488 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3489 SVE_EXT_COST + SVE_FCVT_COST},
3490 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3491 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3492 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3493
3494 // SVE: to nxv4f16
3495 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3496 SVE_EXT_COST + SVE_FCVT_COST},
3497 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3498 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3499 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3500 SVE_EXT_COST + SVE_FCVT_COST},
3501 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3502 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3503
3504 // SVE: to nxv8f16
3505 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3506 SVE_EXT_COST + SVE_FCVT_COST},
3507 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3508 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3509 SVE_EXT_COST + SVE_FCVT_COST},
3510 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3511
3512 // SVE: to nxv16f16
3513 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3514 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3515 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3516 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3517
3518 // Complex: to v2f32
3519 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3520 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3521 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3522 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3523
3524 // SVE: to nxv2f32
3525 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3526 SVE_EXT_COST + SVE_FCVT_COST},
3527 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3528 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3529 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3530 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3531 SVE_EXT_COST + SVE_FCVT_COST},
3532 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3533 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3534 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3535
3536 // Complex: to v4f32
3537 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3538 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3539 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3540 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3541
3542 // SVE: to nxv4f32
3543 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3544 SVE_EXT_COST + SVE_FCVT_COST},
3545 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3546 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3547 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3548 SVE_EXT_COST + SVE_FCVT_COST},
3549 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3550 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3551
3552 // Complex: to v8f32
3553 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3554 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3555 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3556 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3557
3558 // SVE: to nxv8f32
3559 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3560 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3561 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3562 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3563 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3564 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3565 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3566 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3567
3568 // SVE: to nxv16f32
3569 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3570 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3571 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3572 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3573
3574 // Complex: to v16f32
3575 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3576 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3577
3578 // Complex: to v2f64
3579 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3580 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3581 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3582 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3583 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3584 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3585
3586 // SVE: to nxv2f64
3587 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3588 SVE_EXT_COST + SVE_FCVT_COST},
3589 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3590 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3591 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3592 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3593 SVE_EXT_COST + SVE_FCVT_COST},
3594 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3595 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3596 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3597
3598 // Complex: to v4f64
3599 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3600 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3601
3602 // SVE: to nxv4f64
3603 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3604 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3605 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3606 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3607 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3608 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3609 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3610 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3611 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3612 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3613 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3614 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3615
3616 // SVE: to nxv8f64
3617 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3618 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3619 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3620 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3621 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3622 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3623 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3624 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3625
3626 // LowerVectorFP_TO_INT
3627 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3628 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3629 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3630 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3631 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3632 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3633
3634 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3635 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3636 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3637 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3638 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3639 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3640 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3641
3642 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3643 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3644 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3645 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3646 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3647
3648 // Complex, from nxv2f32.
3649 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3650 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3651 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3652 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3653 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3654 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3655 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3656 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3657
3658 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3659 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3660 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3661 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3662 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3663 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3664 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3665
3666 // Complex, from nxv2f64.
3667 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3668 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3669 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3670 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3671 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3672 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3673 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3674 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3675 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3676 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3677
3678 // Complex, from nxv4f32.
3679 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3680 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3681 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3682 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3683 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3684 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3685 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3686 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3687 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3688 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3689
3690 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3691 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3692 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3693 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3694 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3695
3696 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3697 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3698 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3699 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3700 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3701 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3702 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3703
3704 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3705 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3706 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3707 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3708 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3709
3710 // Complex, from nxv8f16.
3711 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3712 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3713 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3714 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3715 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3716 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3717 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3718 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3719 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3720 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3721
3722 // Complex, from nxv4f16.
3723 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3724 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3725 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3726 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3727 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3728 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3729 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3730 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3731
3732 // Complex, from nxv2f16.
3733 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3734 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3735 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3736 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3737 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3738 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3739 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3740 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3741
3742 // Truncate from nxvmf32 to nxvmf16.
3743 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3744 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3745 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3746
3747 // Truncate from nxvmf32 to nxvmbf16.
3748 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3749 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3750 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3751
3752 // Truncate from nxvmf64 to nxvmf16.
3753 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3754 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3755 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3756
3757 // Truncate from nxvmf64 to nxvmbf16.
3758 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3759 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3760 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3761
3762 // Truncate from nxvmf64 to nxvmf32.
3763 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3764 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3765 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3766
3767 // Extend from nxvmf16 to nxvmf32.
3768 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3769 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3770 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3771
3772 // Extend from nxvmbf16 to nxvmf32.
3773 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3774 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3775 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3776
3777 // Extend from nxvmf16 to nxvmf64.
3778 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3779 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3780 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3781
3782 // Extend from nxvmbf16 to nxvmf64.
3783 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3784 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3785 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3786
3787 // Extend from nxvmf32 to nxvmf64.
3788 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3789 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3790 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3791
3792 // Bitcasts from float to integer
3793 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3794 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3795 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3796
3797 // Bitcasts from integer to float
3798 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3799 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3800 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3801
3802 // Add cost for extending to illegal (too wide) scalable vectors.
3803 // zero/sign extend are implemented by multiple unpack operations,
3804 // where each operation has a cost of 1.
3805 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3806 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3807 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3808 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3809 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3810 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3811
3812 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3813 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3814 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3815 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3816 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3817 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3818 };
3819
3820 // We estimate the cost of a fixed-length operation performed on SVE
3821 // registers as the number of SVE registers required to hold the fixed
3822 // type times the cost of the operation on a single SVE register.
3823 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3824 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3825 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3826 ST->useSVEForFixedLengthVectors(WiderTy)) {
3827 std::pair<InstructionCost, MVT> LT =
3828 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3829 unsigned NumElements =
3830 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3831 return LT.first *
3832 getCastInstrCost(
3833 Opcode,
3834 ScalableVectorType::get(Dst->getScalarType(), NumElements),
3835 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3836 CostKind, I);
3837 }
3838
3839 if (const auto *Entry = ConvertCostTableLookup(
3840 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3841 return Entry->Cost;
3842
3843 static const TypeConversionCostTblEntry FP16Tbl[] = {
3844 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3845 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3846 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3847 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3848 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3849 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3850 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3851 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3852 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3853 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3854 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3855 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3856 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3857 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3858 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3859 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3860 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3861 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3862 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3863 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3864 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3865 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3866 };
3867
3868 if (ST->hasFullFP16())
3869 if (const auto *Entry = ConvertCostTableLookup(
3870 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3871 return Entry->Cost;
3872
3873 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3874 // double-rounding issues.
3875 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3876 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3878 return cast<FixedVectorType>(Dst)->getNumElements() *
3879 getCastInstrCost(Opcode, Dst->getScalarType(),
3880 Src->getScalarType(), CCH, CostKind) +
3882 true, CostKind) +
3884 false, CostKind);
3885
3886 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3888 ST->isSVEorStreamingSVEAvailable() &&
3889 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3891 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3893 // The standard behaviour in the backend for these cases is to split the
3894 // extend up into two parts:
3895 // 1. Perform an extending load or masked load up to the legal type.
3896 // 2. Extend the loaded data to the final type.
3897 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3898 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3899 InstructionCost Part1 = getCastInstrCost(
3900 Opcode, LegalTy, Src, CCH, CostKind, I);
3901 InstructionCost Part2 = getCastInstrCost(
3902 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3903 return Part1 + Part2;
3904 }
3905
3906 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3907 // but we also want to include the TTI::CastContextHint::Masked case too.
3908 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3909 CCH == TTI::CastContextHint::Masked &&
3910 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3911 return 0;
3912
3913 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3914}
3915
3916InstructionCost
3917AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3918 VectorType *VecTy, unsigned Index,
3919 TTI::TargetCostKind CostKind) const {
3920
3921 // Make sure we were given a valid extend opcode.
3922 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3923 "Invalid opcode");
3924
3925 // We are extending an element we extract from a vector, so the source type
3926 // of the extend is the element type of the vector.
3927 auto *Src = VecTy->getElementType();
3928
3929 // Sign- and zero-extends are for integer types only.
3930 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3931
3932 // Get the cost for the extract. We compute the cost (if any) for the extend
3933 // below.
3934 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3935 CostKind, Index, nullptr, nullptr);
3936
3937 // Legalize the types.
3938 auto VecLT = getTypeLegalizationCost(VecTy);
3939 auto DstVT = TLI->getValueType(DL, Dst);
3940 auto SrcVT = TLI->getValueType(DL, Src);
3941
3942 // If the resulting type is still a vector and the destination type is legal,
3943 // we may get the extension for free. If not, get the default cost for the
3944 // extend.
3945 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3946 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3947 CostKind);
3948
3949 // The destination type should be larger than the element type. If not, get
3950 // the default cost for the extend.
3951 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3952 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3953 CostKind);
3954
3955 switch (Opcode) {
3956 default:
3957 llvm_unreachable("Opcode should be either SExt or ZExt");
3958
3959 // For sign-extends, we only need a smov, which performs the extension
3960 // automatically.
3961 case Instruction::SExt:
3962 return Cost;
3963
3964 // For zero-extends, the extend is performed automatically by a umov unless
3965 // the destination type is i64 and the element type is i8 or i16.
3966 case Instruction::ZExt:
3967 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3968 return Cost;
3969 }
3970
3971 // If we are unable to perform the extend for free, get the default cost.
3972 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3973 CostKind);
3974}
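// Illustrative example (added note; register names are hypothetical): the
// free sign-extending extract modelled above maps to a single instruction
// such as "smov w0, v0.h[1]", which performs the lane extract and the
// i16 -> i32 sign-extension together.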
3975
3976InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3977 TTI::TargetCostKind CostKind,
3978 const Instruction *I) const {
3979 if (CostKind != TTI::TCK_RecipThroughput)
3980 return Opcode == Instruction::PHI ? 0 : 1;
3981 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3982 // Branches are assumed to be predicted.
3983 return 0;
3984}
3985
3986InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3987 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3988 const Instruction *I, Value *Scalar,
3989 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
3990 TTI::VectorInstrContext VIC) const {
3991 assert(Val->isVectorTy() && "This must be a vector type");
3992
3993 if (Index != -1U) {
3994 // Legalize the type.
3995 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3996
3997 // This type is legalized to a scalar type.
3998 if (!LT.second.isVector())
3999 return 0;
4000
4001 // The type may be split. For fixed-width vectors we can normalize the
4002 // index to the new type.
4003 if (LT.second.isFixedLengthVector()) {
4004 unsigned Width = LT.second.getVectorNumElements();
4005 Index = Index % Width;
4006 }
4007
4008 // The element at index zero is already inside the vector, so inserting or
4009 // extracting it is free. For an insert-element or extract-element
4010 // instruction operating on integers, however, an explicit FPR -> GPR move
4011 // is still needed, so it has a non-zero cost.
4012 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
4013 return 0;
4014
4015 // This is recognising an "LD1 (single structure)" load to one lane of one
4016 // register. I.e., if this is an `insertelement` instruction whose second
4017 // operand is a load, then we will generate an LD1, which is an expensive
4018 // instruction on some uArchs.
4019 if (VIC == TTI::VectorInstrContext::Load) {
4020 if (ST->hasFastLD1Single())
4021 return 0;
4022 return CostKind == TTI::TCK_CodeSize
4023 ? 0
4024 : ST->getVectorInsertExtractBaseCost();
4025 }
4026
4027 // i1 inserts and extracts will include an extra cset or cmp of the vector
4028 // value. Increase the cost by 1 to account for this.
4029 if (Val->getScalarSizeInBits() == 1)
4030 return CostKind == TTI::TCK_CodeSize
4031 ? 2
4032 : ST->getVectorInsertExtractBaseCost() + 1;
4033
4034 // FIXME:
4035 // If the extract-element and insert-element instructions could be
4036 // simplified away (e.g., could be combined into users by looking at use-def
4037 // context), they have no cost. This is not done in the first place for
4038 // compile-time considerations.
4039 }
4040
4041 // In the case of Neon, if there exists an extractelement from lane != 0 such that
4042 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4043 // 2. extractelement result feeds into fmul.
4044 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4045 // equivalent to 0.
4046 // then the extractelement can be merged with fmul in the backend and it
4047 // incurs no cost.
4048 // e.g.
4049 // define double @foo(<2 x double> %a) {
4050 // %1 = extractelement <2 x double> %a, i32 0
4051 // %2 = extractelement <2 x double> %a, i32 1
4052 // %res = fmul double %1, %2
4053 // ret double %res
4054 // }
4055 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4056 auto ExtractCanFuseWithFmul = [&]() {
4057 // We bail out if the extract is from lane 0.
4058 if (Index == 0)
4059 return false;
4060
4061 // Check if the scalar element type of the vector operand of ExtractElement
4062 // instruction is one of the allowed types.
4063 auto IsAllowedScalarTy = [&](const Type *T) {
4064 return T->isFloatTy() || T->isDoubleTy() ||
4065 (T->isHalfTy() && ST->hasFullFP16());
4066 };
4067
4068 // Check if the extractelement user is scalar fmul.
4069 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4070 // Check if the user is scalar fmul.
4071 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4072 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4073 !BO->getType()->isVectorTy();
4074 };
4075
4076 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4077 // certain scalar type and a certain vector register width.
4078 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4079 auto RegWidth =
4080 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
4081 .getFixedValue();
4082 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4083 };
4084
4085 // Check if the type constraints on input vector type and result scalar type
4086 // of extractelement instruction are satisfied.
4087 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4088 return false;
4089
4090 if (Scalar) {
4091 DenseMap<User *, unsigned> UserToExtractIdx;
4092 for (auto *U : Scalar->users()) {
4093 if (!IsUserFMulScalarTy(U))
4094 return false;
4095 // Recording entry for the user is important. Index value is not
4096 // important.
4097 UserToExtractIdx[U];
4098 }
4099 if (UserToExtractIdx.empty())
4100 return false;
4101 for (auto &[S, U, L] : ScalarUserAndIdx) {
4102 for (auto *U : S->users()) {
4103 if (UserToExtractIdx.contains(U)) {
4104 auto *FMul = cast<BinaryOperator>(U);
4105 auto *Op0 = FMul->getOperand(0);
4106 auto *Op1 = FMul->getOperand(1);
4107 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4108 UserToExtractIdx[U] = L;
4109 break;
4110 }
4111 }
4112 }
4113 }
4114 for (auto &[U, L] : UserToExtractIdx) {
4115 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4116 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4117 return false;
4118 }
4119 } else {
4120 const auto *EE = cast<ExtractElementInst>(I);
4121
4122 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4123 if (!IdxOp)
4124 return false;
4125
4126 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4127 if (!IsUserFMulScalarTy(U))
4128 return false;
4129
4130 // Check if the other operand of extractelement is also extractelement
4131 // from lane equivalent to 0.
4132 const auto *BO = cast<BinaryOperator>(U);
4133 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4134 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4135 if (OtherEE) {
4136 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4137 if (!IdxOp)
4138 return false;
4139 return IsExtractLaneEquivalentToZero(
4140 cast<ConstantInt>(OtherEE->getIndexOperand())
4141 ->getValue()
4142 .getZExtValue(),
4143 OtherEE->getType()->getScalarSizeInBits());
4144 }
4145 return true;
4146 });
4147 }
4148 return true;
4149 };
4150
4151 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4152 ExtractCanFuseWithFmul())
4153 return 0;
4154
4155 // All other insert/extracts cost this much.
4156 return CostKind == TTI::TCK_CodeSize ? 1
4157 : ST->getVectorInsertExtractBaseCost();
4158}
4159
4160InstructionCost AArch64TTIImpl::getVectorInstrCost(
4161 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4162 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
4163 // Treat an insert at lane 0 into a poison vector as having zero cost. This
4164 // ensures vector broadcasts via an insert + shuffle (which will be lowered
4165 // to a single dup) are treated as cheap.
4166 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4167 isa<PoisonValue>(Op0))
4168 return 0;
4169 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr,
4170 nullptr, {}, VIC);
4171}
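// Illustrative example (added note; value names are hypothetical): the
// zero-cost case above covers broadcast idioms such as
//   %v = insertelement <4 x i32> poison, i32 %x, i64 0
//   %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> zeroinitializer
// which lowers to a single "dup v0.4s, w0".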
4172
4173InstructionCost AArch64TTIImpl::getVectorInstrCost(
4174 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4175 Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4176 TTI::VectorInstrContext VIC) const {
4177 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4178 ScalarUserAndIdx, VIC);
4179}
4180
4183 TTI::TargetCostKind CostKind, unsigned Index,
4184 TTI::VectorInstrContext VIC) const {
4185 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I,
4186 nullptr, {}, VIC);
4187}
4188
4192 unsigned Index) const {
4193 if (isa<FixedVectorType>(Val))
4195 Index);
4196
4197 // This typically requires both a while and a lastb instruction in order
4198 // to extract the last element. If this is in a loop, the while
4199 // instruction can at least be hoisted out, although it will consume a
4200 // predicate register. The cost should therefore be higher than the base
4201 // extract cost, which is 2 for most CPUs.
4202 return CostKind == TTI::TCK_CodeSize
4203 ? 2
4204 : ST->getVectorInsertExtractBaseCost() + 1;
4205}
4206
4207InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4208 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4209 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4210 TTI::VectorInstrContext VIC) const {
4211 if (isa<ScalableVectorType>(Ty))
4212 return InstructionCost::getInvalid();
4213 if (Ty->getElementType()->isFloatingPointTy())
4214 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4215 CostKind);
4216 unsigned VecInstCost =
4217 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4218 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4219}
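// For example (added note): with four demanded elements, both Insert and
// Extract requested, and a base insert/extract cost of 2, the formula above
// yields 4 * (1 + 1) * 2 = 16.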
4220
4221std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4222 Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4223 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4224 std::function<InstructionCost(Type *)> InstCost) const {
4225 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4226 return std::nullopt;
4227 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4228 return std::nullopt;
4229 if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
4230 ST->isNonStreamingSVEorSME2Available())
4231 return std::nullopt;
4232
4233 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4234 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4235 TTI::CastContextHint::None, CostKind);
4236 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4237 Cost *= 2;
4238 Cost += InstCost(PromotedTy);
4239 if (IncludeTrunc)
4240 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4241 TTI::CastContextHint::None, CostKind);
4242 return Cost;
4243}
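// Illustrative example (added note): for an fadd on <4 x half> without
// +fullfp16, this models an fpext of each non-constant operand to
// <4 x float>, the operation on <4 x float> (via InstCost), and, when
// IncludeTrunc is set, an fptrunc back to <4 x half>.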
4244
4246 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4248 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4249
4250 // The code-generator is currently not able to handle scalable vectors
4251 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4252 // it. This change will be removed when code-generation for these types is
4253 // sufficiently reliable.
4254 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4255 if (VTy->getElementCount() == ElementCount::getScalable(1))
4256 return InstructionCost::getInvalid();
4257
4258 // TODO: Handle more cost kinds.
4259 if (CostKind != TTI::TCK_RecipThroughput)
4260 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4261 Op2Info, Args, CxtI);
4262
4263 // Legalize the type.
4264 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4265 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4266
4267 // Increase the cost for half and bfloat types if not architecturally
4268 // supported.
4269 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4270 ISD == ISD::FDIV || ISD == ISD::FREM)
4271 if (auto PromotedCost = getFP16BF16PromoteCost(
4272 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4273 // There is not native support for fdiv/frem even with +sve-b16b16.
4274 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4275 [&](Type *PromotedTy) {
4276 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4277 Op1Info, Op2Info);
4278 }))
4279 return *PromotedCost;
4280
4281 // If the operation is a widening instruction (smull or umull) and both
4282 // operands are extends the cost can be cheaper by considering that the
4283 // operation will operate on the narrowest type size possible (double the
4284 // largest input size) and a further extend.
4285 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4286 if (ExtTy != Ty)
4287 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4288 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4289 TTI::CastContextHint::None, CostKind);
4290 return LT.first;
4291 }
4292
4293 switch (ISD) {
4294 default:
4295 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4296 Op2Info);
4297 case ISD::SREM:
4298 case ISD::SDIV:
4299 /*
4300 Notes for sdiv/srem specific costs:
4301 1. This only considers the cases where the divisor is constant, uniform and
4302 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4303 result in some form of (ldr + adrp), corresponding to constant vectors, or
4304 scalarization of the division operation.
4305 2. Constant divisors, either negative in whole or partially, don't result in
4306 significantly different codegen as compared to positive constant divisors.
4307 So, we don't consider negative divisors separately.
4308 3. If the codegen is significantly different with SVE, it has been indicated
4309 using comments at appropriate places.
4310
4311 sdiv specific cases:
4312 -----------------------------------------------------------------------
4313 codegen | pow-of-2 | Type
4314 -----------------------------------------------------------------------
4315 add + cmp + csel + asr | Y | i64
4316 add + cmp + csel + asr | Y | i32
4317 -----------------------------------------------------------------------
4318
4319 srem specific cases:
4320 -----------------------------------------------------------------------
4321 codegen | pow-of-2 | Type
4322 -----------------------------------------------------------------------
4323 negs + and + and + csneg | Y | i64
4324 negs + and + and + csneg | Y | i32
4325 -----------------------------------------------------------------------
4326
4327 other sdiv/srem cases:
4328 -------------------------------------------------------------------------
4329 common codegen | + srem | + sdiv | pow-of-2 | Type
4330 -------------------------------------------------------------------------
4331 smulh + asr + add + add | - | - | N | i64
4332 smull + lsr + add + add | - | - | N | i32
4333 usra | and + sub | sshr | Y | <2 x i64>
4334 2 * (scalar code) | - | - | N | <2 x i64>
4335 usra | bic + sub | sshr + neg | Y | <4 x i32>
4336 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4337 + sshr + usra | | | |
4338 -------------------------------------------------------------------------
4339 */
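// Illustrative example (added note; register names are hypothetical): for a
// scalar "sdiv i32 %a, 8" the expected sequence from the table above is
//   add  w8, w0, #7
//   cmp  w0, #0
//   csel w8, w8, w0, lt
//   asr  w0, w8, #3
// which the "3 * AddCost + AsrCost" estimate below approximates.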
4340 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4341 InstructionCost AddCost =
4342 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4343 Op1Info.getNoProps(), Op2Info.getNoProps());
4344 InstructionCost AsrCost =
4345 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4346 Op1Info.getNoProps(), Op2Info.getNoProps());
4347 InstructionCost MulCost =
4348 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4349 Op1Info.getNoProps(), Op2Info.getNoProps());
4350 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4351 // have similar cost.
4352 auto VT = TLI->getValueType(DL, Ty);
4353 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4354 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4355 // Neg can be folded into the asr instruction.
4356 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4357 : (3 * AsrCost + AddCost);
4358 } else {
4359 return MulCost + AsrCost + 2 * AddCost;
4360 }
4361 } else if (VT.isVector()) {
4362 InstructionCost UsraCost = 2 * AsrCost;
4363 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4364 // Division with scalable types corresponds to native 'asrd'
4365 // instruction when SVE is available.
4366 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4367
4368 // One more for the negation in SDIV.
4369 InstructionCost Cost =
4370 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4371 if (Ty->isScalableTy() && ST->hasSVE())
4372 Cost += 2 * AsrCost;
4373 else {
4374 Cost +=
4375 UsraCost +
4376 (ISD == ISD::SDIV
4377 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4378 : 2 * AddCost);
4379 }
4380 return Cost;
4381 } else if (LT.second == MVT::v2i64) {
4382 return VT.getVectorNumElements() *
4383 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4384 Op1Info.getNoProps(),
4385 Op2Info.getNoProps());
4386 } else {
4387 // When SVE is available, we get:
4388 // smulh + lsr + add/sub + asr + add/sub.
4389 if (Ty->isScalableTy() && ST->hasSVE())
4390 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4391 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4392 }
4393 }
4394 }
4395 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4396 LT.second.isFixedLengthVector()) {
4397 // FIXME: When the constant vector is non-uniform, this may result in
4398 // loading the vector from constant pool or in some cases, may also result
4399 // in scalarization. For now, we are approximating this with the
4400 // scalarization cost.
4401 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4402 CostKind, -1, nullptr, nullptr);
4403 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4404 CostKind, -1, nullptr, nullptr);
4405 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4406 return ExtractCost + InsertCost +
4407 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4408 CostKind, Op1Info.getNoProps(),
4409 Op2Info.getNoProps());
4410 }
4411 [[fallthrough]];
4412 case ISD::UDIV:
4413 case ISD::UREM: {
4414 auto VT = TLI->getValueType(DL, Ty);
4415 if (Op2Info.isConstant()) {
4416 // If the operand is a power of 2 we can use the shift or and cost.
4417 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4418 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4419 Op1Info.getNoProps(),
4420 Op2Info.getNoProps());
4421 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4422 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4423 Op1Info.getNoProps(),
4424 Op2Info.getNoProps());
4425
4426 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4427 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4428 // The MULHU will be expanded to UMULL for the types not listed below,
4429 // and will become a pair of UMULL+UMULL2 for 128-bit vectors.
4430 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4431 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4432 LT.second == MVT::nxv16i8;
4433 bool Is128bit = LT.second.is128BitVector();
4434
4435 InstructionCost MulCost =
4436 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4437 Op1Info.getNoProps(), Op2Info.getNoProps());
4438 InstructionCost AddCost =
4439 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4440 Op1Info.getNoProps(), Op2Info.getNoProps());
4441 InstructionCost ShrCost =
4442 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4443 Op1Info.getNoProps(), Op2Info.getNoProps());
4444 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4445 (HasMULH ? 0 : ShrCost) + // UMULL shift
4446 AddCost * 2 + ShrCost;
4447 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4448 }
4449 }
4450
4451 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4452 // emitted by the backend even when those functions are not declared in the
4453 // module.
4454 if (!VT.isVector() && VT.getSizeInBits() > 64)
4455 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4456
4457 InstructionCost Cost = BaseT::getArithmeticInstrCost(
4458 Opcode, Ty, CostKind, Op1Info, Op2Info);
4459 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4460 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4461 // SDIV/UDIV operations are lowered using SVE, then we can have less
4462 // costs.
4463 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4464 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4465 static const CostTblEntry DivTbl[]{
4466 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4467 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4468 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4469 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4470 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4471 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4472
4473 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4474 if (nullptr != Entry)
4475 return Entry->Cost;
4476 }
4477 // For 8/16-bit elements, the cost is higher because the type
4478 // requires promotion and possibly splitting:
4479 if (LT.second.getScalarType() == MVT::i8)
4480 Cost *= 8;
4481 else if (LT.second.getScalarType() == MVT::i16)
4482 Cost *= 4;
4483 return Cost;
4484 } else {
4485 // If one of the operands is a uniform constant then the cost for each
4486 // element is Cost for insertion, extraction and division.
4487 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4488 // operation with scalar type
4489 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4490 (Op2Info.isConstant() && Op2Info.isUniform())) {
4491 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4492 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4493 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4494 return (4 + DivCost) * VTy->getNumElements();
4495 }
4496 }
4497 // On AArch64, without SVE, vector divisions are expanded
4498 // into scalar divisions of each pair of elements.
4499 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4500 -1, nullptr, nullptr);
4501 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4502 nullptr, nullptr);
4503 }
4504
4505 // TODO: if one of the arguments is scalar, then it's not necessary to
4506 // double the cost of handling the vector elements.
4507 Cost += Cost;
4508 }
4509 return Cost;
4510 }
4511 case ISD::MUL:
4512 // When SVE is available, then we can lower the v2i64 operation using
4513 // the SVE mul instruction, which has a lower cost.
4514 if (LT.second == MVT::v2i64 && ST->hasSVE())
4515 return LT.first;
4516
4517 // When SVE is not available, there is no MUL.2d instruction,
4518 // which means mul <2 x i64> is expensive as elements are extracted
4519 // from the vectors and the muls scalarized.
4520 // As getScalarizationOverhead is a bit too pessimistic, we
4521 // estimate the cost for a i64 vector directly here, which is:
4522 // - four 2-cost i64 extracts,
4523 // - two 2-cost i64 inserts, and
4524 // - two 1-cost muls.
4525 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4526 // LT.first = 2 the cost is 28.
4527 if (LT.second != MVT::v2i64)
4528 return LT.first;
4529 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4530 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4531 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4532 nullptr, nullptr) *
4533 2 +
4534 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4535 nullptr, nullptr));
4536 case ISD::ADD:
4537 case ISD::XOR:
4538 case ISD::OR:
4539 case ISD::AND:
4540 case ISD::SRL:
4541 case ISD::SRA:
4542 case ISD::SHL:
4543 // These nodes are marked as 'custom' for combining purposes only.
4544 // We know that they are legal. See LowerAdd in ISelLowering.
4545 return LT.first;
4546
4547 case ISD::FNEG:
4548 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4549 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4550 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4551 CxtI &&
4552 ((CxtI->hasOneUse() &&
4553 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4554 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4555 return 0;
4556 [[fallthrough]];
4557 case ISD::FADD:
4558 case ISD::FSUB:
4559 if (!Ty->getScalarType()->isFP128Ty())
4560 return LT.first;
4561 [[fallthrough]];
4562 case ISD::FMUL:
4563 case ISD::FDIV:
4564 // These nodes are marked as 'custom' just to lower them to SVE.
4565 // We know said lowering will incur no additional cost.
4566 if (!Ty->getScalarType()->isFP128Ty())
4567 return 2 * LT.first;
4568
4569 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4570 Op2Info);
4571 case ISD::FREM:
4572 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4573 // those functions are not declared in the module.
4574 if (!Ty->isVectorTy())
4575 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4576 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4577 Op2Info);
4578 }
4579}
4580
4583 const SCEV *Ptr,
4585 // Address computations in vectorized code with non-consecutive addresses will
4586 // likely result in more instructions compared to scalar code where the
4587 // computation can more often be merged into the index mode. The resulting
4588 // extra micro-ops can significantly decrease throughput.
4589 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4590 int MaxMergeDistance = 64;
4591
4592 if (PtrTy->isVectorTy() && SE &&
4593 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4594 return NumVectorInstToHideOverhead;
4595
4596 // In many cases the address computation is not merged into the instruction
4597 // addressing mode.
4598 return 1;
4599}
4600
4601 /// Check whether Opcode1 has lower throughput than Opcode2 according to the
4602 /// scheduling model.
4604 unsigned Opcode1, unsigned Opcode2) const {
4605 const MCSchedModel &Sched = ST->getSchedModel();
4606 const TargetInstrInfo *TII = ST->getInstrInfo();
4607 if (!Sched.hasInstrSchedModel())
4608 return false;
4609
4610 const MCSchedClassDesc *SCD1 =
4611 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4612 const MCSchedClassDesc *SCD2 =
4613 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4614 // We cannot handle variant scheduling classes without an MI. If we need to
4615 // support them for any of the instructions we query the information of, we
4616 // might need to add a way to resolve them without an MI, or stop using the
4617 // scheduling info.
4618 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4619 "Cannot handle variant scheduling classes without an MI");
4620 if (!SCD1->isValid() || !SCD2->isValid())
4621 return false;
4622
4623 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4625}
4626
4628 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4630 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4631 // Some vector selects that are wider than the register width are not
4632 // lowered well. TODO: Improve this with different cost kinds.
4633 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4634 // We would need this many instructions to hide the scalarization happening.
4635 const int AmortizationCost = 20;
4636
4637 // If VecPred is not set, check if we can get a predicate from the context
4638 // instruction, if its type matches the requested ValTy.
4639 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4640 CmpPredicate CurrentPred;
4641 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4642 m_Value())))
4643 VecPred = CurrentPred;
4644 }
4645 // Check if we have a compare/select chain that can be lowered using
4646 // a (F)CMxx & BFI pair.
4647 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4648 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4649 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4650 VecPred == CmpInst::FCMP_UNE) {
4651 static const auto ValidMinMaxTys = {
4652 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4653 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4654 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4655
4656 auto LT = getTypeLegalizationCost(ValTy);
4657 if (any_of(ValidMinMaxTys, equal_to(LT.second)) ||
4658 (ST->hasFullFP16() &&
4659 any_of(ValidFP16MinMaxTys, equal_to(LT.second))))
4660 return LT.first;
4661 }
4662
4663 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4664 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4665 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4666 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4667 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4668 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4669 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4670 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4671 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4672 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4673 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4674 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4675
4676 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4677 EVT SelValTy = TLI->getValueType(DL, ValTy);
4678 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4679 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4680 SelCondTy.getSimpleVT(),
4681 SelValTy.getSimpleVT()))
4682 return Entry->Cost;
4683 }
4684 }
4685
4686 if (Opcode == Instruction::FCmp) {
4687 if (auto PromotedCost = getFP16BF16PromoteCost(
4688 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4689 // TODO: Consider costing SVE FCMPs.
4690 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4692 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4693 CostKind, Op1Info, Op2Info);
4694 if (isa<VectorType>(PromotedTy))
4696 Instruction::Trunc,
4700 return Cost;
4701 }))
4702 return *PromotedCost;
4703
4704 auto LT = getTypeLegalizationCost(ValTy);
4705 // Model unknown fp compares as a libcall.
4706 if (LT.second.getScalarType() != MVT::f64 &&
4707 LT.second.getScalarType() != MVT::f32 &&
4708 LT.second.getScalarType() != MVT::f16)
4709 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4710 {ValTy, ValTy}, CostKind);
4711
4712 // Some comparison operators require expanding to multiple compares + or.
4713 unsigned Factor = 1;
4714 if (!CondTy->isVectorTy() &&
4715 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4716 Factor = 2; // fcmp with 2 selects
4717 else if (isa<FixedVectorType>(ValTy) &&
4718 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4719 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4720 Factor = 3; // fcmxx+fcmyy+or
4721 else if (isa<ScalableVectorType>(ValTy) &&
4722 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4723 Factor = 3; // fcmxx+fcmyy+or
4724
4725 if (isa<ScalableVectorType>(ValTy) &&
4727 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4728 AArch64::FCMEQv4f32))
4729 Factor *= 2;
4730
4731 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4732 }
4733
4734 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4735 // icmp(and, 0) as free, as we can make use of ands, but only if the
4736 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4737 // providing it will not cause performance regressions.
4738 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4739 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4740 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4741 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4742 if (match(I->getOperand(1), m_Zero()))
4743 return 0;
4744
4745 // x >= 1 / x < 1 -> x > 0 / x <= 0
4746 if (match(I->getOperand(1), m_One()) &&
4747 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4748 return 0;
4749
4750 // x <= -1 / x > -1 -> x > 0 / x <= 0
4751 if (match(I->getOperand(1), m_AllOnes()) &&
4752 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4753 return 0;
4754 }
4755
4756 // The base case handles scalable vectors fine for now, since it treats the
4757 // cost as 1 * legalization cost.
4758 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4759 Op1Info, Op2Info, I);
4760}
4761
4763AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4765 if (ST->requiresStrictAlign()) {
4766 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4767 // a bunch of instructions when strict align is enabled.
4768 return Options;
4769 }
4770 Options.AllowOverlappingLoads = true;
4771 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4772 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4773 // TODO: Though vector loads usually perform well on AArch64, on some targets
4774 // they may wake up the FP unit, which raises the power consumption. Perhaps
4775 // they could be used with no holds barred (-O3).
4776 Options.LoadSizes = {8, 4, 2, 1};
4777 Options.AllowedTailExpansions = {3, 5, 6};
4778 return Options;
4779}
4780
4782 return ST->hasSVE();
4783}
4784
4788 switch (MICA.getID()) {
4789 case Intrinsic::masked_scatter:
4790 case Intrinsic::masked_gather:
4791 return getGatherScatterOpCost(MICA, CostKind);
4792 case Intrinsic::masked_load:
4793 case Intrinsic::masked_store:
4794 return getMaskedMemoryOpCost(MICA, CostKind);
4795 }
4797}
4798
4802 Type *Src = MICA.getDataType();
4803
4804 if (useNeonVector(Src))
4806 auto LT = getTypeLegalizationCost(Src);
4807 if (!LT.first.isValid())
4809
4810 // Return an invalid cost for element types that we are unable to lower.
4811 auto *VT = cast<VectorType>(Src);
4812 if (VT->getElementType()->isIntegerTy(1))
4814
4815 // The code-generator is currently not able to handle scalable vectors
4816 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4817 // it. This change will be removed when code-generation for these types is
4818 // sufficiently reliable.
4819 if (VT->getElementCount() == ElementCount::getScalable(1))
4821
4822 return LT.first;
4823}
4824
4825 // This function returns the gather/scatter overhead, either from the
4826 // user-provided value or from the per-target specialized values in \p ST.
4827static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4828 const AArch64Subtarget *ST) {
4829 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4830 "Should be called on only load or stores.");
4831 switch (Opcode) {
4832 case Instruction::Load:
4833 if (SVEGatherOverhead.getNumOccurrences() > 0)
4834 return SVEGatherOverhead;
4835 return ST->getGatherOverhead();
4836 break;
4837 case Instruction::Store:
4838 if (SVEScatterOverhead.getNumOccurrences() > 0)
4839 return SVEScatterOverhead;
4840 return ST->getScatterOverhead();
4841 break;
4842 default:
4843 llvm_unreachable("Shouldn't have reached here");
4844 }
4845}
4846
4850
4851 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
4852 MICA.getID() == Intrinsic::vp_gather)
4853 ? Instruction::Load
4854 : Instruction::Store;
4855
4856 Type *DataTy = MICA.getDataType();
4857 Align Alignment = MICA.getAlignment();
4858 const Instruction *I = MICA.getInst();
4859
4860 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4862 auto *VT = cast<VectorType>(DataTy);
4863 auto LT = getTypeLegalizationCost(DataTy);
4864 if (!LT.first.isValid())
4866
4867 // Return an invalid cost for element types that we are unable to lower.
4868 if (!LT.second.isVector() ||
4869 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4870 VT->getElementType()->isIntegerTy(1))
4872
4873 // The code-generator is currently not able to handle scalable vectors
4874 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4875 // it. This change will be removed when code-generation for these types is
4876 // sufficiently reliable.
4877 if (VT->getElementCount() == ElementCount::getScalable(1))
4879
4880 ElementCount LegalVF = LT.second.getVectorElementCount();
4881 InstructionCost MemOpCost =
4882 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
4883 {TTI::OK_AnyValue, TTI::OP_None}, I);
4884 // Add on an overhead cost for using gathers/scatters.
4885 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
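// Overall, a gather/scatter is modeled as one overhead-scaled scalar memory
// access per vector element (using the maximum element count for scalable
// vectors), times the number of parts after legalization.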
4886 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
4887}
4888
4890 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4891}
4892
4894 Align Alignment,
4895 unsigned AddressSpace,
4897 TTI::OperandValueInfo OpInfo,
4898 const Instruction *I) const {
4899 EVT VT = TLI->getValueType(DL, Ty, true);
4900 // Type legalization can't handle structs
4901 if (VT == MVT::Other)
4902 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4903 CostKind);
4904
4905 auto LT = getTypeLegalizationCost(Ty);
4906 if (!LT.first.isValid())
4908
4909 // The code-generator is currently not able to handle scalable vectors
4910 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4911 // it. This change will be removed when code-generation for these types is
4912 // sufficiently reliable.
4913 // We also only support full register predicate loads and stores.
4914 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4915 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4916 (VTy->getElementType()->isIntegerTy(1) &&
4917 !VTy->getElementCount().isKnownMultipleOf(
4920
4921 // TODO: consider latency as well for TCK_SizeAndLatency.
4923 return LT.first;
4924
4926 return 1;
4927
4928 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4929 LT.second.is128BitVector() && Alignment < Align(16)) {
4930 // Unaligned stores are extremely inefficient. We don't split all
4931 // unaligned 128-bit stores because of the negative impact that has been
4932 // shown in practice on inlined block copy code.
4933 // We make such stores expensive so that we will only vectorize if there
4934 // are 6 other instructions getting vectorized.
4935 const int AmortizationCost = 6;
4936
4937 return LT.first * 2 * AmortizationCost;
4938 }
4939
4940 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4941 if (Ty->isPtrOrPtrVectorTy())
4942 return LT.first;
4943
4944 if (useNeonVector(Ty)) {
4945 // Check truncating stores and extending loads.
4946 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4947 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4948 if (VT == MVT::v4i8)
4949 return 2;
4950 // Otherwise we need to scalarize.
4951 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4952 }
4953 EVT EltVT = VT.getVectorElementType();
4954 unsigned EltSize = EltVT.getScalarSizeInBits();
4955 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
4956 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4957 return LT.first;
4958 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4959 // widening to v4i8, which produces suboptimal results.
4960 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4961 return LT.first;
4962
4963 // Check non-power-of-2 loads/stores for legal vector element types with
4964 // NEON. Non-power-of-2 memory ops will get broken down into a set of
4965 // smaller power-of-2 operations, including ld1/st1.
4966 LLVMContext &C = Ty->getContext();
4968 SmallVector<EVT> TypeWorklist;
4969 TypeWorklist.push_back(VT);
4970 while (!TypeWorklist.empty()) {
4971 EVT CurrVT = TypeWorklist.pop_back_val();
4972 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4973 if (isPowerOf2_32(CurrNumElements)) {
4974 Cost += 1;
4975 continue;
4976 }
4977
4978 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4979 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4980 TypeWorklist.push_back(
4981 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4982 }
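// For example, a v7i16 access is broken into v4i16 + v2i16 + v1i16
// operations by the worklist above, giving a cost of 3.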
4983 return Cost;
4984 }
4985
4986 return LT.first;
4987}
4988
4990 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4991 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4992 bool UseMaskForCond, bool UseMaskForGaps) const {
4993 assert(Factor >= 2 && "Invalid interleave factor");
4994 auto *VecVTy = cast<VectorType>(VecTy);
4995
4996 if (VecTy->isScalableTy() && !ST->hasSVE())
4998
4999 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
5000 // only have lowering for power-of-2 factors.
5001 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
5002 // InterleavedAccessPass for ld3/st3
5003 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
5005
5006 // Vectorization for masked interleaved accesses is only enabled for scalable
5007 // VF.
5008 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
5010
5011 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5012 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
5013 auto *SubVecTy =
5014 VectorType::get(VecVTy->getElementType(),
5015 VecVTy->getElementCount().divideCoefficientBy(Factor));
5016
5017 // ldN/stN only support legal vector types of size 64 or 128 in bits.
5018 // Accesses having vector types that are a multiple of 128 bits can be
5019 // matched to more than one ldN/stN instruction.
5020 bool UseScalable;
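// For example, an ld4 of <16 x i32> uses <4 x i32> sub-vectors, each of
// which fits a single 128-bit ldN register group, giving a cost of 4 * 1 = 4.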
5021 if (MinElts % Factor == 0 &&
5022 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
5023 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
5024 }
5025
5026 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5027 Alignment, AddressSpace, CostKind,
5028 UseMaskForCond, UseMaskForGaps);
5029}
5030
5035 for (auto *I : Tys) {
5036 if (!I->isVectorTy())
5037 continue;
5038 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
5039 128)
5040 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
5041 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
5042 }
5043 return Cost;
5044}
5045
5047 return ST->getMaxInterleaveFactor();
5048}
5049
5050// For Falkor, we want to avoid having too many strided loads in a loop since
5051// that can exhaust the HW prefetcher resources. We adjust the unroller
5052// MaxCount preference below to attempt to ensure unrolling doesn't create too
5053// many strided loads.
5054static void
5057 enum { MaxStridedLoads = 7 };
5058 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5059 int StridedLoads = 0;
5060 // FIXME? We could make this more precise by looking at the CFG and
5061 // e.g. not counting loads on each side of an if-then-else diamond.
5062 for (const auto BB : L->blocks()) {
5063 for (auto &I : *BB) {
5064 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5065 if (!LMemI)
5066 continue;
5067
5068 Value *PtrValue = LMemI->getPointerOperand();
5069 if (L->isLoopInvariant(PtrValue))
5070 continue;
5071
5072 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5073 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
5074 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5075 continue;
5076
5077 // FIXME? We could take pairing of unrolled load copies into account
5078 // by looking at the AddRec, but we would probably have to limit this
5079 // to loops with no stores or other memory optimization barriers.
5080 ++StridedLoads;
5081 // We've seen enough strided loads that seeing more won't make a
5082 // difference.
5083 if (StridedLoads > MaxStridedLoads / 2)
5084 return StridedLoads;
5085 }
5086 }
5087 return StridedLoads;
5088 };
5089
5090 int StridedLoads = countStridedLoads(L, SE);
5091 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5092 << " strided loads\n");
5093 // Pick the largest power of 2 unroll count that won't result in too many
5094 // strided loads.
5095 if (StridedLoads) {
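// For example, with 3 strided loads this picks
// MaxCount = 1 << Log2_32(7 / 3) = 2.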
5096 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5097 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5098 << UP.MaxCount << '\n');
5099 }
5100}
5101
5102// This function returns true if the loop:
5103// 1. Has a valid cost, and
5104// 2. Has a cost within the supplied budget.
5105// Otherwise it returns false.
5107 InstructionCost Budget,
5108 unsigned *FinalSize) {
5109 // Estimate the size of the loop.
5110 InstructionCost LoopCost = 0;
5111
5112 for (auto *BB : L->getBlocks()) {
5113 for (auto &I : *BB) {
5114 SmallVector<const Value *, 4> Operands(I.operand_values());
5115 InstructionCost Cost =
5116 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5117 // This can happen with intrinsics that don't currently have a cost model
5118 // or for some operations that require SVE.
5119 if (!Cost.isValid())
5120 return false;
5121
5122 LoopCost += Cost;
5123 if (LoopCost > Budget)
5124 return false;
5125 }
5126 }
5127
5128 if (FinalSize)
5129 *FinalSize = LoopCost.getValue();
5130 return true;
5131}
5132
5134 const AArch64TTIImpl &TTI) {
5135 // Only consider loops with unknown trip counts for which we can determine
5136 // a symbolic expression. Multi-exit loops with small known trip counts will
5137 // likely be unrolled anyway.
5138 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5140 return false;
5141
5142 // It might not be worth unrolling loops with low max trip counts. Restrict
5143 // this to max trip counts > 32 for now.
5144 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5145 if (MaxTC > 0 && MaxTC <= 32)
5146 return false;
5147
5148 // Make sure the loop size is <= 5.
5149 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5150 return false;
5151
5152 // Small search loops with multiple exits can be highly beneficial to unroll.
5153 // We only care about loops with exactly two exiting blocks, although each
5154 // block could jump to the same exit block.
5155 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5156 if (Blocks.size() != 2)
5157 return false;
5158
5159 if (any_of(Blocks, [](BasicBlock *BB) {
5160 return !isa<BranchInst>(BB->getTerminator());
5161 }))
5162 return false;
5163
5164 return true;
5165}
5166
5167 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5168 /// OOO engine's wide instruction window and various predictors.
5169static void
5172 const AArch64TTIImpl &TTI) {
5173 // Limit this to loops whose structure is highly likely to benefit from
5174 // runtime unrolling; that is, we exclude outer loops and loops with many
5175 // blocks (i.e. likely with complex control flow). Note that the heuristics
5176 // here may be overly conservative; we err on the side of avoiding runtime
5177 // unrolling rather than unrolling excessively, and they are all subject to further refinement.
5178 if (!L->isInnermost() || L->getNumBlocks() > 8)
5179 return;
5180
5181 // Loops with multiple exits are handled by common code.
5182 if (!L->getExitBlock())
5183 return;
5184
5185 // Check if the loop contains any reductions that could be parallelized when
5186 // unrolling. If so, enable partial unrolling if the trip count is known to
5187 // be a multiple of 2.
5188 bool HasParellelizableReductions =
5189 L->getNumBlocks() == 1 &&
5190 any_of(L->getHeader()->phis(),
5191 [&SE, L](PHINode &Phi) {
5192 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5193 }) &&
5194 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5195 if (HasParellelizableReductions &&
5196 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5197 UP.Partial = true;
5198 UP.MaxCount = 4;
5199 UP.AddAdditionalAccumulators = true;
5200 }
5201
5202 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5204 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5205 SE.getSmallConstantMaxTripCount(L) <= 32))
5206 return;
5207
5208 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5209 return;
5210
5212 return;
5213
5214 // Limit to loops with trip counts that are cheap to expand.
5215 UP.SCEVExpansionBudget = 1;
5216
5217 if (HasParellelizableReductions) {
5218 UP.Runtime = true;
5220 UP.AddAdditionalAccumulators = true;
5221 }
5222
5223 // Try to unroll small, few-block loops with a low budget if they have
5224 // load/store dependencies, to expose more parallel memory access streams,
5225 // or if they do little work inside a block (i.e. a load -> X -> store pattern).
5226 BasicBlock *Header = L->getHeader();
5227 BasicBlock *Latch = L->getLoopLatch();
5228 if (Header == Latch) {
5229 // Estimate the size of the loop.
5230 unsigned Size;
5231 unsigned Width = 10;
5232 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5233 return;
5234
5235 // Try to find an unroll count that maximizes the use of the instruction
5236 // window, i.e. trying to fetch as many instructions per cycle as possible.
5237 unsigned MaxInstsPerLine = 16;
5238 unsigned UC = 1;
5239 unsigned BestUC = 1;
5240 unsigned SizeWithBestUC = BestUC * Size;
5241 while (UC <= 8) {
5242 unsigned SizeWithUC = UC * Size;
5243 if (SizeWithUC > 48)
5244 break;
5245 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5246 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5247 BestUC = UC;
5248 SizeWithBestUC = BestUC * Size;
5249 }
5250 UC++;
5251 }
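// For example, with Size = 12 the search above settles on BestUC = 4,
// since 4 * 12 = 48 instructions is a multiple of MaxInstsPerLine.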
5252
5253 if (BestUC == 1)
5254 return;
5255
5256 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5258 for (auto *BB : L->blocks()) {
5259 for (auto &I : *BB) {
5261 if (!Ptr)
5262 continue;
5263 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5264 if (SE.isLoopInvariant(PtrSCEV, L))
5265 continue;
5266 if (isa<LoadInst>(&I)) {
5267 LoadedValuesPlus.insert(&I);
5268 // Include in-loop 1st users of loaded values.
5269 for (auto *U : I.users())
5270 if (L->contains(cast<Instruction>(U)))
5271 LoadedValuesPlus.insert(U);
5272 } else
5273 Stores.push_back(cast<StoreInst>(&I));
5274 }
5275 }
5276
5277 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5278 return LoadedValuesPlus.contains(SI->getOperand(0));
5279 }))
5280 return;
5281
5282 UP.Runtime = true;
5283 UP.DefaultUnrollRuntimeCount = BestUC;
5284 return;
5285 }
5286
5287 // Try to runtime-unroll loops with early-continues depending on loop-varying
5288 // loads; this helps with branch-prediction for the early-continues.
5289 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5291 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5292 !llvm::is_contained(Preds, Header) ||
5293 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5294 return;
5295
5296 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5297 [&](Instruction *I, unsigned Depth) -> bool {
5298 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5299 return false;
5300
5301 if (isa<LoadInst>(I))
5302 return true;
5303
5304 return any_of(I->operands(), [&](Value *V) {
5305 auto *I = dyn_cast<Instruction>(V);
5306 return I && DependsOnLoopLoad(I, Depth + 1);
5307 });
5308 };
5309 CmpPredicate Pred;
5310 Instruction *I;
5311 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5312 m_Value())) &&
5313 DependsOnLoopLoad(I, 0)) {
5314 UP.Runtime = true;
5315 }
5316}
5317
5320 OptimizationRemarkEmitter *ORE) const {
5321 // Enable partial unrolling and runtime unrolling.
5322 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5323
5324 UP.UpperBound = true;
5325
5326 // An inner loop is more likely to be hot, and the runtime check can be
5327 // hoisted out by the LICM pass, so the overhead is lower; try a larger
5328 // threshold to unroll more loops.
5329 if (L->getLoopDepth() > 1)
5330 UP.PartialThreshold *= 2;
5331
5332 // Disable partial & runtime unrolling on -Os.
5334
5335 // Scan the loop: don't unroll loops with calls as this could prevent
5336 // inlining. Don't unroll auto-vectorized loops either, though do allow
5337 // unrolling of the scalar remainder.
5338 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5340 for (auto *BB : L->getBlocks()) {
5341 for (auto &I : *BB) {
5342 // Both auto-vectorized loops and the scalar remainder have the
5343 // isvectorized attribute, so differentiate between them by the presence
5344 // of vector instructions.
5345 if (IsVectorized && I.getType()->isVectorTy())
5346 return;
5347 if (isa<CallBase>(I)) {
5350 if (!isLoweredToCall(F))
5351 continue;
5352 return;
5353 }
5354
5355 SmallVector<const Value *, 4> Operands(I.operand_values());
5356 Cost += getInstructionCost(&I, Operands,
5358 }
5359 }
5360
5361 // Apply subtarget-specific unrolling preferences.
5362 if (ST->isAppleMLike())
5363 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5364 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5367
5368 // If this is a small, multi-exit loop similar to something like std::find,
5369 // then unrolling typically achieves a performance improvement.
5370 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5371 UP.RuntimeUnrollMultiExit = true;
5372 UP.Runtime = true;
5373 // Limit unroll count.
5375 // Allow slightly more costly trip-count expansion to catch search loops
5376 // with pointer inductions.
5377 UP.SCEVExpansionBudget = 5;
5378 return;
5379 }
5380
5381 // Enable runtime unrolling for in-order models.
5382 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so
5383 // by checking for that case we can ensure that the default behaviour is
5384 // unchanged.
5385 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5386 !ST->getSchedModel().isOutOfOrder()) {
5387 UP.Runtime = true;
5388 UP.Partial = true;
5389 UP.UnrollRemainder = true;
5391
5392 UP.UnrollAndJam = true;
5394 }
5395
5396 // Forcing the unrolling of small loops can be very useful because of the
5397 // branch-taken cost of the backedge.
5399 UP.Force = true;
5400}
5401
5406
5408 Type *ExpectedType,
5409 bool CanCreate) const {
5410 switch (Inst->getIntrinsicID()) {
5411 default:
5412 return nullptr;
5413 case Intrinsic::aarch64_neon_st2:
5414 case Intrinsic::aarch64_neon_st3:
5415 case Intrinsic::aarch64_neon_st4: {
5416 // Create a struct type
5417 StructType *ST = dyn_cast<StructType>(ExpectedType);
5418 if (!CanCreate || !ST)
5419 return nullptr;
5420 unsigned NumElts = Inst->arg_size() - 1;
5421 if (ST->getNumElements() != NumElts)
5422 return nullptr;
5423 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5424 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5425 return nullptr;
5426 }
5427 Value *Res = PoisonValue::get(ExpectedType);
5428 IRBuilder<> Builder(Inst);
5429 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5430 Value *L = Inst->getArgOperand(i);
5431 Res = Builder.CreateInsertValue(Res, L, i);
5432 }
5433 return Res;
5434 }
5435 case Intrinsic::aarch64_neon_ld2:
5436 case Intrinsic::aarch64_neon_ld3:
5437 case Intrinsic::aarch64_neon_ld4:
5438 if (Inst->getType() == ExpectedType)
5439 return Inst;
5440 return nullptr;
5441 }
5442}
5443
5445 MemIntrinsicInfo &Info) const {
5446 switch (Inst->getIntrinsicID()) {
5447 default:
5448 break;
5449 case Intrinsic::aarch64_neon_ld2:
5450 case Intrinsic::aarch64_neon_ld3:
5451 case Intrinsic::aarch64_neon_ld4:
5452 Info.ReadMem = true;
5453 Info.WriteMem = false;
5454 Info.PtrVal = Inst->getArgOperand(0);
5455 break;
5456 case Intrinsic::aarch64_neon_st2:
5457 case Intrinsic::aarch64_neon_st3:
5458 case Intrinsic::aarch64_neon_st4:
5459 Info.ReadMem = false;
5460 Info.WriteMem = true;
5461 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5462 break;
5463 }
5464
5465 switch (Inst->getIntrinsicID()) {
5466 default:
5467 return false;
5468 case Intrinsic::aarch64_neon_ld2:
5469 case Intrinsic::aarch64_neon_st2:
5470 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5471 break;
5472 case Intrinsic::aarch64_neon_ld3:
5473 case Intrinsic::aarch64_neon_st3:
5474 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5475 break;
5476 case Intrinsic::aarch64_neon_ld4:
5477 case Intrinsic::aarch64_neon_st4:
5478 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5479 break;
5480 }
5481 return true;
5482}
5483
5484/// See if \p I should be considered for address type promotion. We check if \p
5486 /// I is a sext of the right type used in memory accesses. If it is used in a
5486/// "complex" getelementptr, we allow it to be promoted without finding other
5487/// sext instructions that sign extended the same initial value. A getelementptr
5488/// is considered as "complex" if it has more than 2 operands.
5490 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5491 bool Considerable = false;
5492 AllowPromotionWithoutCommonHeader = false;
5493 if (!isa<SExtInst>(&I))
5494 return false;
5495 Type *ConsideredSExtType =
5496 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5497 if (I.getType() != ConsideredSExtType)
5498 return false;
5499 // See if the sext is the one with the right type and used in at least one
5500 // GetElementPtrInst.
5501 for (const User *U : I.users()) {
5502 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5503 Considerable = true;
5504 // A getelementptr is considered as "complex" if it has more than 2
5505 // operands. We will promote a SExt used in such a complex GEP, as we
5506 // expect some computation to be merged if it is done on 64 bits.
5507 if (GEPInst->getNumOperands() > 2) {
5508 AllowPromotionWithoutCommonHeader = true;
5509 break;
5510 }
5511 }
5512 }
5513 return Considerable;
5514}
5515
5517 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5518 if (!VF.isScalable())
5519 return true;
5520
5521 Type *Ty = RdxDesc.getRecurrenceType();
5522 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5523 return false;
5524
5525 switch (RdxDesc.getRecurrenceKind()) {
5526 case RecurKind::Sub:
5528 case RecurKind::Add:
5529 case RecurKind::FAdd:
5530 case RecurKind::And:
5531 case RecurKind::Or:
5532 case RecurKind::Xor:
5533 case RecurKind::SMin:
5534 case RecurKind::SMax:
5535 case RecurKind::UMin:
5536 case RecurKind::UMax:
5537 case RecurKind::FMin:
5538 case RecurKind::FMax:
5539 case RecurKind::FMulAdd:
5540 case RecurKind::AnyOf:
5542 return true;
5543 default:
5544 return false;
5545 }
5546}
5547
5550 FastMathFlags FMF,
5552 // The code-generator is currently not able to handle scalable vectors
5553 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5554 // it. This change will be removed when code-generation for these types is
5555 // sufficiently reliable.
5556 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5557 if (VTy->getElementCount() == ElementCount::getScalable(1))
5559
5560 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5561
5562 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5563 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5564
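// If the type is split into several legal vectors (LT.first > 1), each
// additional part needs one extra pairwise min/max (costed as the intrinsic
// on the legal type) before the final horizontal reduction.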
5565 InstructionCost LegalizationCost = 0;
5566 if (LT.first > 1) {
5567 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5568 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5569 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5570 }
5571
5572 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5573}
5574
5576 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5577 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5578 InstructionCost LegalizationCost = 0;
5579 if (LT.first > 1) {
5580 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5581 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5582 LegalizationCost *= LT.first - 1;
5583 }
5584
5585 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5586 assert(ISD && "Invalid opcode");
5587 // Add the final reduction cost for the legal horizontal reduction
5588 switch (ISD) {
5589 case ISD::ADD:
5590 case ISD::AND:
5591 case ISD::OR:
5592 case ISD::XOR:
5593 case ISD::FADD:
5594 return LegalizationCost + 2;
5595 default:
5597 }
5598}
5599
5602 std::optional<FastMathFlags> FMF,
5604 // The code-generator is currently not able to handle scalable vectors
5605 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5606 // it. This change will be removed when code-generation for these types is
5607 // sufficiently reliable.
5608 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5609 if (VTy->getElementCount() == ElementCount::getScalable(1))
5611
5613 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5614 InstructionCost BaseCost =
5615 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5616 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5617 // end up vectorizing for more computationally intensive loops.
5618 return BaseCost + FixedVTy->getNumElements();
5619 }
5620
5621 if (Opcode != Instruction::FAdd)
5623
5624 auto *VTy = cast<ScalableVectorType>(ValTy);
5626 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5627 Cost *= getMaxNumElements(VTy->getElementCount());
5628 return Cost;
5629 }
5630
5631 if (isa<ScalableVectorType>(ValTy))
5632 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5633
5634 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5635 MVT MTy = LT.second;
5636 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5637 assert(ISD && "Invalid opcode");
5638
5639 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5640 // instructions as twice a normal vector add, plus 1 for each legalization
5641 // step (LT.first). This is the only arithmetic vector reduction operation for
5642 // which we have an instruction.
5643 // OR, XOR and AND costs should match the codegen from:
5644 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5645 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5646 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5647 static const CostTblEntry CostTblNoPairwise[]{
5648 {ISD::ADD, MVT::v8i8, 2},
5649 {ISD::ADD, MVT::v16i8, 2},
5650 {ISD::ADD, MVT::v4i16, 2},
5651 {ISD::ADD, MVT::v8i16, 2},
5652 {ISD::ADD, MVT::v2i32, 2},
5653 {ISD::ADD, MVT::v4i32, 2},
5654 {ISD::ADD, MVT::v2i64, 2},
5655 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5656 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5657 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5658 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5659 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5660 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5661 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5662 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5663 {ISD::XOR, MVT::v16i8, 7},
5664 {ISD::XOR, MVT::v4i16, 4},
5665 {ISD::XOR, MVT::v8i16, 6},
5666 {ISD::XOR, MVT::v2i32, 3},
5667 {ISD::XOR, MVT::v4i32, 5},
5668 {ISD::XOR, MVT::v2i64, 3},
5669 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5670 {ISD::AND, MVT::v16i8, 7},
5671 {ISD::AND, MVT::v4i16, 4},
5672 {ISD::AND, MVT::v8i16, 6},
5673 {ISD::AND, MVT::v2i32, 3},
5674 {ISD::AND, MVT::v4i32, 5},
5675 {ISD::AND, MVT::v2i64, 3},
5676 };
5677 switch (ISD) {
5678 default:
5679 break;
5680 case ISD::FADD:
5681 if (Type *EltTy = ValTy->getScalarType();
5682 // FIXME: For half types without fullfp16 support, this could extend and
5683 // use a fp32 faddp reduction but current codegen unrolls.
5684 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5685 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5686 const unsigned NElts = MTy.getVectorNumElements();
5687 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5688 isPowerOf2_32(NElts))
5689 // A reduction corresponding to a series of fadd instructions is lowered
5690 // to a series of faddp instructions. faddp has latency/throughput that
5691 // matches fadd instruction and hence, every faddp instruction can be
5692 // considered to have a relative cost = 1 with
5693 // CostKind = TCK_RecipThroughput.
5694 // An faddp will pairwise add vector elements, so the size of input
5695 // vector reduces by half every time, requiring
5696 // #(faddp instructions) = log2_32(NElts).
5697 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5698 }
5699 break;
5700 case ISD::ADD:
5701 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5702 return (LT.first - 1) + Entry->Cost;
5703 break;
5704 case ISD::XOR:
5705 case ISD::AND:
5706 case ISD::OR:
5707 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5708 if (!Entry)
5709 break;
5710 auto *ValVTy = cast<FixedVectorType>(ValTy);
5711 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5712 isPowerOf2_32(ValVTy->getNumElements())) {
5713 InstructionCost ExtraCost = 0;
5714 if (LT.first != 1) {
5715 // Type needs to be split, so there is an extra cost of LT.first - 1
5716 // arithmetic ops.
5717 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5718 MTy.getVectorNumElements());
5719 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5720 ExtraCost *= LT.first - 1;
5721 }
5722 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5723 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5724 return Cost + ExtraCost;
5725 }
5726 break;
5727 }
5728 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5729}
5730
5732 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5733 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5734 EVT VecVT = TLI->getValueType(DL, VecTy);
5735 EVT ResVT = TLI->getValueType(DL, ResTy);
5736
5737 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5738 VecVT.getSizeInBits() >= 64) {
5739 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5740
5741 // The legal cases are:
5742 // UADDLV 8/16/32->32
5743 // UADDLP 32->64
5744 unsigned RevVTSize = ResVT.getSizeInBits();
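// The widening reduction itself is costed as 2, plus 2 for each additional
// part after legalization.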
5745 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5746 RevVTSize <= 32) ||
5747 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5748 RevVTSize <= 32) ||
5749 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5750 RevVTSize <= 64))
5751 return (LT.first - 1) * 2 + 2;
5752 }
5753
5754 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5755 CostKind);
5756}
5757
5759AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5760 Type *ResTy, VectorType *VecTy,
5762 EVT VecVT = TLI->getValueType(DL, VecTy);
5763 EVT ResVT = TLI->getValueType(DL, ResTy);
5764
5765 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5766 RedOpcode == Instruction::Add) {
5767 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5768
5769 // The legal cases with dotprod are
5770 // UDOT 8->32
5771 // Which requires an additional uaddv to sum the i32 values.
5772 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5773 ResVT == MVT::i32)
5774 return LT.first + 2;
5775 }
5776
5777 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5778 CostKind);
5779}
5780
5784 static const CostTblEntry ShuffleTbl[] = {
5785 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5786 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5787 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5788 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5789 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5790 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5791 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5792 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5793 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5794 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5795 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5796 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5797 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5798 };
5799
5800 // The code-generator is currently not able to handle scalable vectors
5801 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5802 // it. This change will be removed when code-generation for these types is
5803 // sufficiently reliable.
5806
5807 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5808 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5809 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5810 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5811 : LT.second;
5812 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5813 InstructionCost LegalizationCost = 0;
5814 if (Index < 0) {
5815 LegalizationCost =
5816 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5818 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5820 }
5821
5822 // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
5823 // The cost is computed on the promoted type.
5824 if (LT.second.getScalarType() == MVT::i1) {
5825 LegalizationCost +=
5826 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5828 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5830 }
5831 const auto *Entry =
5832 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5833 assert(Entry && "Illegal Type for Splice");
5834 LegalizationCost += Entry->Cost;
5835 return LegalizationCost * LT.first;
5836}
5837
5839 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5841 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5842 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
5844
5846 return Invalid;
5847
5848 if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
5849 (!ST->isNeonAvailable() || !ST->hasDotProd()))
5850 return Invalid;
5851
5852 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
5853 Opcode != Instruction::FAdd) ||
5854 OpAExtend == TTI::PR_None)
5855 return Invalid;
5856
5857 // Floating-point partial reductions are invalid if `reassoc` and `contract`
5858 // are not allowed.
5859 if (AccumType->isFloatingPointTy()) {
5860 assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
5861 if (!FMF->allowReassoc() || !FMF->allowContract())
5862 return Invalid;
5863 } else {
5864 assert(!FMF &&
5865 "FastMathFlags only apply to floating-point partial reductions");
5866 }
5867
5868 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5869 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
5870 "Unexpected values for OpBExtend or InputTypeB");
5871
5872 // We only support multiply binary operations for now, and for muls we
5873 // require the types being extended to be the same.
5874 if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
5875 InputTypeA != InputTypeB))
5876 return Invalid;
5877
5878 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
5879 if (IsUSDot && !ST->hasMatMulInt8())
5880 return Invalid;
5881
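// For example, an i32 accumulator with i8 inputs gives Ratio = 4; the VF
// must exceed this ratio so that multiple input lanes feed each accumulator
// lane.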
5882 unsigned Ratio =
5883 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
5884 if (VF.getKnownMinValue() <= Ratio)
5885 return Invalid;
5886
5887 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
5888 VectorType *AccumVectorType =
5889 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
5890 // We don't yet support all kinds of legalization.
5891 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
5892 EVT::getEVT(AccumVectorType));
5893 switch (TC.first) {
5894 default:
5895 return Invalid;
5899 // The legalised type (e.g. after splitting) must be legal too.
5900 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
5902 return Invalid;
5903 break;
5904 }
5905
5906 std::pair<InstructionCost, MVT> AccumLT =
5907 getTypeLegalizationCost(AccumVectorType);
5908 std::pair<InstructionCost, MVT> InputLT =
5909 getTypeLegalizationCost(InputVectorType);
5910
5911 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
5912
5913 // Prefer using full types by costing half-full input types as more expensive.
5914 if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
5916 // FIXME: This can be removed after the cost of the extends are folded into
5917 // the dot-product expression in VPlan, after landing:
5918 // https://github.com/llvm/llvm-project/pull/147302
5919 Cost *= 2;
5920
5921 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
5922 // i16 -> i64 is natively supported for udot/sdot
5923 if (AccumLT.second.getScalarType() == MVT::i64 &&
5924 InputLT.second.getScalarType() == MVT::i16)
5925 return Cost;
5926 // i16 -> i32 is natively supported with SVE2p1
5927 if (AccumLT.second.getScalarType() == MVT::i32 &&
5928 InputLT.second.getScalarType() == MVT::i16 &&
5929 (ST->hasSVE2p1() || ST->hasSME2()))
5930 return Cost;
5931 // i8 -> i64 is supported with an extra level of extends
5932 if (AccumLT.second.getScalarType() == MVT::i64 &&
5933 InputLT.second.getScalarType() == MVT::i8)
5934 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
5935 // because it requires two extra extends on the inputs. But if we'd change
5936 // that now, a regular reduction would be cheaper because the costs of
5937 // the extends in the IR are still counted. This can be fixed
5938 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
5939 return Cost;
5940 }
5941
5942 // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
5943 if (ST->isSVEorStreamingSVEAvailable() ||
5944 (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
5945 ST->hasDotProd())) {
5946 if (AccumLT.second.getScalarType() == MVT::i32 &&
5947 InputLT.second.getScalarType() == MVT::i8)
5948 return Cost;
5949 }
5950
5951 // f16 -> f32 is natively supported for fdot
5952 if (Opcode == Instruction::FAdd && (ST->hasSME2() || ST->hasSVE2p1())) {
5953 if (AccumLT.second.getScalarType() == MVT::f32 &&
5954 InputLT.second.getScalarType() == MVT::f16 &&
5955 AccumLT.second.getVectorMinNumElements() == 4 &&
5956 InputLT.second.getVectorMinNumElements() == 8)
5957 return Cost;
5958 // Floating-point types aren't promoted, so expanding the partial reduction
5959 // is more expensive.
5960 return Cost + 20;
5961 }
5962
5963 // Add additional cost for the extends that would need to be inserted.
5964 return Cost + 2;
5965}
5966
5969 VectorType *SrcTy, ArrayRef<int> Mask,
5970 TTI::TargetCostKind CostKind, int Index,
5972 const Instruction *CxtI) const {
5973 assert((Mask.empty() || DstTy->isScalableTy() ||
5974 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5975 "Expected the Mask to match the return size if given");
5976 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5977 "Expected the same scalar types");
5978 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
5979
5980 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5981 // into smaller vectors and sum the cost of each shuffle.
5982 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
5983 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5984 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5985 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5986 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5987 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5988 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5989 // cost than just the load.
5990 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5993 return std::max<InstructionCost>(1, LT.first / 4);
5994
5995 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5996 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5997 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5998 // cost than just the store.
5999 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
6001 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
6003 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
6004 return LT.first;
6005
6006 unsigned TpNumElts = Mask.size();
6007 unsigned LTNumElts = LT.second.getVectorNumElements();
6008 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
6009 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
6010 LT.second.getVectorElementCount());
6012 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
6013 PreviousCosts;
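// For example, a <16 x i32> shuffle legalized to v4i32 is split into 4
// sub-shuffles of 4 lanes each, which are costed individually below.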
6014 for (unsigned N = 0; N < NumVecs; N++) {
6015 SmallVector<int> NMask;
6016 // Split the existing mask into chunks of size LTNumElts. Track the source
6017 // sub-vectors to ensure the result has at most 2 inputs.
6018 unsigned Source1 = -1U, Source2 = -1U;
6019 unsigned NumSources = 0;
6020 for (unsigned E = 0; E < LTNumElts; E++) {
6021 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
6023 if (MaskElt < 0) {
6025 continue;
6026 }
6027
6028 // Calculate which source from the input this comes from and whether it
6029 // is new to us.
6030 unsigned Source = MaskElt / LTNumElts;
6031 if (NumSources == 0) {
6032 Source1 = Source;
6033 NumSources = 1;
6034 } else if (NumSources == 1 && Source != Source1) {
6035 Source2 = Source;
6036 NumSources = 2;
6037 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6038 NumSources++;
6039 }
6040
6041 // Add to the new mask. For the NumSources>2 case these are not correct,
6042 // but are only used for the modular lane number.
6043 if (Source == Source1)
6044 NMask.push_back(MaskElt % LTNumElts);
6045 else if (Source == Source2)
6046 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
6047 else
6048 NMask.push_back(MaskElt % LTNumElts);
6049 }
6050 // Check if we have already generated this sub-shuffle, which means we
6051 // will have already generated the output. For example a <16 x i32> splat
6052 // will be the same sub-splat 4 times, which only needs to be generated
6053 // once and reused.
6054 auto Result =
6055 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6056 // Check if it was already in the map (already costed).
6057 if (!Result.second)
6058 continue;
6059 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6060 // getShuffleCost. Otherwise cost it pessimistically as the number of
6061 // element moves into a new vector.
6062 InstructionCost NCost =
6063 NumSources <= 2
6064 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6066 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6067 CxtI)
6068 : LTNumElts;
6069 Result.first->second = NCost;
6070 Cost += NCost;
6071 }
6072 return Cost;
6073 }
6074
6075 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6076 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6077 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6078 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6079 // This currently only handles low or high extracts to prevent SLP vectorizer
6080 // regressions.
6081 // Note that SVE's ext instruction is destructive, but it can be fused with
6082 // a movprfx to act like a constructive instruction.
6083 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6084 if (LT.second.getFixedSizeInBits() >= 128 &&
6085 cast<FixedVectorType>(SubTp)->getNumElements() ==
6086 LT.second.getVectorNumElements() / 2) {
6087 if (Index == 0)
6088 return 0;
6089 if (Index == (int)LT.second.getVectorNumElements() / 2)
6090 return 1;
6091 }
6093 }
6094 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6095 // the code to handle length-changing shuffles.
6096 if (Kind == TTI::SK_InsertSubvector) {
6097 LT = getTypeLegalizationCost(DstTy);
6098 SrcTy = DstTy;
6099 }
6100
6101 // Check for identity masks, which we can treat as free for both fixed and
6102 // scalable vector paths.
6103 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6104 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6105 all_of(enumerate(Mask), [](const auto &M) {
6106 return M.value() < 0 || M.value() == (int)M.index();
6107 }))
6108 return 0;
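// Illustrative example (not part of the upstream source): a mask such as
// <0, 1, poison, 3> on a legal <4 x i32> permute keeps every non-poison
// element in its original lane, so it is recognised as an identity shuffle
// and costed as free.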
6109
6110 // Segmented shuffle matching.
6111 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6112 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6113 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6114 AArch64::SVEBitsPerBlock)) {
6115
6116 FixedVectorType *VTy = cast<FixedVectorType>(SrcTy);
6117 unsigned Segments =
6118 VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
6119 unsigned SegmentElts = VTy->getNumElements() / Segments;
6120
6121 // dupq zd.t, zn.t[idx]
6122 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6123 ST->isSVEorStreamingSVEAvailable() &&
6124 isDUPQMask(Mask, Segments, SegmentElts))
6125 return LT.first;
6126
6127 // mov zd.q, vn
6128 if (ST->isSVEorStreamingSVEAvailable() &&
6129 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6130 return LT.first;
6131 }
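// Illustrative example (not part of the upstream source): treating an
// <8 x i32> source as two 128-bit segments of four elements, the mask
// <1, 1, 1, 1, 5, 5, 5, 5> broadcasts lane 1 within each segment and maps to a
// single dupq, while <0, 1, 2, 3, 0, 1, 2, 3> repeats the first segment and
// maps to the "mov zd.q, vn" form; both are costed as LT.first.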
6132
6133 // Check for broadcast loads, which are supported by the LD1R instruction.
6134 // In terms of code-size, the shuffle vector is free when a load + dup get
6135 // folded into a LD1R. That's what we check and return here. For performance
6136 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6137 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6138 // that we model the load + dup sequence slightly higher because LD1R is a
6139 // high latency instruction.
6140 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6141 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6142 if (IsLoad && LT.second.isVector() &&
6143 isLegalBroadcastLoad(SrcTy->getElementType(),
6144 LT.second.getVectorElementCount()))
6145 return 0;
6146 }
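// Illustrative IR (not part of the upstream source) for the code-size case
// above, where the splatted scalar is a load and the whole sequence folds into
// a single ld1r:
//   %s = load i32, ptr %p
//   %i = insertelement <4 x i32> poison, i32 %s, i64 0
//   %b = shufflevector <4 x i32> %i, <4 x i32> poison, <4 x i32> zeroinitializer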
6147
6148 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6149 // from the perfect shuffle tables.
6150 if (Mask.size() == 4 &&
6151 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6152 (SrcTy->getScalarSizeInBits() == 16 ||
6153 SrcTy->getScalarSizeInBits() == 32) &&
6154 all_of(Mask, [](int E) { return E < 8; }))
6155 return getPerfectShuffleCost(Mask);
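// Illustrative example (not part of the upstream source): the <4 x i32> mask
// <1, 0, 3, 2> swaps adjacent lanes and is looked up in the perfect shuffle
// table, which costs it as the single rev64-style instruction that implements
// it.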
6156
6157 // Check for other shuffles that are not SK_ kinds but we have native
6158 // instructions for, for example ZIP and UZP.
6159 unsigned Unused;
6160 if (LT.second.isFixedLengthVector() &&
6161 LT.second.getVectorNumElements() == Mask.size() &&
6162 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
6163 // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
6164 // mean that we can end up with shuffles that satisfy isTRNMask, but end
6165 // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
6166 Kind == TTI::SK_InsertSubvector) &&
6167 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6168 isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6169 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6170 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6171 LT.second.getVectorNumElements(), 16) ||
6172 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6173 LT.second.getVectorNumElements(), 32) ||
6174 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6175 LT.second.getVectorNumElements(), 64) ||
6176 // Check for non-zero lane splats
6177 all_of(drop_begin(Mask),
6178 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6179 return 1;
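// Illustrative <8 x i16> masks (not part of the upstream source) that are
// matched above and costed as a single instruction:
//   zip1:  <0, 8, 1, 9, 2, 10, 3, 11>
//   uzp1:  <0, 2, 4, 6, 8, 10, 12, 14>
//   rev64: <3, 2, 1, 0, 7, 6, 5, 4>
//   non-zero-lane splat: <5, 5, 5, 5, 5, 5, 5, 5>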
6180
6181 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6182 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6183 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6184 static const CostTblEntry ShuffleTbl[] = {
6185 // Broadcast shuffle kinds can be performed with 'dup'.
6186 {TTI::SK_Broadcast, MVT::v8i8, 1},
6187 {TTI::SK_Broadcast, MVT::v16i8, 1},
6188 {TTI::SK_Broadcast, MVT::v4i16, 1},
6189 {TTI::SK_Broadcast, MVT::v8i16, 1},
6190 {TTI::SK_Broadcast, MVT::v2i32, 1},
6191 {TTI::SK_Broadcast, MVT::v4i32, 1},
6192 {TTI::SK_Broadcast, MVT::v2i64, 1},
6193 {TTI::SK_Broadcast, MVT::v4f16, 1},
6194 {TTI::SK_Broadcast, MVT::v8f16, 1},
6195 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6196 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6197 {TTI::SK_Broadcast, MVT::v2f32, 1},
6198 {TTI::SK_Broadcast, MVT::v4f32, 1},
6199 {TTI::SK_Broadcast, MVT::v2f64, 1},
6200 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6201 // 'zip1/zip2' instructions.
6202 {TTI::SK_Transpose, MVT::v8i8, 1},
6203 {TTI::SK_Transpose, MVT::v16i8, 1},
6204 {TTI::SK_Transpose, MVT::v4i16, 1},
6205 {TTI::SK_Transpose, MVT::v8i16, 1},
6206 {TTI::SK_Transpose, MVT::v2i32, 1},
6207 {TTI::SK_Transpose, MVT::v4i32, 1},
6208 {TTI::SK_Transpose, MVT::v2i64, 1},
6209 {TTI::SK_Transpose, MVT::v4f16, 1},
6210 {TTI::SK_Transpose, MVT::v8f16, 1},
6211 {TTI::SK_Transpose, MVT::v4bf16, 1},
6212 {TTI::SK_Transpose, MVT::v8bf16, 1},
6213 {TTI::SK_Transpose, MVT::v2f32, 1},
6214 {TTI::SK_Transpose, MVT::v4f32, 1},
6215 {TTI::SK_Transpose, MVT::v2f64, 1},
6216 // Select shuffle kinds.
6217 // TODO: handle vXi8/vXi16.
6218 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6219 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6220 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6221 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6222 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6223 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6224 // PermuteSingleSrc shuffle kinds.
6225 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6226 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6227 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6228 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6229 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6230 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6231 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6232 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6233 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6234 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6235 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6236 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6237 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6238 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6239 // Reverse can be lowered with `rev`.
6240 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6241 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6242 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6243 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6244 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6245 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6246 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6247 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6248 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6249 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6250 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6251 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6252 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6253 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6254 // Splice can all be lowered as `ext`.
6255 {TTI::SK_Splice, MVT::v2i32, 1},
6256 {TTI::SK_Splice, MVT::v4i32, 1},
6257 {TTI::SK_Splice, MVT::v2i64, 1},
6258 {TTI::SK_Splice, MVT::v2f32, 1},
6259 {TTI::SK_Splice, MVT::v4f32, 1},
6260 {TTI::SK_Splice, MVT::v2f64, 1},
6261 {TTI::SK_Splice, MVT::v8f16, 1},
6262 {TTI::SK_Splice, MVT::v8bf16, 1},
6263 {TTI::SK_Splice, MVT::v8i16, 1},
6264 {TTI::SK_Splice, MVT::v16i8, 1},
6265 {TTI::SK_Splice, MVT::v4f16, 1},
6266 {TTI::SK_Splice, MVT::v4bf16, 1},
6267 {TTI::SK_Splice, MVT::v4i16, 1},
6268 {TTI::SK_Splice, MVT::v8i8, 1},
6269 // Broadcast shuffle kinds for scalable vectors
6270 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6271 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6272 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6273 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6274 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6275 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6276 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6277 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6278 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6279 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6280 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6281 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6282 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6283 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6284 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6285 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6286 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6287 // Handle the cases for vector.reverse with scalable vectors
6288 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6289 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6290 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6291 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6292 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6293 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6294 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6295 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6296 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6297 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6298 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6299 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6300 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6301 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6302 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6303 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6304 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6305 };
6306 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6307 return LT.first * Entry->Cost;
6308 }
6309
6310 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6311 return getSpliceCost(SrcTy, Index, CostKind);
6312
6313 // Inserting a subvector can often be done with either a D, S or H register
6314 // move, so long as the inserted vector is "aligned".
6315 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6316 LT.second.getSizeInBits() <= 128 && SubTp) {
6317 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6318 if (SubLT.second.isVector()) {
6319 int NumElts = LT.second.getVectorNumElements();
6320 int NumSubElts = SubLT.second.getVectorNumElements();
6321 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6322 return SubLT.first;
6323 }
6324 }
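// Illustrative example (not part of the upstream source): inserting a legal
// <2 x i32> subvector into a <4 x i32> vector at index 0 or 2 satisfies the
// alignment check above and is costed as SubLT.first, i.e. a single D-register
// move.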
6325
6326 // Restore optimal kind.
6327 if (IsExtractSubvector)
6328 Kind = TTI::SK_ExtractSubvector;
6329 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6330 Args, CxtI);
6331}
6332
6333 static bool containsDecreasingPointers(Loop *TheLoop,
6334 PredicatedScalarEvolution *PSE,
6335 const DominatorTree &DT) {
6336 const auto &Strides = DenseMap<Value *, const SCEV *>();
6337 for (BasicBlock *BB : TheLoop->blocks()) {
6338 // Scan the instructions in the block and look for addresses that are
6339 // consecutive and decreasing.
6340 for (Instruction &I : *BB) {
6341 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6342 Value *Ptr = getLoadStorePointerOperand(&I);
6343 Type *AccessTy = getLoadStoreType(&I);
6344 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6345 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6346 .value_or(0) < 0)
6347 return true;
6348 }
6349 }
6350 }
6351 return false;
6352}
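// Illustrative example (not part of the upstream source): a loop such as
//   for (int i = n - 1; i >= 0; --i)
//     sum += a[i];
// reads a[i] with a stride of -1, so getPtrStride reports a negative stride
// and containsDecreasingPointers returns true; the caller below then requires
// the "reverse" tail-folding option.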
6353
6354 bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6355 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6356 return SVEPreferFixedOverScalableIfEqualCost;
6357 // For cases like post-LTO vectorization, when we eventually know the trip
6358 // count, epilogue with fixed-width vectorization can be deleted if the trip
6359 // count is less than the epilogue iterations. That's why we prefer
6360 // fixed-width vectorization in epilogue in case of equal costs.
6361 if (IsEpilogue)
6362 return true;
6363 return ST->useFixedOverScalableIfEqualCost();
6364}
6365
6366 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6367 return ST->getEpilogueVectorizationMinVF();
6368}
6369
6370 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
6371 if (!ST->hasSVE())
6372 return false;
6373
6374 // We don't currently support vectorisation with interleaving for SVE - with
6375 // such loops we're better off not using tail-folding. This gives us a chance
6376 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6377 if (TFI->IAI->hasGroups())
6378 return false;
6379
6380 TailFoldingOpts Required = TailFoldingOpts::Disabled;
6381 if (TFI->LVL->getReductionVars().size())
6382 Required |= TailFoldingOpts::Reductions;
6383 if (TFI->LVL->getFixedOrderRecurrences().size())
6384 Required |= TailFoldingOpts::Recurrences;
6385
6386 // We call this to discover whether any load/store pointers in the loop have
6387 // negative strides. This will require extra work to reverse the loop
6388 // predicate, which may be expensive.
6389 if (containsDecreasingPointers(TFI->LVL->getLoop(),
6390 TFI->LVL->getPredicatedScalarEvolution(),
6391 *TFI->LVL->getDominatorTree()))
6392 Required |= TailFoldingOpts::Reverse;
6393 if (Required == TailFoldingOpts::Disabled)
6394 Required |= TailFoldingOpts::Simple;
6395
6396 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6397 Required))
6398 return false;
6399
6400 // Don't tail-fold for tight loops where we would be better off interleaving
6401 // with an unpredicated loop.
6402 unsigned NumInsns = 0;
6403 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6404 NumInsns += BB->sizeWithoutDebug();
6405 }
6406
6407 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6408 return NumInsns >= SVETailFoldInsnThreshold;
6409}
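// Illustrative example (not part of the upstream source): with the default
// -sve-tail-folding-insn-threshold of 15, a loop whose blocks contain only 10
// instructions (roughly 4 of which are the IV PHI, IV add, compare and branch)
// returns false above, i.e. it is left to an unpredicated (and possibly
// interleaved) vector loop instead of being tail-folded.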
6410
6411 InstructionCost
6412 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6413 StackOffset BaseOffset, bool HasBaseReg,
6414 int64_t Scale, unsigned AddrSpace) const {
6415 // Scaling factors are not free at all.
6416 // Operands | Rt Latency
6417 // -------------------------------------------
6418 // Rt, [Xn, Xm] | 4
6419 // -------------------------------------------
6420 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6421 // Rt, [Xn, Wm, <extend> #imm] |
6422 TargetLoweringBase::AddrMode AM;
6423 AM.BaseGV = BaseGV;
6424 AM.BaseOffs = BaseOffset.getFixed();
6425 AM.HasBaseReg = HasBaseReg;
6426 AM.Scale = Scale;
6427 AM.ScalableOffset = BaseOffset.getScalable();
6428 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6429 // Scale represents reg2 * scale, thus account for 1 if
6430 // it is not equal to 0 or 1.
6431 return AM.Scale != 0 && AM.Scale != 1;
6432 return InstructionCost::getInvalid();
6433}
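// Illustrative example (not part of the upstream source): an access like
// "ldr x0, [x1, x2]" uses Scale == 1 and is reported as cost 0, while
// "ldr x0, [x1, x2, lsl #3]" uses Scale == 8 and is reported as cost 1,
// reflecting the extra latency of the scaled-register form noted in the
// table above.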
6434
6435 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6436 const Instruction *I) const {
6437 if (EnableOrLikeSelectOpt) {
6438 // For the binary operators (e.g. or) we need to be more careful than
6439 // selects, here we only transform them if they are already at a natural
6440 // break point in the code - the end of a block with an unconditional
6441 // terminator.
6442 if (I->getOpcode() == Instruction::Or &&
6443 isa<BranchInst>(I->getNextNode()) &&
6444 cast<BranchInst>(I->getNextNode())->isUnconditional())
6445 return true;
6446
6447 if (I->getOpcode() == Instruction::Add ||
6448 I->getOpcode() == Instruction::Sub)
6449 return true;
6450 }
6451 return BaseT::shouldTreatInstructionLikeSelect(I);
6452}
6453
6454 bool AArch64TTIImpl::isLSRCostLess(
6455 const TargetTransformInfo::LSRCost &C1,
6456 const TargetTransformInfo::LSRCost &C2) const {
6457 // AArch64 specific here is adding the number of instructions to the
6458 // comparison (though not as the first consideration, as some targets do)
6459 // along with changing the priority of the base additions.
6460 // TODO: Maybe a more nuanced tradeoff between instruction count
6461 // and number of registers? To be investigated at a later date.
6462 if (EnableLSRCostOpt)
6463 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6464 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6465 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6466 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6467
6468 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6469}
6470
6471static bool isSplatShuffle(Value *V) {
6472 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6473 return all_equal(Shuf->getShuffleMask());
6474 return false;
6475}
6476
6477/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6478/// or upper half of the vector elements.
6479static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6480 bool AllowSplat = false) {
6481 // Scalable types can't be extract shuffle vectors.
6482 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6483 return false;
6484
6485 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6486 auto *FullTy = FullV->getType();
6487 auto *HalfTy = HalfV->getType();
6488 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6489 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6490 };
6491
6492 auto extractHalf = [](Value *FullV, Value *HalfV) {
6493 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6494 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6495 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6496 };
6497
6498 ArrayRef<int> M1, M2;
6499 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6500 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6501 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6502 return false;
6503
6504 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6505 // it is not checked as an extract below.
6506 if (AllowSplat && isSplatShuffle(Op1))
6507 S1Op1 = nullptr;
6508 if (AllowSplat && isSplatShuffle(Op2))
6509 S2Op1 = nullptr;
6510
6511 // Check that the operands are half as wide as the result and we extract
6512 // half of the elements of the input vectors.
6513 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6514 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6515 return false;
6516
6517 // Check the mask extracts either the lower or upper half of vector
6518 // elements.
6519 int M1Start = 0;
6520 int M2Start = 0;
6521 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6522 if ((S1Op1 &&
6523 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6524 (S2Op1 &&
6525 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6526 return false;
6527
6528 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6529 (M2Start != 0 && M2Start != (NumElements / 2)))
6530 return false;
6531 if (S1Op1 && S2Op1 && M1Start != M2Start)
6532 return false;
6533
6534 return true;
6535}
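// Illustrative IR (not part of the upstream source): both operands below are
// low-half extracts of <8 x i16> inputs, so areExtractShuffleVectors returns
// true and the shuffles can be sunk next to the widening instruction that
// consumes them:
//   %lo0 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
//   %lo1 = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>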
6536
6537/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6538/// of the vector elements.
6539static bool areExtractExts(Value *Ext1, Value *Ext2) {
6540 auto areExtDoubled = [](Instruction *Ext) {
6541 return Ext->getType()->getScalarSizeInBits() ==
6542 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6543 };
6544
6545 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6546 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6547 !areExtDoubled(cast<Instruction>(Ext1)) ||
6548 !areExtDoubled(cast<Instruction>(Ext2)))
6549 return false;
6550
6551 return true;
6552}
6553
6554/// Check if Op could be used with vmull_high_p64 intrinsic.
6555 static bool isOperandOfVmullHighP64(Value *Op) {
6556 Value *VectorOperand = nullptr;
6557 ConstantInt *ElementIndex = nullptr;
6558 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6559 m_ConstantInt(ElementIndex))) &&
6560 ElementIndex->getValue() == 1 &&
6561 isa<FixedVectorType>(VectorOperand->getType()) &&
6562 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6563}
6564
6565/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6566static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6567 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
6568}
6569
6570 static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6571 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6572 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6573 if (!GEP || GEP->getNumOperands() != 2)
6574 return false;
6575
6576 Value *Base = GEP->getOperand(0);
6577 Value *Offsets = GEP->getOperand(1);
6578
6579 // We only care about scalar_base+vector_offsets.
6580 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6581 return false;
6582
6583 // Sink extends that would allow us to use 32-bit offset vectors.
6584 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6585 auto *OffsetsInst = cast<Instruction>(Offsets);
6586 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6587 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6588 Ops.push_back(&GEP->getOperandUse(1));
6589 }
6590
6591 // Sink the GEP.
6592 return true;
6593}
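// Illustrative IR (not part of the upstream source): for a gather address
//   %wide = zext <vscale x 4 x i32> %idx to <vscale x 4 x i64>
//   %ptrs = getelementptr float, ptr %base, <vscale x 4 x i64> %wide
// the GEP has a scalar base and a vector offset, so it is sunk, and the zext
// is queued as well so the gather can use 32-bit offsets.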
6594
6595/// We want to sink following cases:
6596/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6597/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6598 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6599 if (match(Op, m_VScale()))
6600 return true;
6601 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6602 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
6603 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6604 return true;
6605 }
6606 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6607 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
6608 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6609 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6610 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6611 return true;
6612 }
6613 return false;
6614}
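// Illustrative IR (not part of the upstream source):
//   %vs  = call i64 @llvm.vscale.i64()
//   %sz  = shl i64 %vs, 4
//   %end = add i64 %base, %sz
// shouldSinkVScale matches the shl(vscale, imm) operand of the add, so both
// the vscale call and the shl are moved next to the add, letting instruction
// selection fold the scalable quantity into the address computation.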
6615
6616static bool isFNeg(Value *Op) { return match(Op, m_FNeg(m_Value())); }
6617
6618/// Check if sinking \p I's operands to I's basic block is profitable, because
6619/// the operands can be folded into a target instruction, e.g.
6620/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6621 bool AArch64TTIImpl::isProfitableToSinkOperands(
6622 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6623 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
6624 switch (II->getIntrinsicID()) {
6625 case Intrinsic::aarch64_neon_smull:
6626 case Intrinsic::aarch64_neon_umull:
6627 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6628 /*AllowSplat=*/true)) {
6629 Ops.push_back(&II->getOperandUse(0));
6630 Ops.push_back(&II->getOperandUse(1));
6631 return true;
6632 }
6633 [[fallthrough]];
6634
6635 case Intrinsic::fma:
6636 case Intrinsic::fmuladd:
6637 if (isa<VectorType>(I->getType()) &&
6638 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6639 !ST->hasFullFP16())
6640 return false;
6641
6642 if (isFNeg(II->getOperand(0)))
6643 Ops.push_back(&II->getOperandUse(0));
6644 if (isFNeg(II->getOperand(1)))
6645 Ops.push_back(&II->getOperandUse(1));
6646
6647 [[fallthrough]];
6648 case Intrinsic::aarch64_neon_sqdmull:
6649 case Intrinsic::aarch64_neon_sqdmulh:
6650 case Intrinsic::aarch64_neon_sqrdmulh:
6651 // Sink splats for index lane variants
6652 if (isSplatShuffle(II->getOperand(0)))
6653 Ops.push_back(&II->getOperandUse(0));
6654 if (isSplatShuffle(II->getOperand(1)))
6655 Ops.push_back(&II->getOperandUse(1));
6656 return !Ops.empty();
6657 case Intrinsic::aarch64_neon_fmlal:
6658 case Intrinsic::aarch64_neon_fmlal2:
6659 case Intrinsic::aarch64_neon_fmlsl:
6660 case Intrinsic::aarch64_neon_fmlsl2:
6661 // Sink splats for index lane variants
6662 if (isSplatShuffle(II->getOperand(1)))
6663 Ops.push_back(&II->getOperandUse(1));
6664 if (isSplatShuffle(II->getOperand(2)))
6665 Ops.push_back(&II->getOperandUse(2));
6666 return !Ops.empty();
6667 case Intrinsic::aarch64_sve_ptest_first:
6668 case Intrinsic::aarch64_sve_ptest_last:
6669 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6670 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6671 Ops.push_back(&II->getOperandUse(0));
6672 return !Ops.empty();
6673 case Intrinsic::aarch64_sme_write_horiz:
6674 case Intrinsic::aarch64_sme_write_vert:
6675 case Intrinsic::aarch64_sme_writeq_horiz:
6676 case Intrinsic::aarch64_sme_writeq_vert: {
6677 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6678 if (!Idx || Idx->getOpcode() != Instruction::Add)
6679 return false;
6680 Ops.push_back(&II->getOperandUse(1));
6681 return true;
6682 }
6683 case Intrinsic::aarch64_sme_read_horiz:
6684 case Intrinsic::aarch64_sme_read_vert:
6685 case Intrinsic::aarch64_sme_readq_horiz:
6686 case Intrinsic::aarch64_sme_readq_vert:
6687 case Intrinsic::aarch64_sme_ld1b_vert:
6688 case Intrinsic::aarch64_sme_ld1h_vert:
6689 case Intrinsic::aarch64_sme_ld1w_vert:
6690 case Intrinsic::aarch64_sme_ld1d_vert:
6691 case Intrinsic::aarch64_sme_ld1q_vert:
6692 case Intrinsic::aarch64_sme_st1b_vert:
6693 case Intrinsic::aarch64_sme_st1h_vert:
6694 case Intrinsic::aarch64_sme_st1w_vert:
6695 case Intrinsic::aarch64_sme_st1d_vert:
6696 case Intrinsic::aarch64_sme_st1q_vert:
6697 case Intrinsic::aarch64_sme_ld1b_horiz:
6698 case Intrinsic::aarch64_sme_ld1h_horiz:
6699 case Intrinsic::aarch64_sme_ld1w_horiz:
6700 case Intrinsic::aarch64_sme_ld1d_horiz:
6701 case Intrinsic::aarch64_sme_ld1q_horiz:
6702 case Intrinsic::aarch64_sme_st1b_horiz:
6703 case Intrinsic::aarch64_sme_st1h_horiz:
6704 case Intrinsic::aarch64_sme_st1w_horiz:
6705 case Intrinsic::aarch64_sme_st1d_horiz:
6706 case Intrinsic::aarch64_sme_st1q_horiz: {
6707 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6708 if (!Idx || Idx->getOpcode() != Instruction::Add)
6709 return false;
6710 Ops.push_back(&II->getOperandUse(3));
6711 return true;
6712 }
6713 case Intrinsic::aarch64_neon_pmull:
6714 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6715 return false;
6716 Ops.push_back(&II->getOperandUse(0));
6717 Ops.push_back(&II->getOperandUse(1));
6718 return true;
6719 case Intrinsic::aarch64_neon_pmull64:
6720 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6721 II->getArgOperand(1)))
6722 return false;
6723 Ops.push_back(&II->getArgOperandUse(0));
6724 Ops.push_back(&II->getArgOperandUse(1));
6725 return true;
6726 case Intrinsic::masked_gather:
6727 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6728 return false;
6729 Ops.push_back(&II->getArgOperandUse(0));
6730 return true;
6731 case Intrinsic::masked_scatter:
6732 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6733 return false;
6734 Ops.push_back(&II->getArgOperandUse(1));
6735 return true;
6736 default:
6737 return false;
6738 }
6739 }
6740
6741 auto ShouldSinkCondition = [](Value *Cond,
6742 SmallVectorImpl<Use *> &Ops) -> bool {
6743 if (!isa<IntrinsicInst>(Cond))
6744 return false;
6745 auto *II = cast<IntrinsicInst>(Cond);
6746 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6747 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6748 return false;
6749 if (isa<CmpInst>(II->getOperand(0)))
6750 Ops.push_back(&II->getOperandUse(0));
6751 return true;
6752 };
6753
6754 switch (I->getOpcode()) {
6755 case Instruction::GetElementPtr:
6756 case Instruction::Add:
6757 case Instruction::Sub:
6758 // Sink vscales closer to uses for better isel
6759 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6760 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6761 Ops.push_back(&I->getOperandUse(Op));
6762 return true;
6763 }
6764 }
6765 break;
6766 case Instruction::Select: {
6767 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6768 return false;
6769
6770 Ops.push_back(&I->getOperandUse(0));
6771 return true;
6772 }
6773 case Instruction::Br: {
6774 if (cast<BranchInst>(I)->isUnconditional())
6775 return false;
6776
6777 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6778 return false;
6779
6780 Ops.push_back(&I->getOperandUse(0));
6781 return true;
6782 }
6783 case Instruction::FMul:
6784 // fmul with contract flag can be combined with fadd into fma.
6785 // Sinking fneg into this block enables fmls pattern.
6786 if (cast<FPMathOperator>(I)->hasAllowContract()) {
6787 if (isFNeg(I->getOperand(0)))
6788 Ops.push_back(&I->getOperandUse(0));
6789 if (isFNeg(I->getOperand(1)))
6790 Ops.push_back(&I->getOperandUse(1));
6791 }
6792 break;
6793
6794 default:
6795 break;
6796 }
6797
6798 if (!I->getType()->isVectorTy())
6799 return !Ops.empty();
6800
6801 switch (I->getOpcode()) {
6802 case Instruction::Sub:
6803 case Instruction::Add: {
6804 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6805 return false;
6806
6807 // If the exts' operands extract either the lower or upper elements, we
6808 // can sink them too.
6809 auto Ext1 = cast<Instruction>(I->getOperand(0));
6810 auto Ext2 = cast<Instruction>(I->getOperand(1));
6811 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6812 Ops.push_back(&Ext1->getOperandUse(0));
6813 Ops.push_back(&Ext2->getOperandUse(0));
6814 }
6815
6816 Ops.push_back(&I->getOperandUse(0));
6817 Ops.push_back(&I->getOperandUse(1));
6818
6819 return true;
6820 }
6821 case Instruction::Or: {
6822 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6823 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6824 if (ST->hasNEON()) {
6825 Instruction *OtherAnd, *IA, *IB;
6826 Value *MaskValue;
6827 // MainAnd refers to And instruction that has 'Not' as one of its operands
6828 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6829 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6830 m_Instruction(IA)))))) {
6831 if (match(OtherAnd,
6832 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6833 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6834 ? cast<Instruction>(I->getOperand(1))
6835 : cast<Instruction>(I->getOperand(0));
6836
6837 // Both Ands should be in same basic block as Or
6838 if (I->getParent() != MainAnd->getParent() ||
6839 I->getParent() != OtherAnd->getParent())
6840 return false;
6841
6842 // Non-mask operands of both Ands should also be in same basic block
6843 if (I->getParent() != IA->getParent() ||
6844 I->getParent() != IB->getParent())
6845 return false;
6846
6847 Ops.push_back(
6848 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6849 Ops.push_back(&I->getOperandUse(0));
6850 Ops.push_back(&I->getOperandUse(1));
6851
6852 return true;
6853 }
6854 }
6855 }
6856
6857 return false;
6858 }
6859 case Instruction::Mul: {
6860 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6861 auto *Ty = cast<VectorType>(V->getType());
6862 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6863 if (Ty->isScalableTy())
6864 return false;
6865
6866 // Indexed variants of Mul exist for i16 and i32 element types only.
6867 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6868 };
6869
6870 int NumZExts = 0, NumSExts = 0;
6871 for (auto &Op : I->operands()) {
6872 // Make sure we are not already sinking this operand
6873 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6874 continue;
6875
6876 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6877 auto *Ext = cast<Instruction>(Op);
6878 auto *ExtOp = Ext->getOperand(0);
6879 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6880 Ops.push_back(&Ext->getOperandUse(0));
6881 Ops.push_back(&Op);
6882
6883 if (isa<SExtInst>(Ext)) {
6884 NumSExts++;
6885 } else {
6886 NumZExts++;
6887 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
6888 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
6889 I->getType()->getScalarSizeInBits())
6890 NumSExts++;
6891 }
6892
6893 continue;
6894 }
6895
6896 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
6897 if (!Shuffle)
6898 continue;
6899
6900 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6901 // operand and the s/zext can help create indexed s/umull. This is
6902 // especially useful to prevent i64 mul being scalarized.
6903 if (isSplatShuffle(Shuffle) &&
6904 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6905 Ops.push_back(&Shuffle->getOperandUse(0));
6906 Ops.push_back(&Op);
6907 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6908 NumSExts++;
6909 else
6910 NumZExts++;
6911 continue;
6912 }
6913
6914 Value *ShuffleOperand = Shuffle->getOperand(0);
6915 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6916 if (!Insert)
6917 continue;
6918
6919 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6920 if (!OperandInstr)
6921 continue;
6922
6923 ConstantInt *ElementConstant =
6924 dyn_cast<ConstantInt>(Insert->getOperand(2));
6925 // Check that the insertelement is inserting into element 0
6926 if (!ElementConstant || !ElementConstant->isZero())
6927 continue;
6928
6929 unsigned Opcode = OperandInstr->getOpcode();
6930 if (Opcode == Instruction::SExt)
6931 NumSExts++;
6932 else if (Opcode == Instruction::ZExt)
6933 NumZExts++;
6934 else {
6935 // If we find that the top bits are known 0, then we can sink and allow
6936 // the backend to generate a umull.
6937 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6938 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6939 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6940 continue;
6941 NumZExts++;
6942 }
6943
6944 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
6945 // the And, just to hoist it again back to the load.
6946 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6947 Ops.push_back(&Insert->getOperandUse(1));
6948 Ops.push_back(&Shuffle->getOperandUse(0));
6949 Ops.push_back(&Op);
6950 }
6951
6952 // It is profitable to sink if we found two of the same type of extends.
6953 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6954 return true;
6955
6956 // Otherwise, see if we should sink splats for indexed variants.
6957 if (!ShouldSinkSplatForIndexedVariant(I))
6958 return false;
6959
6960 Ops.clear();
6961 if (isSplatShuffle(I->getOperand(0)))
6962 Ops.push_back(&I->getOperandUse(0));
6963 if (isSplatShuffle(I->getOperand(1)))
6964 Ops.push_back(&I->getOperandUse(1));
6965
6966 return !Ops.empty();
6967 }
6968 case Instruction::FMul: {
6969 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6970 if (I->getType()->isScalableTy())
6971 return !Ops.empty();
6972
6973 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6974 !ST->hasFullFP16())
6975 return !Ops.empty();
6976
6977 // Sink splats for index lane variants
6978 if (isSplatShuffle(I->getOperand(0)))
6979 Ops.push_back(&I->getOperandUse(0));
6980 if (isSplatShuffle(I->getOperand(1)))
6981 Ops.push_back(&I->getOperandUse(1));
6982 return !Ops.empty();
6983 }
6984 default:
6985 return false;
6986 }
6987 return false;
6988}
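// Illustrative IR (not part of the upstream source) for the Mul case above:
//   %ea = sext <8 x i16> %a to <8 x i32>
//   %eb = sext <8 x i16> %b to <8 x i32>
//   %m  = mul <8 x i32> %ea, %eb
// Both operands are sign extends (NumSExts == 2), so the extends are queued
// for sinking into the block of the multiply, allowing the backend to select
// smull/smull2 instead of widening the operands separately.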
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI, SmallVectorImpl< StringRef > &Features)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool isFNeg(Value *Op)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
APInt getPriorityMask(const Function &F) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1679
unsigned countLeadingOnes() const
Definition APInt.h:1633
void negate()
Negate this APInt in place.
Definition APInt.h:1477
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1052
unsigned logBase2() const
Definition APInt.h:1770
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:828
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1571
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst); holds everything related to calling a function.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
bool isUnsigned() const
Definition InstrTypes.h:936
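A short sketch of how the predicate queries above are typically used; the helper is illustrative only.

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Integer (not FP) compare with a signed predicate.
static bool isSignedIntegerCompare(const CmpInst *Cmp) {
  CmpInst::Predicate P = Cmp->getPredicate();
  return CmpInst::isIntPredicate(P) && Cmp->isSigned();
}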
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign information.
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:219
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:771
bool empty() const
Definition DenseMap.h:109
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
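A minimal sketch of the fixed versus scalable element counts these factories produce.

#include "llvm/Support/TypeSize.h"
using namespace llvm;

// A plain 4-element count (NEON-style) versus a "vscale x 4" count (SVE-style).
static ElementCount fixedFour() { return ElementCount::getFixed(4); }
static ElementCount scalableFour() { return ElementCount::getScalable(4); }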
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowContract() const
Definition FMF.h:69
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2555
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1111
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2543
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:595
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:580
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1945
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2259
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2467
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1718
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2177
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool' for the isVolatile parameter.
Definition IRBuilder.h:1855
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2577
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1868
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:590
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2250
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition IRBuilder.h:2776
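A minimal sketch tying several of the IRBuilder calls above together; the helper and its arguments are assumed for illustration and are not part of this file.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Splat a scalar, masked-load a 4-element vector with the splat as
// pass-through, then blend the two with a select. The caller is assumed to
// supply a builder, pointer, mask and scalar of matching types.
static Value *buildMaskedLoadWithFallback(IRBuilder<> &B, Type *EltTy,
                                          Value *Ptr, Value *Mask,
                                          Value *Scalar) {
  auto *VecTy = FixedVectorType::get(EltTy, 4);
  Value *Splat = B.CreateVectorSplat(4, Scalar, "splat");
  Value *Ld = B.CreateMaskedLoad(VecTy, Ptr, Align(16), Mask, Splat, "ld");
  return B.CreateSelect(Mask, Ld, Splat, "sel");
}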
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
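A sketch of the usual guard around getValue(): a cost may be invalid (for example for an illegal scalable type), so check validity before reading the number; this assumes the isValid() accessor on InstructionCost.

#include "llvm/Support/InstructionCost.h"
using namespace llvm;

// Read the numeric cost only when it is valid; otherwise use a fallback.
static InstructionCost::CostType costOr(InstructionCost Cost,
                                        InstructionCost::CostType Fallback) {
  return Cost.isValid() ? Cost.getValue() : Fallback;
}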
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of existing predicates.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value, if possible.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
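A sketch of how the trip-count queries above are commonly combined by unrolling heuristics; the limit and helper name are illustrative.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

// A loop qualifies when SCEV can bound its trip count and the bound is
// small; getSmallConstantMaxTripCount returns 0 when no bound is known.
static bool hasSmallKnownTripCount(ScalarEvolution &SE, const Loop *L,
                                   unsigned Limit) {
  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
  return MaxTC != 0 && MaxTC <= Limit;
}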
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>.
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
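A sketch of the de-interleave helper above for factor 2, i.e. masks such as <0, 2, 4, 6> (Index 0) or <1, 3, 5, 7> (Index 1).

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Returns true and sets Index when Mask extracts every second lane.
static bool isEvenOddExtractMask(ArrayRef<int> Mask, unsigned &Index) {
  return ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, /*Factor=*/2,
                                                       Index);
}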
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
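A minimal sketch of the fixed/scalable split a StackOffset carries; the byte values are illustrative only.

#include "llvm/Support/TypeSize.h"
using namespace llvm;

// 16 fixed bytes plus 32 bytes scaled by vscale, the shape SVE stack
// offsets take.
static StackOffset exampleSVEOffset() {
  return StackOffset::getFixed(16) + StackOffset::getScalable(32);
}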
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:712
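A sketch of split(): it divides a string at the first separator, returning the whole string and an empty remainder when the separator is absent.

#include "llvm/ADT/StringRef.h"
using namespace llvm;

// "all+nosimple" -> ("all", "nosimple"); "all" -> ("all", "").
static std::pair<StringRef, StringRef> splitAtPlus(StringRef S) {
  return S.split('+');
}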
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInfo.
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
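A sketch of asking the same cost question under two different cost kinds; the helper is illustrative and the opcode/type are assumed to be supplied by the caller.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Throughput is what the vectorizers optimize for; code size is what
// size-based budgets (e.g. unrolling) care about.
static void queryAddCosts(const TargetTransformInfo &TheTTI, Type *VecTy,
                          InstructionCost &Throughput,
                          InstructionCost &CodeSize) {
  Throughput = TheTTI.getArithmeticInstrCost(
      Instruction::Add, VecTy, TargetTransformInfo::TCK_RecipThroughput);
  CodeSize = TheTTI.getArithmeticInstrCost(
      Instruction::Add, VecTy, TargetTransformInfo::TCK_CodeSize);
}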
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags FMF.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
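A sketch of how a shuffle kind feeds a cost query, mirroring the getShuffleCost parameter order listed earlier; SK_Reverse needs no mask, index or subvector type, so those arguments use neutral values. This is an assumed usage, not code from this file.

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Cost of reversing the lanes of VTy in-register.
static InstructionCost reverseShuffleCost(const TargetTransformInfo &TheTTI,
                                          VectorType *VTy) {
  return TheTTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VTy, VTy,
                               /*Mask=*/{},
                               TargetTransformInfo::TCK_RecipThroughput,
                               /*Index=*/0, /*SubTp=*/nullptr);
}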
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:962
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type, whose element type is an integer type of the same width as the input element type.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the same way as for normal integer types.
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the given register size.
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to synthesize the immediate.
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types and value types.
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to memory with one type and loaded from the same address with the other type.
Definition ISDOpcodes.h:993
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
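A sketch of the matcher-combinator style the entries above support: recognise mul(zext(a), zext(b)), the shape that maps to a widening multiply. The helper is illustrative.

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// On success, A and B are bound to the narrow multiplicands.
static bool matchWideningMul(Value *V, Value *&A, Value *&B) {
  return match(V, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))));
}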
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
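A sketch of the cost-table idiom CostTableLookup supports; the table contents and default below are made up for illustration.

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

// Look up (ISD opcode, legalized MVT); fall back when no entry matches.
static unsigned lookupMulCost(MVT VT) {
  static const CostTblEntry MulCostTbl[] = {
      {ISD::MUL, MVT::v4i32, 2}, // illustrative costs only
      {ISD::MUL, MVT::v2i64, 4},
  };
  if (const auto *Entry = CostTableLookup(MulCostTbl, ISD::MUL, VT))
    return Entry->Cost;
  return 1;
}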
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2163
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned element.
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
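A sketch of a typical known-bits query, assuming the caller passes an i64 value (the KnownBits width must match the value's bit width).

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// True when the top 32 bits of a 64-bit value are known to be zero, i.e.
// the value already fits a narrower operation.
static bool topHalfKnownZero(const Value *V, const DataLayout &DL) {
  KnownBits Known(64); // assumes V has type i64
  computeKnownBits(V, Known, DL);
  return Known.countMinLeadingZeros() >= 32;
}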
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1, 3, 5, 7, 9, 11, 13, 15>.
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are integer type.
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y are an integer type, one is the current recur...
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2156
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isFixedLengthVector() const
Definition ValueTypes.h:181
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of loop iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate compare and branch overhead, and possibly resulting in increased code size).
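A sketch of how a target hook typically fills a few of the UnrollingPreferences fields listed above; the specific values are illustrative, not the ones this file uses.

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

static void setExampleUnrollPrefs(
    TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                // allow partial unrolling
  UP.Runtime = true;                // allow runtime unrolling
  UP.UpperBound = true;             // may unroll using the trip-count bound
  UP.PartialThreshold = 150;        // size budget for partial unrolling
  UP.DefaultUnrollRuntimeCount = 4; // default runtime unroll factor
}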